diff --git a/auth.py b/auth.py
new file mode 100644
index 0000000..4e9ded9
--- /dev/null
+++ b/auth.py
@@ -0,0 +1,31 @@
+import os
+import json
+
+AUTH_FILE = 'auth.json'
+
+def load_auth_data():
+    if os.path.exists(AUTH_FILE):
+        with open(AUTH_FILE, 'r') as file:
+            return json.load(file)
+    return {}
+
+def save_auth_data(data):
+    with open(AUTH_FILE, 'w') as file:
+        json.dump(data, file, indent=4)
+
+def add_credentials(user_id, username, password):
+    data = load_auth_data()
+    data[str(user_id)] = {'username': username, 'password': password}
+    save_auth_data(data)
+
+def get_credentials(user_id):
+    data = load_auth_data()
+    return data.get(str(user_id), {})
+
+def remove_credentials(user_id):
+    data = load_auth_data()
+    if str(user_id) in data:
+        del data[str(user_id)]
+        save_auth_data(data)
+        return True
+    return False
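For reference, a quick round trip with the helpers above — a minimal sketch; the user id `12345` and the credentials are made-up values, and `auth.json` is created in the working directory:

```python
from auth import add_credentials, get_credentials, remove_credentials

add_credentials(12345, "alice", "s3cret")   # writes auth.json
print(get_credentials(12345))               # {'username': 'alice', 'password': 's3cret'}
print(remove_credentials(12345))            # True; the "12345" key is deleted again
```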
""" START_BTN = InlineKeyboardMarkup( - [[ + [[ InlineKeyboardButton('Source Code', url='https://github.com/samadii/WebDownloaderBot'), - ]] - ) - + ]] +) @Bot.on_message(filters.command(["start"])) async def start(bot, update): @@ -41,33 +47,116 @@ async def start(bot, update): reply_markup=reply_markup ) - - - -@Bot.on_message(filters.private & filters.text & ~filters.regex('/start')) +@Bot.on_message(filters.command(["auth"])) +async def auth(bot, update): + if len(update.command) != 2 or ':' not in update.command[1]: + return await update.reply_text("Please send your username and password in the format 'username:password'") + + username, password = update.command[1].split(":", 1) + add_credentials(update.from_user.id, username, password) + await update.reply_text("Credentials saved successfully.") + +@Bot.on_message(filters.command(["remove_auth"])) +async def remove_auth(bot, update): + success = remove_credentials(update.from_user.id) + if success: + await update.reply_text("Credentials removed successfully.") + else: + await update.reply_text("No credentials found to remove.") + +@Bot.on_message(filters.command(["view_auth"])) +async def view_auth(bot, update): + creds = get_credentials(update.from_user.id) + if creds: + await update.reply_text(f"Your credentials:\nUsername: {creds['username']}\nPassword: {creds['password']}") + else: + await update.reply_text("No credentials found.") + +@Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth|/remove_auth|/view_auth')) async def webdl(_, m): + url = m.text.strip() + + if not url.startswith('http'): + return await m.reply("The URL must start with 'http' or 'https'") + + if not is_valid_url(url): + return await m.reply("The URL is invalid or inaccessible") + + # Show buttons for selecting components to download + keyboard = InlineKeyboardMarkup( + [ + [ + InlineKeyboardButton("HTML", callback_data=f"h|{url[:50]}"), + InlineKeyboardButton("CSS", callback_data=f"c|{url[:50]}"), + InlineKeyboardButton("Images", callback_data=f"i|{url[:50]}") + ], + [ + InlineKeyboardButton("XML", callback_data=f"x|{url[:50]}"), + InlineKeyboardButton("Video", callback_data=f"v|{url[:50]}"), + InlineKeyboardButton("JS", callback_data=f"j|{url[:50]}") + ] + ] + ) + await m.reply("Please select which components to download:", reply_markup=keyboard) - if not m.text.startswith('http'): - return await m.reply("the URL must start with 'http' or 'https'") +@Bot.on_callback_query() +async def callback_query_handler(bot, update: CallbackQuery): + data = update.data + component, url = data.split('|', 1) - msg = await m.reply('Processing..') - url = m.text - name = dir = str(m.chat.id) + imgFlg = component == 'i' + linkFlg = component == 'c' + scriptFlg = component == 'j' + videoFlg = component == 'v' + xmlFlg = component == 'x' + htmlFlg = component == 'h' # Adding HTML flag here + + name = dir = str(update.message.chat.id) if not os.path.isdir(dir): os.makedirs(dir) - obj = urlDownloader(imgFlg=True, linkFlg=True, scriptFlg=True) - res = obj.savePage(url, dir) + auth = get_credentials(update.from_user.id) + obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, videoFlg=videoFlg, xmlFlg=xmlFlg, htmlFlg=htmlFlg, file_size_limit=10*1024*1024, auth=auth) + res, summary = obj.savePage(url, dir) if not res: - return await msg.edit_text('something went wrong!') + return await update.message.reply('Something went wrong!') + zip_filename = f"{name}.zip" shutil.make_archive(name, 'zip', base_dir=dir) - await 
diff --git a/requirements.txt b/requirements.txt
index dc831f5..cf1fab6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ requests
 lxml
 urllib3
 bs4
+tqdm
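tqdm is the new dependency: web_dl.py below streams each file in 1 KiB chunks and wraps the chunk iterator in a progress bar. A standalone sketch of that pattern; the URL and output filename are placeholders:

```python
import requests
from tqdm import tqdm

response = requests.get('https://example.com/', stream=True, timeout=10)
response.raise_for_status()
with open('page.bin', 'wb') as f:
    for chunk in tqdm(response.iter_content(chunk_size=1024)):  # one tick per 1 KiB chunk
        if chunk:
            f.write(chunk)
```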
- """ + """Save the web page components based on the input URL and dir name.""" try: - response = self.session.get(url) + response = self.session.get(url, auth=self.auth) + response.raise_for_status() self.soup = BeautifulSoup(response.text, features="lxml") - if not os.path.exists(pagefolder): os.mkdir(pagefolder) - if self.imgFlg: self._soupfindnSave(url, pagefolder, tag2find='img', inner='src') - if self.linkFlg: self._soupfindnSave(url, pagefolder, tag2find='link', inner='href') - if self.scriptFlg: self._soupfindnSave(url, pagefolder, tag2find='script', inner='src') - with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: - file.write(self.soup.prettify('utf-8')) - return True + if not os.path.exists(pagefolder): + os.mkdir(pagefolder) + if self.imgFlg: + self._soupfindnSave(url, pagefolder, tag2find='img', inner='src', category='images') + if self.linkFlg: + self._soupfindnSave(url, pagefolder, tag2find='link', inner='href', category='links') + if self.scriptFlg: + self._soupfindnSave(url, pagefolder, tag2find='script', inner='src', category='scripts') + if self.videoFlg: + self._soupfindnSave(url, pagefolder, tag2find='video', inner='src', category='videos') + if self.xmlFlg: + self._soupfindnSave(url, pagefolder, tag2find='xml', inner='src', category='xmls') + if self.htmlFlg: + self._soupfindnSave(url, pagefolder, tag2find='html', inner='src', category='htmls') + with open(os.path.join(pagefolder, 'index.html'), 'w', encoding='utf-8') as f: + f.write(self.soup.prettify()) + summary_text = "\n".join([f"{k}: {v}" for k, v in self.summary.items()]) + return True, summary_text except Exception as e: - print("> savePage(): Create files failed: %s." % str(e)) - return False + print(f"Error saving page: {e}") + return False, "" - #----------------------------------------------------------------------------- - def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'): - """ Saves on specified pagefolder all tag2find objects. """ - pagefolder = os.path.join(pagefolder, tag2find) - if not os.path.exists(pagefolder): os.mkdir(pagefolder) - for res in self.soup.findAll(tag2find): # images, css, etc.. - try: - if not res.has_attr(inner): continue # check if inner tag (file object) exists - # clean special chars such as '@, # ? <>' - filename = re.sub('\W+', '.', os.path.basename(res[inner])) - # print("> filename:", filename) - # Added the '.html' for the html file in the href - if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): - filename += '.html' - fileurl = urljoin(url, res.get(inner)) - filepath = os.path.join(pagefolder, filename) - # rename html ref so can move html and folder of files anywhere - res[inner] = os.path.join(os.path.basename(pagefolder), filename) - # create the file. 
-                if not os.path.isfile(filepath):
-                    with open(filepath, 'wb') as file:
-                        filebin = self.session.get(fileurl)
-                        if len(filebin.content) > 0: # filter the empty file(imge not found)
-                            file.write(filebin.content)
-            except Exception as exc:
-                print(exc, file=sys.stderr)
+    def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category='images'):
+        """Find all tag2find elements in the soup and download their files."""
+        tags = self.soup.find_all(tag2find)
+        print(f"Found {len(tags)} {category} tags")  # Debug statement
+        urls = [tag.get(inner) for tag in tags]
+        # Skip tags without the attribute: urljoin(url, None) raises TypeError
+        urls = [urljoin(url, u) for u in urls if u]
+        urls = list(set(urls))
+        self.summary[category] += len(urls)
+        folder = os.path.join(pagefolder, category)
+        if not os.path.exists(folder):
+            os.mkdir(folder)
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            for u in urls:
+                executor.submit(self._savefile, folder, u)
+
+    def _savefile(self, folder, fileurl):
+        """Save the file content from the URL to the given folder."""
+        if not fileurl:
+            return
+        # Replace special characters with dots so the file extension survives
+        name = re.sub(r'\W+', '.', os.path.basename(fileurl))
+        filename = os.path.join(folder, name)
+        print(f"Downloading {fileurl} to {filename}")  # Debug statement
+        try:
+            response = self.session.get(fileurl, stream=True, auth=self.auth)
+            response.raise_for_status()
+            content_length = response.headers.get('Content-Length')
+            if content_length and self.file_size_limit and int(content_length) > self.file_size_limit:
+                print(f"Skipping {fileurl}: size {content_length} exceeds limit {self.file_size_limit}")
+                return
+            with open(filename, 'wb') as f:
+                for chunk in tqdm(response.iter_content(chunk_size=1024)):
+                    if chunk:
+                        f.write(chunk)
+        except Exception as e:
+            print(f"Error downloading {fileurl}: {e}")
+            if os.path.exists(filename):
+                os.remove(filename)
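A quick way to exercise the new urlDownloader interface without the bot — a minimal sketch; the flags mirror the "Images" button in main.py, the URL is a placeholder, and `auth=None` means no HTTP Basic credentials:

```python
from web_dl import urlDownloader

dl = urlDownloader(imgFlg=True, linkFlg=False, scriptFlg=False,
                   videoFlg=False, xmlFlg=False, htmlFlg=False,
                   file_size_limit=10 * 1024 * 1024, auth=None)
ok, summary = dl.savePage('https://example.com', 'page')
print(ok)        # True on success
print(summary)   # per-category counts, e.g. "images: 1\nlinks: 0\n..."
```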