From 86031e12a3543f9e2cabfea162d349fab7387224 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Fri, 24 May 2024 20:30:53 +0530 Subject: [PATCH 01/66] Update main.py --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index f448177..ffa0441 100644 --- a/main.py +++ b/main.py @@ -26,7 +26,7 @@ START_BTN = InlineKeyboardMarkup( [[ - InlineKeyboardButton('Source Code', url='https://github.com/samadii/WebDownloaderBot'), + InlineKeyboardButton('My Father', url='https://t.me/Matiz_Owner'), ]] ) From a944b101dfbe96de2b383c86ffc44da01d91f8ca Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Fri, 24 May 2024 20:37:08 +0530 Subject: [PATCH 02/66] Update web_dl.py --- web_dl.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/web_dl.py b/web_dl.py index 37a9e3e..722bb60 100644 --- a/web_dl.py +++ b/web_dl.py @@ -5,8 +5,6 @@ from bs4 import BeautifulSoup -#----------------------------------------------------------------------------- -#----------------------------------------------------------------------------- class urlDownloader(object): """ Download the webpage components base on the input url.""" def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True): @@ -14,25 +12,28 @@ def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True): self.imgFlg = imgFlg self.linkFlg = linkFlg self.scriptFlg = scriptFlg - self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml','js') + self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') self.session = requests.Session() - #----------------------------------------------------------------------------- def savePage(self, url, pagefolder='page'): """ Save the web page components based on the input url and dir name. Args: url ([try]): web url string. pagefolder (str, optional): path to save the web components. Returns: - [bool]: whether the components saved the successfully. + [bool]: whether the components saved successfully. """ try: response = self.session.get(url) self.soup = BeautifulSoup(response.text, features="lxml") - if not os.path.exists(pagefolder): os.mkdir(pagefolder) - if self.imgFlg: self._soupfindnSave(url, pagefolder, tag2find='img', inner='src') - if self.linkFlg: self._soupfindnSave(url, pagefolder, tag2find='link', inner='href') - if self.scriptFlg: self._soupfindnSave(url, pagefolder, tag2find='script', inner='src') + if not os.path.exists(pagefolder): + os.mkdir(pagefolder) + if self.imgFlg: + self._soupfindnSave(url, pagefolder, tag2find='img', inner='src') + if self.linkFlg: + self._soupfindnSave(url, pagefolder, tag2find='link', inner='href') + if self.scriptFlg: + self._soupfindnSave(url, pagefolder, tag2find='script', inner='src') with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: file.write(self.soup.prettify('utf-8')) return True @@ -40,16 +41,17 @@ def savePage(self, url, pagefolder='page'): print("> savePage(): Create files failed: %s." % str(e)) return False - #----------------------------------------------------------------------------- def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'): """ Saves on specified pagefolder all tag2find objects. """ pagefolder = os.path.join(pagefolder, tag2find) - if not os.path.exists(pagefolder): os.mkdir(pagefolder) - for res in self.soup.findAll(tag2find): # images, css, etc.. 
+ if not os.path.exists(pagefolder): + os.mkdir(pagefolder) + for res in self.soup.findAll(tag2find): # images, css, etc.. try: - if not res.has_attr(inner): continue # check if inner tag (file object) exists + if not res.has_attr(inner): + continue # check if inner tag (file object) exists # clean special chars such as '@, # ? <>' - filename = re.sub('\W+', '.', os.path.basename(res[inner])) + filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) # print("> filename:", filename) # Added the '.html' for the html file in the href if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): @@ -62,8 +64,7 @@ def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'): if not os.path.isfile(filepath): with open(filepath, 'wb') as file: filebin = self.session.get(fileurl) - if len(filebin.content) > 0: # filter the empty file(imge not found) + if len(filebin.content) > 0: # filter the empty file(imge not found) file.write(filebin.content) except Exception as exc: print(exc, file=sys.stderr) - From 7ac4154408b526d4aff12b39058a492d5b2b78fb Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Fri, 24 May 2024 20:44:36 +0530 Subject: [PATCH 03/66] Update web_dl.py --- web_dl.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/web_dl.py b/web_dl.py index 722bb60..939a033 100644 --- a/web_dl.py +++ b/web_dl.py @@ -4,9 +4,8 @@ from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup - class urlDownloader(object): - """ Download the webpage components base on the input url.""" + """ Download the webpage components based on the input URL.""" def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True): self.soup = None self.imgFlg = imgFlg @@ -16,12 +15,12 @@ def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True): self.session = requests.Session() def savePage(self, url, pagefolder='page'): - """ Save the web page components based on the input url and dir name. + """ Save the web page components based on the input URL and dir name. Args: - url ([try]): web url string. + url (str): web URL string. pagefolder (str, optional): path to save the web components. Returns: - [bool]: whether the components saved successfully. + bool: whether the components saved successfully. """ try: response = self.session.get(url) @@ -38,7 +37,7 @@ def savePage(self, url, pagefolder='page'): file.write(self.soup.prettify('utf-8')) return True except Exception as e: - print("> savePage(): Create files failed: %s." 
% str(e)) + print(f"> savePage(): Create files failed: {str(e)}.") return False def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'): @@ -64,7 +63,7 @@ def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'): if not os.path.isfile(filepath): with open(filepath, 'wb') as file: filebin = self.session.get(fileurl) - if len(filebin.content) > 0: # filter the empty file(imge not found) + if len(filebin.content) > 0: # filter the empty file (image not found) file.write(filebin.content) except Exception as exc: print(exc, file=sys.stderr) From 9c0ae8d1fb874ea1ff58fbbe7c478f16d8b09f78 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:01:07 +0530 Subject: [PATCH 04/66] Update web_dl.py --- web_dl.py | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/web_dl.py b/web_dl.py index 939a033..b5dbdfd 100644 --- a/web_dl.py +++ b/web_dl.py @@ -3,6 +3,7 @@ import requests from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup +from tqdm import tqdm class urlDownloader(object): """ Download the webpage components based on the input URL.""" @@ -41,29 +42,29 @@ def savePage(self, url, pagefolder='page'): return False def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'): - """ Saves on specified pagefolder all tag2find objects. """ - pagefolder = os.path.join(pagefolder, tag2find) - if not os.path.exists(pagefolder): - os.mkdir(pagefolder) - for res in self.soup.findAll(tag2find): # images, css, etc.. - try: - if not res.has_attr(inner): - continue # check if inner tag (file object) exists - # clean special chars such as '@, # ? <>' - filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) - # print("> filename:", filename) - # Added the '.html' for the html file in the href - if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): - filename += '.html' - fileurl = urljoin(url, res.get(inner)) - filepath = os.path.join(pagefolder, filename) - # rename html ref so can move html and folder of files anywhere - res[inner] = os.path.join(os.path.basename(pagefolder), filename) - # create the file. - if not os.path.isfile(filepath): - with open(filepath, 'wb') as file: - filebin = self.session.get(fileurl) - if len(filebin.content) > 0: # filter the empty file (image not found) - file.write(filebin.content) - except Exception as exc: - print(exc, file=sys.stderr) + """ Saves on specified pagefolder all tag2find objects. """ + pagefolder = os.path.join(pagefolder, tag2find) + if not os.path.exists(pagefolder): + os.mkdir(pagefolder) + elements = self.soup.findAll(tag2find) + for res in tqdm(elements, desc=f"Downloading {tag2find}"): + try: + if not res.has_attr(inner): + continue # check if inner tag (file object) exists + # clean special chars such as '@, # ? <>' + filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) + # Added the '.html' for the html file in the href + if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): + filename += '.html' + fileurl = urljoin(url, res.get(inner)) + filepath = os.path.join(pagefolder, filename) + # rename html ref so can move html and folder of files anywhere + res[inner] = os.path.join(os.path.basename(pagefolder), filename) + # create the file. 
+ if not os.path.isfile(filepath): + with open(filepath, 'wb') as file: + filebin = self.session.get(fileurl) + if len(filebin.content) > 0: # filter the empty file (image not found) + file.write(filebin.content) + except Exception as exc: + print(exc, file=sys.stderr) From 3775d7051293cda9e1f1f784ec85ddb95f48986c Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:01:27 +0530 Subject: [PATCH 05/66] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index dc831f5..cf1fab6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ requests lxml urllib3 bs4 +tqdm From 498e3bb0eceac9a9d0ac781a4ba21876d3015971 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:04:31 +0530 Subject: [PATCH 06/66] Update web_dl.py --- web_dl.py | 55 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/web_dl.py b/web_dl.py index b5dbdfd..74f2418 100644 --- a/web_dl.py +++ b/web_dl.py @@ -3,7 +3,8 @@ import requests from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup -from tqdm import tqdm +from tqdm import tqdm # For progress indicator + class urlDownloader(object): """ Download the webpage components based on the input URL.""" @@ -42,29 +43,29 @@ def savePage(self, url, pagefolder='page'): return False def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'): - """ Saves on specified pagefolder all tag2find objects. """ - pagefolder = os.path.join(pagefolder, tag2find) - if not os.path.exists(pagefolder): - os.mkdir(pagefolder) - elements = self.soup.findAll(tag2find) - for res in tqdm(elements, desc=f"Downloading {tag2find}"): - try: - if not res.has_attr(inner): - continue # check if inner tag (file object) exists - # clean special chars such as '@, # ? <>' - filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) - # Added the '.html' for the html file in the href - if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): - filename += '.html' - fileurl = urljoin(url, res.get(inner)) - filepath = os.path.join(pagefolder, filename) - # rename html ref so can move html and folder of files anywhere - res[inner] = os.path.join(os.path.basename(pagefolder), filename) - # create the file. - if not os.path.isfile(filepath): - with open(filepath, 'wb') as file: - filebin = self.session.get(fileurl) - if len(filebin.content) > 0: # filter the empty file (image not found) - file.write(filebin.content) - except Exception as exc: - print(exc, file=sys.stderr) + """ Saves on specified pagefolder all tag2find objects. """ + pagefolder = os.path.join(pagefolder, tag2find) + if not os.path.exists(pagefolder): + os.mkdir(pagefolder) + elements = self.soup.findAll(tag2find) + for res in tqdm(elements, desc=f"Downloading {tag2find}"): + try: + if not res.has_attr(inner): + continue # check if inner tag (file object) exists + # clean special chars such as '@, # ? 
<>' + filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) + # Added the '.html' for the html file in the href + if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): + filename += '.html' + fileurl = urljoin(url, res.get(inner)) + filepath = os.path.join(pagefolder, filename) + # rename html ref so can move html and folder of files anywhere + res[inner] = os.path.join(os.path.basename(pagefolder), filename) + # create the file. + if not os.path.isfile(filepath): + with open(filepath, 'wb') as file: + filebin = self.session.get(fileurl) + if len(filebin.content) > 0: # filter the empty file (image not found) + file.write(filebin.content) + except Exception as exc: + print(exc, file=sys.stderr) From 654276b5caed52adca3669a39ea3f9823bb70817 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:15:39 +0530 Subject: [PATCH 07/66] Update requirements.txt From 5cee0c3d503940fbdce9c54c305b5ca982145384 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:16:32 +0530 Subject: [PATCH 08/66] Update web_dl.py --- web_dl.py | 84 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/web_dl.py b/web_dl.py index 74f2418..b73bebb 100644 --- a/web_dl.py +++ b/web_dl.py @@ -1,71 +1,87 @@ -import os, sys +import os import re +import sys import requests from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup -from tqdm import tqdm # For progress indicator - +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor class urlDownloader(object): - """ Download the webpage components based on the input URL.""" - def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True): + """Download the webpage components based on the input URL.""" + def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=None, max_retries=3): self.soup = None self.imgFlg = imgFlg self.linkFlg = linkFlg self.scriptFlg = scriptFlg + self.file_size_limit = file_size_limit + self.max_retries = max_retries self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') self.session = requests.Session() - + self.summary = { + 'images': 0, + 'links': 0, + 'scripts': 0 + } + def savePage(self, url, pagefolder='page'): - """ Save the web page components based on the input URL and dir name. - Args: - url (str): web URL string. - pagefolder (str, optional): path to save the web components. - Returns: - bool: whether the components saved successfully. 
- """ + """Save the web page components based on the input URL and dir name.""" try: response = self.session.get(url) self.soup = BeautifulSoup(response.text, features="lxml") if not os.path.exists(pagefolder): os.mkdir(pagefolder) if self.imgFlg: - self._soupfindnSave(url, pagefolder, tag2find='img', inner='src') + self._soupfindnSave(url, pagefolder, tag2find='img', inner='src', category='images') if self.linkFlg: - self._soupfindnSave(url, pagefolder, tag2find='link', inner='href') + self._soupfindnSave(url, pagefolder, tag2find='link', inner='href', category='links') if self.scriptFlg: - self._soupfindnSave(url, pagefolder, tag2find='script', inner='src') + self._soupfindnSave(url, pagefolder, tag2find='script', inner='src', category='scripts') with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: file.write(self.soup.prettify('utf-8')) - return True + summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts." + return True, summary except Exception as e: - print(f"> savePage(): Create files failed: {str(e)}.") - return False + print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) + return False, None - def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'): - """ Saves on specified pagefolder all tag2find objects. """ + def _download_file(self, fileurl, filepath): + """Download a file with retry mechanism.""" + for attempt in range(self.max_retries): + try: + filebin = self.session.get(fileurl, stream=True) + filebin.raise_for_status() + if self.file_size_limit and int(filebin.headers.get('content-length', 0)) > self.file_size_limit: + print(f"File {fileurl} exceeds the size limit.", file=sys.stderr) + return False + with open(filepath, 'wb') as file: + for chunk in filebin.iter_content(chunk_size=8192): + if chunk: + file.write(chunk) + return True + except requests.RequestException as exc: + print(f"Attempt {attempt + 1} failed for {fileurl}: {exc}", file=sys.stderr) + return False + + def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category='images'): + """Saves on specified pagefolder all tag2find objects.""" pagefolder = os.path.join(pagefolder, tag2find) if not os.path.exists(pagefolder): os.mkdir(pagefolder) elements = self.soup.findAll(tag2find) - for res in tqdm(elements, desc=f"Downloading {tag2find}"): - try: + with ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for res in tqdm(elements, desc=f"Downloading {tag2find}"): if not res.has_attr(inner): - continue # check if inner tag (file object) exists - # clean special chars such as '@, # ? <>' + continue filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) - # Added the '.html' for the html file in the href if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): filename += '.html' fileurl = urljoin(url, res.get(inner)) filepath = os.path.join(pagefolder, filename) - # rename html ref so can move html and folder of files anywhere res[inner] = os.path.join(os.path.basename(pagefolder), filename) - # create the file. 
if not os.path.isfile(filepath): - with open(filepath, 'wb') as file: - filebin = self.session.get(fileurl) - if len(filebin.content) > 0: # filter the empty file (image not found) - file.write(filebin.content) - except Exception as exc: - print(exc, file=sys.stderr) + futures.append(executor.submit(self._download_file, fileurl, filepath)) + for future in futures: + if future.result(): + self.summary[category] += 1 From fa0e3d9520a0237efe9ab54c5e4843d4f18b503a Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:17:39 +0530 Subject: [PATCH 09/66] Update main.py --- main.py | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/main.py b/main.py index ffa0441..e0e13ff 100644 --- a/main.py +++ b/main.py @@ -10,9 +10,9 @@ Bot = Client( "WebDL-Bot", - bot_token = BOT_TOKEN, - api_id = API_ID, - api_hash = API_HASH + bot_token=BOT_TOKEN, + api_id=API_ID, + api_hash=API_HASH ) START_TXT = """ @@ -25,11 +25,10 @@ """ START_BTN = InlineKeyboardMarkup( - [[ - InlineKeyboardButton('My Father', url='https://t.me/Matiz_Owner'), - ]] - ) - + [[ + InlineKeyboardButton('Source Code', url='https://github.com/samadii/WebDownloaderBot'), + ]] +) @Bot.on_message(filters.command(["start"])) async def start(bot, update): @@ -41,33 +40,27 @@ async def start(bot, update): reply_markup=reply_markup ) - - - @Bot.on_message(filters.private & filters.text & ~filters.regex('/start')) async def webdl(_, m): - if not m.text.startswith('http'): - return await m.reply("the URL must start with 'http' or 'https'") + return await m.reply("The URL must start with 'http' or 'https'") - msg = await m.reply('Processing..') + msg = await m.reply('Processing...') url = m.text name = dir = str(m.chat.id) if not os.path.isdir(dir): os.makedirs(dir) - obj = urlDownloader(imgFlg=True, linkFlg=True, scriptFlg=True) - res = obj.savePage(url, dir) + obj = urlDownloader(imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=10*1024*1024) + res, summary = obj.savePage(url, dir) if not res: - return await msg.edit_text('something went wrong!') + return await msg.edit_text('Something went wrong!') shutil.make_archive(name, 'zip', base_dir=dir) - await m.reply_document(name+'.zip') + await m.reply_document(name+'.zip', caption=summary) await msg.delete() shutil.rmtree(dir) os.remove(name+'.zip') - - Bot.run() From 1f4f57b82187a82c3032755a4b376d4ceaa15a04 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:29:12 +0530 Subject: [PATCH 10/66] Update main.py --- main.py | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/main.py b/main.py index e0e13ff..7d227fd 100644 --- a/main.py +++ b/main.py @@ -1,8 +1,10 @@ import os import shutil -from web_dl import urlDownloader +import requests from pyrogram import Client, filters from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton +from web_dl import urlDownloader +import asyncio BOT_TOKEN = os.environ.get("BOT_TOKEN") API_ID = os.environ.get("API_ID") @@ -20,8 +22,8 @@ I can download all the components (.html, .css, img, xml, video, javascript..) from URLs. -Send any URL, -for ex: 'https://www.google.com' +Send any URL, optionally with the components you want to download. 
For example: +'https://www.google.com img,css,script' """ START_BTN = InlineKeyboardMarkup( @@ -40,18 +42,45 @@ async def start(bot, update): reply_markup=reply_markup ) +def parse_components(text): + components = text.split()[1:] + imgFlg = 'img' in components + linkFlg = 'css' in components + scriptFlg = 'script' in components + return imgFlg, linkFlg, scriptFlg + +def is_valid_url(url): + try: + response = requests.head(url, timeout=5) + return response.status_code == 200 + except requests.RequestException: + return False + +async def send_progress(msg, chat_id, initial_text): + for i in range(10): + await asyncio.sleep(1) + await Bot.edit_message_text(chat_id=chat_id, message_id=msg.message_id, text=f"{initial_text}\nProgress: {i*10}%") + @Bot.on_message(filters.private & filters.text & ~filters.regex('/start')) async def webdl(_, m): - if not m.text.startswith('http'): + parts = m.text.split() + url = parts[0] + + if not url.startswith('http'): return await m.reply("The URL must start with 'http' or 'https'") + if not is_valid_url(url): + return await m.reply("The URL is invalid or inaccessible") + msg = await m.reply('Processing...') - url = m.text + asyncio.create_task(send_progress(msg, m.chat.id, "Processing...")) + + imgFlg, linkFlg, scriptFlg = parse_components(m.text) name = dir = str(m.chat.id) if not os.path.isdir(dir): os.makedirs(dir) - obj = urlDownloader(imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=10*1024*1024) + obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, file_size_limit=10*1024*1024) res, summary = obj.savePage(url, dir) if not res: return await msg.edit_text('Something went wrong!') From c7d76a61d4713713550c45a48b00a6633053003f Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:31:49 +0530 Subject: [PATCH 11/66] Update web_dl.py --- web_dl.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/web_dl.py b/web_dl.py index b73bebb..77c486b 100644 --- a/web_dl.py +++ b/web_dl.py @@ -2,20 +2,21 @@ import re import sys import requests -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin from bs4 import BeautifulSoup from tqdm import tqdm from concurrent.futures import ThreadPoolExecutor class urlDownloader(object): """Download the webpage components based on the input URL.""" - def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=None, max_retries=3): + def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=None, max_retries=3, auth=None): self.soup = None self.imgFlg = imgFlg self.linkFlg = linkFlg self.scriptFlg = scriptFlg self.file_size_limit = file_size_limit self.max_retries = max_retries + self.auth = auth self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') self.session = requests.Session() self.summary = { @@ -27,7 +28,7 @@ def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=No def savePage(self, url, pagefolder='page'): """Save the web page components based on the input URL and dir name.""" try: - response = self.session.get(url) + response = self.session.get(url, auth=self.auth) self.soup = BeautifulSoup(response.text, features="lxml") if not os.path.exists(pagefolder): os.mkdir(pagefolder) @@ -49,7 +50,7 @@ def _download_file(self, fileurl, filepath): """Download a file with retry mechanism.""" for attempt in range(self.max_retries): try: - filebin = self.session.get(fileurl, stream=True) + 
filebin = self.session.get(fileurl, stream=True, auth=self.auth) filebin.raise_for_status() if self.file_size_limit and int(filebin.headers.get('content-length', 0)) > self.file_size_limit: print(f"File {fileurl} exceeds the size limit.", file=sys.stderr) From 59ebbfcfafd22196983b7236adb711aa12721da6 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:33:21 +0530 Subject: [PATCH 12/66] Update main.py --- main.py | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/main.py b/main.py index 7d227fd..1dfaa57 100644 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ from pyrogram import Client, filters from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton from web_dl import urlDownloader +from auth import add_credentials, get_credentials, remove_credentials import asyncio BOT_TOKEN = os.environ.get("BOT_TOKEN") @@ -24,6 +25,8 @@ Send any URL, optionally with the components you want to download. For example: 'https://www.google.com img,css,script' + +Use /auth to add your authentication credentials. """ START_BTN = InlineKeyboardMarkup( @@ -42,26 +45,11 @@ async def start(bot, update): reply_markup=reply_markup ) -def parse_components(text): - components = text.split()[1:] - imgFlg = 'img' in components - linkFlg = 'css' in components - scriptFlg = 'script' in components - return imgFlg, linkFlg, scriptFlg - -def is_valid_url(url): - try: - response = requests.head(url, timeout=5) - return response.status_code == 200 - except requests.RequestException: - return False - -async def send_progress(msg, chat_id, initial_text): - for i in range(10): - await asyncio.sleep(1) - await Bot.edit_message_text(chat_id=chat_id, message_id=msg.message_id, text=f"{initial_text}\nProgress: {i*10}%") - -@Bot.on_message(filters.private & filters.text & ~filters.regex('/start')) +@Bot.on_message(filters.command(["auth"])) +async def auth(bot, update): + await update.reply_text("Please send your username and password in the format 'username:password'") + +@Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth')) async def webdl(_, m): parts = m.text.split() url = parts[0] @@ -72,6 +60,16 @@ async def webdl(_, m): if not is_valid_url(url): return await m.reply("The URL is invalid or inaccessible") + # Check if the user is sending authentication details + if ":" in m.text and m.text.count(":") == 1: + username, password = m.text.split(":") + add_credentials(m.chat.id, username, password) + return await m.reply("Credentials saved successfully.") + + # Check if user has credentials saved + credentials = get_credentials(m.chat.id) + auth = (credentials['username'], credentials['password']) if credentials else None + msg = await m.reply('Processing...') asyncio.create_task(send_progress(msg, m.chat.id, "Processing...")) @@ -80,7 +78,7 @@ async def webdl(_, m): if not os.path.isdir(dir): os.makedirs(dir) - obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, file_size_limit=10*1024*1024) + obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, file_size_limit=10*1024*1024, auth=auth) res, summary = obj.savePage(url, dir) if not res: return await msg.edit_text('Something went wrong!') From d216f9bf907b5bd2bc04519152cde1bed72d9e66 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:34:21 +0530 Subject: [PATCH 13/66] Create auth.py --- auth.py | 31 
+++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 auth.py diff --git a/auth.py b/auth.py new file mode 100644 index 0000000..a4ea522 --- /dev/null +++ b/auth.py @@ -0,0 +1,31 @@ +# auth.py + +import os +import json + +AUTH_FILE = 'auth.json' + +def load_auth_data(): + if os.path.exists(AUTH_FILE): + with open(AUTH_FILE, 'r') as file: + return json.load(file) + return {} + +def save_auth_data(data): + with open(AUTH_FILE, 'w') as file: + json.dump(data, file, indent=4) + +def add_credentials(user_id, username, password): + data = load_auth_data() + data[user_id] = {'username': username, 'password': password} + save_auth_data(data) + +def get_credentials(user_id): + data = load_auth_data() + return data.get(str(user_id)) + +def remove_credentials(user_id): + data = load_auth_data() + if str(user_id) in data: + del data[str(user_id)] + save_auth_data(data) From 025cf5dfacdecd39894c3b0a1a328835e93d7ec8 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:38:22 +0530 Subject: [PATCH 14/66] Update main.py --- main.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/main.py b/main.py index 1dfaa57..3355b71 100644 --- a/main.py +++ b/main.py @@ -90,4 +90,23 @@ async def webdl(_, m): shutil.rmtree(dir) os.remove(name+'.zip') +def is_valid_url(url): + try: + response = requests.head(url, timeout=5) + return response.status_code == 200 + except requests.RequestException: + return False + +def parse_components(text): + components = text.split()[1:] + imgFlg = 'img' in components + linkFlg = 'css' in components + scriptFlg = 'script' in components + return imgFlg, linkFlg, scriptFlg + +async def send_progress(msg, chat_id, initial_text): + for i in range(10): + await asyncio.sleep(1) + await Bot.edit_message_text(chat_id=chat_id, message_id=msg.message_id, text=f"{initial_text}\nProgress: {i*10}%") + Bot.run() From bbefcaf5ba05ca243dadf16eaa565522ee648835 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 06:43:13 +0530 Subject: [PATCH 15/66] Update main.py --- main.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 3355b71..20bb2a5 100644 --- a/main.py +++ b/main.py @@ -4,7 +4,7 @@ from pyrogram import Client, filters from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton from web_dl import urlDownloader -from auth import add_credentials, get_credentials, remove_credentials +from auth import add_credentials, get_credentials import asyncio BOT_TOKEN = os.environ.get("BOT_TOKEN") @@ -26,7 +26,7 @@ Send any URL, optionally with the components you want to download. For example: 'https://www.google.com img,css,script' -Use /auth to add your authentication credentials. +Use /auth username:password to add your authentication credentials. 
""" START_BTN = InlineKeyboardMarkup( @@ -47,7 +47,12 @@ async def start(bot, update): @Bot.on_message(filters.command(["auth"])) async def auth(bot, update): - await update.reply_text("Please send your username and password in the format 'username:password'") + if len(update.command) != 2 or ':' not in update.command[1]: + return await update.reply_text("Please send your username and password in the format 'username:password'") + + username, password = update.command[1].split(":", 1) + add_credentials(update.from_user.id, username, password) + await update.reply_text("Credentials saved successfully.") @Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth')) async def webdl(_, m): @@ -60,12 +65,6 @@ async def webdl(_, m): if not is_valid_url(url): return await m.reply("The URL is invalid or inaccessible") - # Check if the user is sending authentication details - if ":" in m.text and m.text.count(":") == 1: - username, password = m.text.split(":") - add_credentials(m.chat.id, username, password) - return await m.reply("Credentials saved successfully.") - # Check if user has credentials saved credentials = get_credentials(m.chat.id) auth = (credentials['username'], credentials['password']) if credentials else None From 89e6de7018ecb973a3a6560c55fd0837a6bf26a4 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 07:14:07 +0530 Subject: [PATCH 16/66] Update main.py From 8dff9179084b0ecc086c8dc8714da1d430bd432c Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 07:16:39 +0530 Subject: [PATCH 17/66] Update main.py --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 20bb2a5..5cbfa00 100644 --- a/main.py +++ b/main.py @@ -106,6 +106,6 @@ def parse_components(text): async def send_progress(msg, chat_id, initial_text): for i in range(10): await asyncio.sleep(1) - await Bot.edit_message_text(chat_id=chat_id, message_id=msg.message_id, text=f"{initial_text}\nProgress: {i*10}%") + await Bot.edit_message_text(chat_id=chat_id, message_id=msg.id, text=f"{initial_text}\nProgress: {i*10}%") Bot.run() From 8844de0ef708eed8a3021a7803d043e0ef7c51db Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 07:19:03 +0530 Subject: [PATCH 18/66] Update main.py --- main.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 5cbfa00..16e7d00 100644 --- a/main.py +++ b/main.py @@ -104,8 +104,11 @@ def parse_components(text): return imgFlg, linkFlg, scriptFlg async def send_progress(msg, chat_id, initial_text): - for i in range(10): - await asyncio.sleep(1) - await Bot.edit_message_text(chat_id=chat_id, message_id=msg.id, text=f"{initial_text}\nProgress: {i*10}%") + try: + for i in range(10): + await asyncio.sleep(1) + await Bot.edit_message_text(chat_id=chat_id, message_id=msg.id, text=f"{initial_text}\nProgress: {i*10}%") + except Exception as e: + print(f"Error updating progress: {e}") Bot.run() From 5c671bf6a5eda8443e8da4eb03a4b342a51ec3cb Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 08:28:54 +0530 Subject: [PATCH 19/66] Update main.py --- main.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 16e7d00..31bd7ef 100644 --- a/main.py +++ b/main.py @@ -70,7 +70,7 @@ 
async def webdl(_, m): auth = (credentials['username'], credentials['password']) if credentials else None msg = await m.reply('Processing...') - asyncio.create_task(send_progress(msg, m.chat.id, "Processing...")) + await send_progress(msg, m.chat.id, "Processing...") imgFlg, linkFlg, scriptFlg = parse_components(m.text) name = dir = str(m.chat.id) @@ -107,8 +107,12 @@ async def send_progress(msg, chat_id, initial_text): try: for i in range(10): await asyncio.sleep(1) - await Bot.edit_message_text(chat_id=chat_id, message_id=msg.id, text=f"{initial_text}\nProgress: {i*10}%") + try: + await Bot.edit_message_text(chat_id=chat_id, message_id=msg.id, text=f"{initial_text}\nProgress: {i*10}%") + except Exception as e: + print(f"Error updating progress: {e}", file=sys.stderr) + break except Exception as e: - print(f"Error updating progress: {e}") + print(f"Error in send_progress loop: {e}", file=sys.stderr) Bot.run() From 1e5201558c8183e18993535161b588709d26e2d0 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 08:34:10 +0530 Subject: [PATCH 20/66] Update main.py --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 31bd7ef..998526e 100644 --- a/main.py +++ b/main.py @@ -70,7 +70,7 @@ async def webdl(_, m): auth = (credentials['username'], credentials['password']) if credentials else None msg = await m.reply('Processing...') - await send_progress(msg, m.chat.id, "Processing...") + asyncio.create_task(send_progress(msg, m.chat.id, "Processing...")) imgFlg, linkFlg, scriptFlg = parse_components(m.text) name = dir = str(m.chat.id) From e7fbd278878da586669c3a92d19ba869e6a4901f Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 08:34:35 +0530 Subject: [PATCH 21/66] Update web_dl.py --- web_dl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/web_dl.py b/web_dl.py index 77c486b..e4c4da1 100644 --- a/web_dl.py +++ b/web_dl.py @@ -29,6 +29,7 @@ def savePage(self, url, pagefolder='page'): """Save the web page components based on the input URL and dir name.""" try: response = self.session.get(url, auth=self.auth) + response.raise_for_status() self.soup = BeautifulSoup(response.text, features="lxml") if not os.path.exists(pagefolder): os.mkdir(pagefolder) @@ -82,6 +83,7 @@ def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category= filepath = os.path.join(pagefolder, filename) res[inner] = os.path.join(os.path.basename(pagefolder), filename) if not os.path.isfile(filepath): + print(f"Downloading {fileurl} to {filepath}") # Debug statement futures.append(executor.submit(self._download_file, fileurl, filepath)) for future in futures: if future.result(): From b6d509b923965169430323df79e647ac331ad7c0 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 08:37:15 +0530 Subject: [PATCH 22/66] Update main.py --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index 998526e..e91cf91 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ import os +import sys # Add this import statement import shutil import requests from pyrogram import Client, filters From 7afa77ffbc59c38e2d8ceb0bb362b30f143b8d64 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 08:38:20 +0530 Subject: [PATCH 23/66] Update web_dl.py --- web_dl.py | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/web_dl.py b/web_dl.py index e4c4da1..2212fc1 100644 --- a/web_dl.py +++ b/web_dl.py @@ -1,6 +1,6 @@ import os import re -import sys +import sys # Make sure to import sys here as well if needed import requests from urllib.parse import urljoin from bs4 import BeautifulSoup @@ -60,6 +60,7 @@ def _download_file(self, fileurl, filepath): for chunk in filebin.iter_content(chunk_size=8192): if chunk: file.write(chunk) + print(f"Successfully downloaded {fileurl} to {filepath}") # Debug statement return True except requests.RequestException as exc: print(f"Attempt {attempt + 1} failed for {fileurl}: {exc}", file=sys.stderr) From ca9b36398437c17908a3d0adb1643aa1707a822f Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 08:41:13 +0530 Subject: [PATCH 24/66] Update main.py --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index e91cf91..d1280b7 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,5 @@ import os -import sys # Add this import statement +import sys import shutil import requests from pyrogram import Client, filters From 7ce16ebc3610304b5b61cb05857deb3570ff9690 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 08:42:14 +0530 Subject: [PATCH 25/66] Update web_dl.py --- web_dl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_dl.py b/web_dl.py index 2212fc1..08f872e 100644 --- a/web_dl.py +++ b/web_dl.py @@ -1,6 +1,6 @@ import os import re -import sys # Make sure to import sys here as well if needed +import sys import requests from urllib.parse import urljoin from bs4 import BeautifulSoup From df858ba91f52312c3c06a33e7dbb72c243da0720 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 08:58:46 +0530 Subject: [PATCH 26/66] Update main.py --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index d1280b7..ed6e493 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,5 @@ import os -import sys +import sys # Ensure sys is imported import shutil import requests from pyrogram import Client, filters From df90085f43beecae48d774398736919f9fe68ce9 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 09:02:55 +0530 Subject: [PATCH 27/66] Update web_dl.py --- web_dl.py | 68 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/web_dl.py b/web_dl.py index 08f872e..263a815 100644 --- a/web_dl.py +++ b/web_dl.py @@ -2,15 +2,17 @@ import re import sys import requests -from urllib.parse import urljoin +from urllib.parse import urljoin, urlparse from bs4 import BeautifulSoup from tqdm import tqdm from concurrent.futures import ThreadPoolExecutor +from typing import Tuple, Optional, Dict -class urlDownloader(object): +class urlDownloader: """Download the webpage components based on the input URL.""" - def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=None, max_retries=3, auth=None): - self.soup = None + + def __init__(self, imgFlg: bool = True, linkFlg: bool = True, scriptFlg: bool = True, file_size_limit: Optional[int] = None, max_retries: int = 3, auth: Optional[Tuple[str, str]] = None): + self.soup: Optional[BeautifulSoup] = None self.imgFlg = imgFlg self.linkFlg = linkFlg self.scriptFlg = scriptFlg @@ 
-19,43 +21,48 @@ def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=No self.auth = auth self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') self.session = requests.Session() - self.summary = { + self.summary: Dict[str, int] = { 'images': 0, 'links': 0, 'scripts': 0 } - def savePage(self, url, pagefolder='page'): + def savePage(self, url: str, pagefolder: str = 'page') -> Tuple[bool, Optional[str]]: """Save the web page components based on the input URL and dir name.""" try: response = self.session.get(url, auth=self.auth) response.raise_for_status() - self.soup = BeautifulSoup(response.text, features="lxml") - if not os.path.exists(pagefolder): - os.mkdir(pagefolder) + self.soup = BeautifulSoup(response.text, 'lxml') + os.makedirs(pagefolder, exist_ok=True) + if self.imgFlg: - self._soupfindnSave(url, pagefolder, tag2find='img', inner='src', category='images') + self._soupfindnSave(url, pagefolder, 'img', 'src', 'images') if self.linkFlg: - self._soupfindnSave(url, pagefolder, tag2find='link', inner='href', category='links') + self._soupfindnSave(url, pagefolder, 'link', 'href', 'links') if self.scriptFlg: - self._soupfindnSave(url, pagefolder, tag2find='script', inner='src', category='scripts') + self._soupfindnSave(url, pagefolder, 'script', 'src', 'scripts') + with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: file.write(self.soup.prettify('utf-8')) + summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts." return True, summary except Exception as e: print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) return False, None - def _download_file(self, fileurl, filepath): + def _download_file(self, fileurl: str, filepath: str) -> bool: """Download a file with retry mechanism.""" for attempt in range(self.max_retries): try: filebin = self.session.get(fileurl, stream=True, auth=self.auth) filebin.raise_for_status() - if self.file_size_limit and int(filebin.headers.get('content-length', 0)) > self.file_size_limit: - print(f"File {fileurl} exceeds the size limit.", file=sys.stderr) + + file_size = int(filebin.headers.get('content-length', 0)) + if self.file_size_limit and file_size > self.file_size_limit: + print(f"File {fileurl} exceeds the size limit of {self.file_size_limit} bytes.", file=sys.stderr) return False + with open(filepath, 'wb') as file: for chunk in filebin.iter_content(chunk_size=8192): if chunk: @@ -66,26 +73,37 @@ def _download_file(self, fileurl, filepath): print(f"Attempt {attempt + 1} failed for {fileurl}: {exc}", file=sys.stderr) return False - def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category='images'): - """Saves on specified pagefolder all tag2find objects.""" - pagefolder = os.path.join(pagefolder, tag2find) - if not os.path.exists(pagefolder): - os.mkdir(pagefolder) - elements = self.soup.findAll(tag2find) + def _soupfindnSave(self, url: str, pagefolder: str, tag2find: str, inner: str, category: str) -> None: + """Saves specified tag objects in the given folder.""" + folder_path = os.path.join(pagefolder, tag2find) + os.makedirs(folder_path, exist_ok=True) + + elements = self.soup.find_all(tag2find) with ThreadPoolExecutor(max_workers=4) as executor: futures = [] for res in tqdm(elements, desc=f"Downloading {tag2find}"): if not res.has_attr(inner): continue - filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) - if tag2find == 'link' and (not any(ext in filename for ext 
in self.linkType)): + + filename = self._sanitize_filename(res.get(inner)) + if tag2find == 'link' and not any(ext in filename for ext in self.linkType): filename += '.html' + fileurl = urljoin(url, res.get(inner)) - filepath = os.path.join(pagefolder, filename) - res[inner] = os.path.join(os.path.basename(pagefolder), filename) + filepath = os.path.join(folder_path, filename) + + res[inner] = os.path.join(os.path.basename(folder_path), filename) + if not os.path.isfile(filepath): print(f"Downloading {fileurl} to {filepath}") # Debug statement futures.append(executor.submit(self._download_file, fileurl, filepath)) + for future in futures: if future.result(): self.summary[category] += 1 + + def _sanitize_filename(self, url: str) -> str: + """Sanitize the filename extracted from the URL.""" + parsed_url = urlparse(url) + filename = os.path.basename(parsed_url.path) + return re.sub(r'\W+', '.', filename) From b9bb481520d3d27d4d3dbd74b6743efeb89440b3 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 09:03:18 +0530 Subject: [PATCH 28/66] Update main.py From d8dc8a48de2a108f44af2eec5d194b374e89b47e Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 09:31:53 +0530 Subject: [PATCH 29/66] Update main.py --- main.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index ed6e493..8240d94 100644 --- a/main.py +++ b/main.py @@ -71,7 +71,7 @@ async def webdl(_, m): auth = (credentials['username'], credentials['password']) if credentials else None msg = await m.reply('Processing...') - asyncio.create_task(send_progress(msg, m.chat.id, "Processing...")) + progress_task = asyncio.create_task(send_progress(msg, m.chat.id, "Processing...")) imgFlg, linkFlg, scriptFlg = parse_components(m.text) name = dir = str(m.chat.id) @@ -81,7 +81,9 @@ async def webdl(_, m): obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, file_size_limit=10*1024*1024, auth=auth) res, summary = obj.savePage(url, dir) if not res: - return await msg.edit_text('Something went wrong!') + await msg.edit_text('Something went wrong!') + progress_task.cancel() + return shutil.make_archive(name, 'zip', base_dir=dir) await m.reply_document(name+'.zip', caption=summary) @@ -89,6 +91,7 @@ async def webdl(_, m): shutil.rmtree(dir) os.remove(name+'.zip') + progress_task.cancel() def is_valid_url(url): try: @@ -111,8 +114,11 @@ async def send_progress(msg, chat_id, initial_text): try: await Bot.edit_message_text(chat_id=chat_id, message_id=msg.id, text=f"{initial_text}\nProgress: {i*10}%") except Exception as e: + if "MESSAGE_ID_INVALID" in str(e): + print(f"Message ID invalid: {e}", file=sys.stderr) + break print(f"Error updating progress: {e}", file=sys.stderr) - break + continue except Exception as e: print(f"Error in send_progress loop: {e}", file=sys.stderr) From 8bf8a4018ea4262acc8a9cb05e14facaf7ea66e0 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 09:37:52 +0530 Subject: [PATCH 30/66] Update main.py --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index 8240d94..cd8ea55 100644 --- a/main.py +++ b/main.py @@ -74,6 +74,7 @@ async def webdl(_, m): progress_task = asyncio.create_task(send_progress(msg, m.chat.id, "Processing...")) imgFlg, linkFlg, scriptFlg = parse_components(m.text) + print(f"Flags - img: {imgFlg}, link: {linkFlg}, 
script: {scriptFlg}") # Debug statement name = dir = str(m.chat.id) if not os.path.isdir(dir): os.makedirs(dir) From 8eeaa057c1c1b223f73d9a79f9951dbd22ee03e7 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 09:38:45 +0530 Subject: [PATCH 31/66] Update web_dl.py --- web_dl.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/web_dl.py b/web_dl.py index 263a815..67423a5 100644 --- a/web_dl.py +++ b/web_dl.py @@ -35,17 +35,23 @@ def savePage(self, url: str, pagefolder: str = 'page') -> Tuple[bool, Optional[s self.soup = BeautifulSoup(response.text, 'lxml') os.makedirs(pagefolder, exist_ok=True) + print(f"Starting to download components from {url}") + if self.imgFlg: + print("Downloading images...") self._soupfindnSave(url, pagefolder, 'img', 'src', 'images') if self.linkFlg: + print("Downloading links...") self._soupfindnSave(url, pagefolder, 'link', 'href', 'links') if self.scriptFlg: + print("Downloading scripts...") self._soupfindnSave(url, pagefolder, 'script', 'src', 'scripts') with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: file.write(self.soup.prettify('utf-8')) summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts." + print(summary) return True, summary except Exception as e: print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) From 2b8ec6721680fc4d2ec55f170d13f1f6a16d136c Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 10:03:28 +0530 Subject: [PATCH 32/66] Update main.py --- main.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/main.py b/main.py index cd8ea55..2fa2eb7 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,5 @@ import os -import sys # Ensure sys is imported +import sys import shutil import requests from pyrogram import Client, filters @@ -94,13 +94,6 @@ async def webdl(_, m): os.remove(name+'.zip') progress_task.cancel() -def is_valid_url(url): - try: - response = requests.head(url, timeout=5) - return response.status_code == 200 - except requests.RequestException: - return False - def parse_components(text): components = text.split()[1:] imgFlg = 'img' in components @@ -108,6 +101,13 @@ def parse_components(text): scriptFlg = 'script' in components return imgFlg, linkFlg, scriptFlg +def is_valid_url(url): + try: + response = requests.head(url, timeout=5) + return response.status_code == 200 + except requests.RequestException: + return False + async def send_progress(msg, chat_id, initial_text): try: for i in range(10): From 87ac1013a592395928d9b1d1754e82c0619d9950 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 10:07:38 +0530 Subject: [PATCH 33/66] Update web_dl.py --- web_dl.py | 77 ++++++++++++++++++++----------------------------------- 1 file changed, 28 insertions(+), 49 deletions(-) diff --git a/web_dl.py b/web_dl.py index 67423a5..77578fd 100644 --- a/web_dl.py +++ b/web_dl.py @@ -2,17 +2,15 @@ import re import sys import requests -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin from bs4 import BeautifulSoup from tqdm import tqdm from concurrent.futures import ThreadPoolExecutor -from typing import Tuple, Optional, Dict -class urlDownloader: +class urlDownloader(object): """Download the webpage components based on the input URL.""" - - def __init__(self, imgFlg: bool = True, 
linkFlg: bool = True, scriptFlg: bool = True, file_size_limit: Optional[int] = None, max_retries: int = 3, auth: Optional[Tuple[str, str]] = None): - self.soup: Optional[BeautifulSoup] = None + def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=None, max_retries=3, auth=None): + self.soup = None self.imgFlg = imgFlg self.linkFlg = linkFlg self.scriptFlg = scriptFlg @@ -21,54 +19,43 @@ def __init__(self, imgFlg: bool = True, linkFlg: bool = True, scriptFlg: bool = self.auth = auth self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') self.session = requests.Session() - self.summary: Dict[str, int] = { + self.summary = { 'images': 0, 'links': 0, 'scripts': 0 } - def savePage(self, url: str, pagefolder: str = 'page') -> Tuple[bool, Optional[str]]: + def savePage(self, url, pagefolder='page'): """Save the web page components based on the input URL and dir name.""" try: response = self.session.get(url, auth=self.auth) response.raise_for_status() - self.soup = BeautifulSoup(response.text, 'lxml') - os.makedirs(pagefolder, exist_ok=True) - - print(f"Starting to download components from {url}") - + self.soup = BeautifulSoup(response.text, features="lxml") + if not os.path.exists(pagefolder): + os.mkdir(pagefolder) if self.imgFlg: - print("Downloading images...") - self._soupfindnSave(url, pagefolder, 'img', 'src', 'images') + self._soupfindnSave(url, pagefolder, tag2find='img', inner='src', category='images') if self.linkFlg: - print("Downloading links...") - self._soupfindnSave(url, pagefolder, 'link', 'href', 'links') + self._soupfindnSave(url, pagefolder, tag2find='link', inner='href', category='links') if self.scriptFlg: - print("Downloading scripts...") - self._soupfindnSave(url, pagefolder, 'script', 'src', 'scripts') - + self._soupfindnSave(url, pagefolder, tag2find='script', inner='src', category='scripts') with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: file.write(self.soup.prettify('utf-8')) - summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts." 
- print(summary) return True, summary except Exception as e: print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) return False, None - def _download_file(self, fileurl: str, filepath: str) -> bool: + def _download_file(self, fileurl, filepath): """Download a file with retry mechanism.""" for attempt in range(self.max_retries): try: filebin = self.session.get(fileurl, stream=True, auth=self.auth) filebin.raise_for_status() - - file_size = int(filebin.headers.get('content-length', 0)) - if self.file_size_limit and file_size > self.file_size_limit: - print(f"File {fileurl} exceeds the size limit of {self.file_size_limit} bytes.", file=sys.stderr) + if self.file_size_limit and int(filebin.headers.get('content-length', 0)) > self.file_size_limit: + print(f"File {fileurl} exceeds the size limit.", file=sys.stderr) return False - with open(filepath, 'wb') as file: for chunk in filebin.iter_content(chunk_size=8192): if chunk: @@ -79,37 +66,29 @@ def _download_file(self, fileurl: str, filepath: str) -> bool: print(f"Attempt {attempt + 1} failed for {fileurl}: {exc}", file=sys.stderr) return False - def _soupfindnSave(self, url: str, pagefolder: str, tag2find: str, inner: str, category: str) -> None: - """Saves specified tag objects in the given folder.""" - folder_path = os.path.join(pagefolder, tag2find) - os.makedirs(folder_path, exist_ok=True) - - elements = self.soup.find_all(tag2find) + def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category='images'): + """Saves on specified pagefolder all tag2find objects.""" + pagefolder = os.path.join(pagefolder, tag2find) + if not os.path.exists(pagefolder): + os.mkdir(pagefolder) + elements = self.soup.findAll(tag2find) + if not elements: + print(f"No {tag2find} elements found.", file=sys.stderr) with ThreadPoolExecutor(max_workers=4) as executor: futures = [] for res in tqdm(elements, desc=f"Downloading {tag2find}"): if not res.has_attr(inner): continue - - filename = self._sanitize_filename(res.get(inner)) - if tag2find == 'link' and not any(ext in filename for ext in self.linkType): + filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) + if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): filename += '.html' - fileurl = urljoin(url, res.get(inner)) - filepath = os.path.join(folder_path, filename) - - res[inner] = os.path.join(os.path.basename(folder_path), filename) - + filepath = os.path.join(pagefolder, filename) + res[inner] = os.path.join(os.path.basename(pagefolder), filename) if not os.path.isfile(filepath): print(f"Downloading {fileurl} to {filepath}") # Debug statement futures.append(executor.submit(self._download_file, fileurl, filepath)) - for future in futures: if future.result(): self.summary[category] += 1 - - def _sanitize_filename(self, url: str) -> str: - """Sanitize the filename extracted from the URL.""" - parsed_url = urlparse(url) - filename = os.path.basename(parsed_url.path) - return re.sub(r'\W+', '.', filename) + print(f"Completed downloading {tag2find} elements. 
Total: {self.summary[category]}") # Debug statement From a4aa730fe75ce426662ec4ef131385ee107b7907 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 10:14:01 +0530 Subject: [PATCH 34/66] Update main.py --- main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 2fa2eb7..b19b3fa 100644 --- a/main.py +++ b/main.py @@ -95,10 +95,11 @@ async def webdl(_, m): progress_task.cancel() def parse_components(text): - components = text.split()[1:] + components = text.split()[1:] # Skip the URL part imgFlg = 'img' in components linkFlg = 'css' in components scriptFlg = 'script' in components + print(f"Parsed components: imgFlg={imgFlg}, linkFlg={linkFlg}, scriptFlg={scriptFlg}") # Debug statement return imgFlg, linkFlg, scriptFlg def is_valid_url(url): From 0e3df0cf0a819ca21fc442dd4464955284734718 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 10:17:37 +0530 Subject: [PATCH 35/66] Update main.py From a5d3f2f3a067d906ede819ed6c0a59c5cd73e1ea Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 10:22:31 +0530 Subject: [PATCH 36/66] Update main.py --- main.py | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/main.py b/main.py index b19b3fa..b30ab23 100644 --- a/main.py +++ b/main.py @@ -59,6 +59,8 @@ async def auth(bot, update): async def webdl(_, m): parts = m.text.split() url = parts[0] + components = parts[1:] # Extract components from the message + download_directly = False if not url.startswith('http'): return await m.reply("The URL must start with 'http' or 'https'") @@ -66,15 +68,29 @@ async def webdl(_, m): if not is_valid_url(url): return await m.reply("The URL is invalid or inaccessible") - # Check if user has credentials saved - credentials = get_credentials(m.chat.id) - auth = (credentials['username'], credentials['password']) if credentials else None - - msg = await m.reply('Processing...') - progress_task = asyncio.create_task(send_progress(msg, m.chat.id, "Processing...")) + # Check if components are specified in the message + if components: + imgFlg, linkFlg, scriptFlg = parse_components(' '.join(components)) + print(f"Flags - img: {imgFlg}, link: {linkFlg}, script: {scriptFlg}") # Debug statement + else: + # No components specified, prompt user with options + keyboard = InlineKeyboardMarkup( + [ + [ + InlineKeyboardButton("HTML", callback_data="html"), + InlineKeyboardButton("CSS", callback_data="css"), + InlineKeyboardButton("Images", callback_data="images") + ], + [ + InlineKeyboardButton("XML", callback_data="xml"), + InlineKeyboardButton("Video", callback_data="video"), + InlineKeyboardButton("JavaScript", callback_data="js") + ] + ] + ) + await m.reply("Please select which components to download:", reply_markup=keyboard) + return - imgFlg, linkFlg, scriptFlg = parse_components(m.text) - print(f"Flags - img: {imgFlg}, link: {linkFlg}, script: {scriptFlg}") # Debug statement name = dir = str(m.chat.id) if not os.path.isdir(dir): os.makedirs(dir) @@ -82,24 +98,21 @@ async def webdl(_, m): obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, file_size_limit=10*1024*1024, auth=auth) res, summary = obj.savePage(url, dir) if not res: - await msg.edit_text('Something went wrong!') - progress_task.cancel() - return + return await m.reply('Something went 
wrong!') shutil.make_archive(name, 'zip', base_dir=dir) await m.reply_document(name+'.zip', caption=summary) - await msg.delete() shutil.rmtree(dir) os.remove(name+'.zip') - progress_task.cancel() + + print("Download completed successfully!") # Debug statement def parse_components(text): - components = text.split()[1:] # Skip the URL part + components = text.split() imgFlg = 'img' in components linkFlg = 'css' in components scriptFlg = 'script' in components - print(f"Parsed components: imgFlg={imgFlg}, linkFlg={linkFlg}, scriptFlg={scriptFlg}") # Debug statement return imgFlg, linkFlg, scriptFlg def is_valid_url(url): From dc1de97c9a6fa69ce01af1778c717576f1af4ea4 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 10:55:21 +0530 Subject: [PATCH 37/66] Update main.py --- main.py | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/main.py b/main.py index b30ab23..d54bd23 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,100 @@ +# web_dl.py +import os +import re +import sys +import requests +from urllib.parse import urljoin +from bs4 import BeautifulSoup +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor + +class urlDownloader(object): + """Download the webpage components based on the input URL.""" + def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=None, max_retries=3, auth=None): + self.soup = None + self.imgFlg = imgFlg + self.linkFlg = linkFlg + self.scriptFlg = scriptFlg + self.file_size_limit = file_size_limit + self.max_retries = max_retries + self.auth = auth + self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') + self.session = requests.Session() + self.summary = { + 'images': 0, + 'links': 0, + 'scripts': 0 + } + + def savePage(self, url, pagefolder='page'): + """Save the web page components based on the input URL and dir name.""" + try: + response = self.session.get(url, auth=self.auth) + response.raise_for_status() + self.soup = BeautifulSoup(response.text, features="lxml") + if not os.path.exists(pagefolder): + os.mkdir(pagefolder) + if self.imgFlg: + self._soupfindnSave(url, pagefolder, tag2find='img', inner='src', category='images') + if self.linkFlg: + self._soupfindnSave(url, pagefolder, tag2find='link', inner='href', category='links') + if self.scriptFlg: + self._soupfindnSave(url, pagefolder, tag2find='script', inner='src', category='scripts') + with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: + file.write(self.soup.prettify('utf-8')) + summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts." 
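
An aside on the retry loop in _download_file(): it retries back-to-back with no delay. A common refinement, sketched here as an assumption rather than something this patch series adopts, is exponential backoff between attempts:

    import time
    import requests

    def get_with_backoff(session, url, max_retries=3, base_delay=1.0, **kwargs):
        """Fetch url, sleeping 1s, 2s, 4s, ... between failed attempts."""
        for attempt in range(max_retries):
            try:
                resp = session.get(url, timeout=10, **kwargs)
                resp.raise_for_status()
                return resp
            except requests.RequestException:
                if attempt == max_retries - 1:
                    raise  # out of retries; let the caller decide
                time.sleep(base_delay * 2 ** attempt)
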
+ return True, summary + except Exception as e: + print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) + return False, None + + def _download_file(self, fileurl, filepath): + """Download a file with retry mechanism.""" + for attempt in range(self.max_retries): + try: + filebin = self.session.get(fileurl, stream=True, auth=self.auth) + filebin.raise_for_status() + if self.file_size_limit and int(filebin.headers.get('content-length', 0)) > self.file_size_limit: + print(f"File {fileurl} exceeds the size limit.", file=sys.stderr) + return False + with open(filepath, 'wb') as file: + for chunk in filebin.iter_content(chunk_size=8192): + if chunk: + file.write(chunk) + print(f"Successfully downloaded {fileurl} to {filepath}") # Debug statement + return True + except requests.RequestException as exc: + print(f"Attempt {attempt + 1} failed for {fileurl}: {exc}", file=sys.stderr) + return False + + def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category='images'): + """Saves on specified pagefolder all tag2find objects.""" + pagefolder = os.path.join(pagefolder, tag2find) + if not os.path.exists(pagefolder): + os.mkdir(pagefolder) + elements = self.soup.findAll(tag2find) + if not elements: + print(f"No {tag2find} elements found.", file=sys.stderr) + with ThreadPoolExecutor(max_workers=4) as executor: + futures = [] + for res in tqdm(elements, desc=f"Downloading {tag2find}"): + if not res.has_attr(inner): + continue + filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) + if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): + filename += '.html' + fileurl = urljoin(url, res.get(inner)) + filepath = os.path.join(pagefolder, filename) + res[inner] = os.path.join(os.path.basename(pagefolder), filename) + if not os.path.isfile(filepath): + print(f"Downloading {fileurl} to {filepath}") # Debug statement + futures.append(executor.submit(self._download_file, fileurl, filepath)) + for future in futures: + if future.result(): + self.summary[category] += 1 + print(f"Completed downloading {tag2find} elements. 
Total: {self.summary[category]}") # Debug statement + +# main.py import os import sys import shutil @@ -8,6 +105,7 @@ from auth import add_credentials, get_credentials import asyncio +# Bot configuration using environment variables BOT_TOKEN = os.environ.get("BOT_TOKEN") API_ID = os.environ.get("API_ID") API_HASH = os.environ.get("API_HASH") From 630622a3026669efe70d338ee6930cb1b35deb6e Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 10:57:55 +0530 Subject: [PATCH 38/66] Update main.py --- main.py | 161 ++++++++++++-------------------------------------------- 1 file changed, 35 insertions(+), 126 deletions(-) diff --git a/main.py b/main.py index d54bd23..2d4a0ac 100644 --- a/main.py +++ b/main.py @@ -1,106 +1,10 @@ -# web_dl.py -import os -import re -import sys -import requests -from urllib.parse import urljoin -from bs4 import BeautifulSoup -from tqdm import tqdm -from concurrent.futures import ThreadPoolExecutor - -class urlDownloader(object): - """Download the webpage components based on the input URL.""" - def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=None, max_retries=3, auth=None): - self.soup = None - self.imgFlg = imgFlg - self.linkFlg = linkFlg - self.scriptFlg = scriptFlg - self.file_size_limit = file_size_limit - self.max_retries = max_retries - self.auth = auth - self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') - self.session = requests.Session() - self.summary = { - 'images': 0, - 'links': 0, - 'scripts': 0 - } - - def savePage(self, url, pagefolder='page'): - """Save the web page components based on the input URL and dir name.""" - try: - response = self.session.get(url, auth=self.auth) - response.raise_for_status() - self.soup = BeautifulSoup(response.text, features="lxml") - if not os.path.exists(pagefolder): - os.mkdir(pagefolder) - if self.imgFlg: - self._soupfindnSave(url, pagefolder, tag2find='img', inner='src', category='images') - if self.linkFlg: - self._soupfindnSave(url, pagefolder, tag2find='link', inner='href', category='links') - if self.scriptFlg: - self._soupfindnSave(url, pagefolder, tag2find='script', inner='src', category='scripts') - with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: - file.write(self.soup.prettify('utf-8')) - summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts." 
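
A caveat on the file_size_limit check that these patches move around: it trusts the Content-Length header, which servers may omit or understate. A defensive variant, shown only as a sketch, enforces the limit on the bytes actually received:

    def save_stream_limited(response, filepath, limit):
        """Stream response to filepath; stop and report failure past `limit` bytes."""
        written = 0
        with open(filepath, 'wb') as fh:
            for chunk in response.iter_content(chunk_size=8192):
                written += len(chunk)
                if limit and written > limit:
                    return False  # caller should remove the partial file
                fh.write(chunk)
        return True
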
- return True, summary - except Exception as e: - print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) - return False, None - - def _download_file(self, fileurl, filepath): - """Download a file with retry mechanism.""" - for attempt in range(self.max_retries): - try: - filebin = self.session.get(fileurl, stream=True, auth=self.auth) - filebin.raise_for_status() - if self.file_size_limit and int(filebin.headers.get('content-length', 0)) > self.file_size_limit: - print(f"File {fileurl} exceeds the size limit.", file=sys.stderr) - return False - with open(filepath, 'wb') as file: - for chunk in filebin.iter_content(chunk_size=8192): - if chunk: - file.write(chunk) - print(f"Successfully downloaded {fileurl} to {filepath}") # Debug statement - return True - except requests.RequestException as exc: - print(f"Attempt {attempt + 1} failed for {fileurl}: {exc}", file=sys.stderr) - return False - - def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category='images'): - """Saves on specified pagefolder all tag2find objects.""" - pagefolder = os.path.join(pagefolder, tag2find) - if not os.path.exists(pagefolder): - os.mkdir(pagefolder) - elements = self.soup.findAll(tag2find) - if not elements: - print(f"No {tag2find} elements found.", file=sys.stderr) - with ThreadPoolExecutor(max_workers=4) as executor: - futures = [] - for res in tqdm(elements, desc=f"Downloading {tag2find}"): - if not res.has_attr(inner): - continue - filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) - if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): - filename += '.html' - fileurl = urljoin(url, res.get(inner)) - filepath = os.path.join(pagefolder, filename) - res[inner] = os.path.join(os.path.basename(pagefolder), filename) - if not os.path.isfile(filepath): - print(f"Downloading {fileurl} to {filepath}") # Debug statement - futures.append(executor.submit(self._download_file, fileurl, filepath)) - for future in futures: - if future.result(): - self.summary[category] += 1 - print(f"Completed downloading {tag2find} elements. 
Total: {self.summary[category]}") # Debug statement - # main.py import os import sys import shutil import requests from pyrogram import Client, filters -from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton +from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery from web_dl import urlDownloader from auth import add_credentials, get_credentials import asyncio @@ -155,10 +59,7 @@ async def auth(bot, update): @Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth')) async def webdl(_, m): - parts = m.text.split() - url = parts[0] - components = parts[1:] # Extract components from the message - download_directly = False + url = m.text.strip() if not url.startswith('http'): return await m.reply("The URL must start with 'http' or 'https'") @@ -166,40 +67,46 @@ async def webdl(_, m): if not is_valid_url(url): return await m.reply("The URL is invalid or inaccessible") - # Check if components are specified in the message - if components: - imgFlg, linkFlg, scriptFlg = parse_components(' '.join(components)) - print(f"Flags - img: {imgFlg}, link: {linkFlg}, script: {scriptFlg}") # Debug statement - else: - # No components specified, prompt user with options - keyboard = InlineKeyboardMarkup( + # Show buttons for selecting components to download + keyboard = InlineKeyboardMarkup( + [ + [ + InlineKeyboardButton("HTML", callback_data=f"html|{url}"), + InlineKeyboardButton("CSS", callback_data=f"css|{url}"), + InlineKeyboardButton("Images", callback_data=f"img|{url}") + ], [ - [ - InlineKeyboardButton("HTML", callback_data="html"), - InlineKeyboardButton("CSS", callback_data="css"), - InlineKeyboardButton("Images", callback_data="images") - ], - [ - InlineKeyboardButton("XML", callback_data="xml"), - InlineKeyboardButton("Video", callback_data="video"), - InlineKeyboardButton("JavaScript", callback_data="js") - ] + InlineKeyboardButton("XML", callback_data=f"xml|{url}"), + InlineKeyboardButton("Video", callback_data=f"video|{url}"), + InlineKeyboardButton("JavaScript", callback_data=f"script|{url}") ] - ) - await m.reply("Please select which components to download:", reply_markup=keyboard) - return + ] + ) + await m.reply("Please select which components to download:", reply_markup=keyboard) + +@Bot.on_callback_query() +async def callback_query_handler(bot, update: CallbackQuery): + data = update.data.split("|") + component, url = data[0], data[1] + + imgFlg = component == 'img' + linkFlg = component == 'css' + scriptFlg = component == 'script' + videoFlg = component == 'video' + xmlFlg = component == 'xml' - name = dir = str(m.chat.id) + name = dir = str(update.message.chat.id) if not os.path.isdir(dir): os.makedirs(dir) - obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, file_size_limit=10*1024*1024, auth=auth) + auth = get_credentials(update.from_user.id) + obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, videoFlg=videoFlg, xmlFlg=xmlFlg, file_size_limit=10*1024*1024, auth=auth) res, summary = obj.savePage(url, dir) if not res: - return await m.reply('Something went wrong!') + return await update.message.reply('Something went wrong!') shutil.make_archive(name, 'zip', base_dir=dir) - await m.reply_document(name+'.zip', caption=summary) + await update.message.reply_document(name+'.zip', caption=summary) shutil.rmtree(dir) os.remove(name+'.zip') @@ -211,7 +118,9 @@ def parse_components(text): imgFlg = 'img' in components linkFlg = 'css' in components scriptFlg = 'script' in components - 
return imgFlg, linkFlg, scriptFlg + videoFlg = 'video' in components + xmlFlg = 'xml' in components + return imgFlg, linkFlg, scriptFlg, videoFlg, xmlFlg def is_valid_url(url): try: From 69337d1ddedf2c3bb605b4707502519ea4a23db0 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 10:58:25 +0530 Subject: [PATCH 39/66] Update web_dl.py --- web_dl.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/web_dl.py b/web_dl.py index 77578fd..019709f 100644 --- a/web_dl.py +++ b/web_dl.py @@ -1,3 +1,4 @@ +# web_dl.py import os import re import sys @@ -9,20 +10,25 @@ class urlDownloader(object): """Download the webpage components based on the input URL.""" - def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, file_size_limit=None, max_retries=3, auth=None): + def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, file_size_limit=None, max_retries=3, auth=None): self.soup = None self.imgFlg = imgFlg self.linkFlg = linkFlg self.scriptFlg = scriptFlg + self.videoFlg = videoFlg + self.xmlFlg = xmlFlg self.file_size_limit = file_size_limit self.max_retries = max_retries self.auth = auth self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') + self.videoType = ('mp4', 'webm', 'ogg') self.session = requests.Session() self.summary = { 'images': 0, 'links': 0, - 'scripts': 0 + 'scripts': 0, + 'videos': 0, + 'xmls': 0 } def savePage(self, url, pagefolder='page'): @@ -39,9 +45,13 @@ def savePage(self, url, pagefolder='page'): self._soupfindnSave(url, pagefolder, tag2find='link', inner='href', category='links') if self.scriptFlg: self._soupfindnSave(url, pagefolder, tag2find='script', inner='src', category='scripts') + if self.videoFlg: + self._soupfindnSave(url, pagefolder, tag2find='video', inner='src', category='videos') + if self.xmlFlg: + self._soupfindnSave(url, pagefolder, tag2find='xml', inner='src', category='xmls') with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: file.write(self.soup.prettify('utf-8')) - summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts." + summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls." return True, summary except Exception as e: print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) From efe5cd85f868db12c21b2f5304f38a586610461c Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:02:35 +0530 Subject: [PATCH 40/66] Update web_dl.py --- web_dl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web_dl.py b/web_dl.py index 019709f..e1cfefa 100644 --- a/web_dl.py +++ b/web_dl.py @@ -1,4 +1,3 @@ -# web_dl.py import os import re import sys @@ -51,7 +50,8 @@ def savePage(self, url, pagefolder='page'): self._soupfindnSave(url, pagefolder, tag2find='xml', inner='src', category='xmls') with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: file.write(self.soup.prettify('utf-8')) - summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls." 
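
The one-line summary literal rewritten here keeps growing as categories are added. Deriving the text from the summary dict keeps it in sync automatically; a later patch in this series adds a generate_summary() helper in the same spirit, so this sketch merely anticipates it:

    def format_summary(summary):
        parts = [f"{count} {name}" for name, count in summary.items()]
        return "Downloaded: " + ", ".join(parts) + "."

    # format_summary({'images': 2, 'links': 1, 'scripts': 0})
    # -> 'Downloaded: 2 images, 1 links, 0 scripts.'
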
+ summary = (f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, " + f"{self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls.") return True, summary except Exception as e: print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) From 3ad64808e4bb786f565bc3f3f9fdea6f989d5518 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:04:17 +0530 Subject: [PATCH 41/66] Update main.py --- main.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 2d4a0ac..953db6a 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,3 @@ -# main.py import os import sys import shutil @@ -86,8 +85,8 @@ async def webdl(_, m): @Bot.on_callback_query() async def callback_query_handler(bot, update: CallbackQuery): - data = update.data.split("|") - component, url = data[0], data[1] + data = update.data + component, url = data.split('|', 1) imgFlg = component == 'img' linkFlg = component == 'css' From 67c0ea311073f2f08c98722ff9c35cab97a62eec Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:07:35 +0530 Subject: [PATCH 42/66] Update main.py --- main.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/main.py b/main.py index 953db6a..ec94e87 100644 --- a/main.py +++ b/main.py @@ -70,14 +70,14 @@ async def webdl(_, m): keyboard = InlineKeyboardMarkup( [ [ - InlineKeyboardButton("HTML", callback_data=f"html|{url}"), - InlineKeyboardButton("CSS", callback_data=f"css|{url}"), - InlineKeyboardButton("Images", callback_data=f"img|{url}") + InlineKeyboardButton("HTML", callback_data=f"h|{url}"), + InlineKeyboardButton("CSS", callback_data=f"c|{url}"), + InlineKeyboardButton("Images", callback_data=f"i|{url}") ], [ - InlineKeyboardButton("XML", callback_data=f"xml|{url}"), - InlineKeyboardButton("Video", callback_data=f"video|{url}"), - InlineKeyboardButton("JavaScript", callback_data=f"script|{url}") + InlineKeyboardButton("XML", callback_data=f"x|{url}"), + InlineKeyboardButton("Video", callback_data=f"v|{url}"), + InlineKeyboardButton("JavaScript", callback_data=f"j|{url}") ] ] ) @@ -88,11 +88,11 @@ async def callback_query_handler(bot, update: CallbackQuery): data = update.data component, url = data.split('|', 1) - imgFlg = component == 'img' - linkFlg = component == 'css' - scriptFlg = component == 'script' - videoFlg = component == 'video' - xmlFlg = component == 'xml' + imgFlg = component == 'i' + linkFlg = component == 'c' + scriptFlg = component == 'j' + videoFlg = component == 'v' + xmlFlg = component == 'x' name = dir = str(update.message.chat.id) if not os.path.isdir(dir): From 32a0f9be5fea48d88ebabdb1a1c35daf88a85172 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:10:35 +0530 Subject: [PATCH 43/66] Update main.py --- main.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index ec94e87..6b2553a 100644 --- a/main.py +++ b/main.py @@ -70,14 +70,14 @@ async def webdl(_, m): keyboard = InlineKeyboardMarkup( [ [ - InlineKeyboardButton("HTML", callback_data=f"h|{url}"), - InlineKeyboardButton("CSS", callback_data=f"c|{url}"), - InlineKeyboardButton("Images", callback_data=f"i|{url}") + InlineKeyboardButton("HTML", callback_data=f"h|{url[:50]}"), + InlineKeyboardButton("CSS", 
callback_data=f"c|{url[:50]}"), + InlineKeyboardButton("Images", callback_data=f"i|{url[:50]}") ], [ - InlineKeyboardButton("XML", callback_data=f"x|{url}"), - InlineKeyboardButton("Video", callback_data=f"v|{url}"), - InlineKeyboardButton("JavaScript", callback_data=f"j|{url}") + InlineKeyboardButton("XML", callback_data=f"x|{url[:50]}"), + InlineKeyboardButton("Video", callback_data=f"v|{url[:50]}"), + InlineKeyboardButton("JavaScript", callback_data=f"j|{url[:50]}") ] ] ) @@ -86,7 +86,10 @@ async def webdl(_, m): @Bot.on_callback_query() async def callback_query_handler(bot, update: CallbackQuery): data = update.data - component, url = data.split('|', 1) + component, url_fragment = data.split('|', 1) + + # Reconstruct the URL + url = update.message.reply_to_message.text.split()[0] imgFlg = component == 'i' linkFlg = component == 'c' From 7fdf7eb35afb5511cd7686af76c7b54b9959ea88 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:13:28 +0530 Subject: [PATCH 44/66] Update main.py --- main.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/main.py b/main.py index 6b2553a..ec94e87 100644 --- a/main.py +++ b/main.py @@ -70,14 +70,14 @@ async def webdl(_, m): keyboard = InlineKeyboardMarkup( [ [ - InlineKeyboardButton("HTML", callback_data=f"h|{url[:50]}"), - InlineKeyboardButton("CSS", callback_data=f"c|{url[:50]}"), - InlineKeyboardButton("Images", callback_data=f"i|{url[:50]}") + InlineKeyboardButton("HTML", callback_data=f"h|{url}"), + InlineKeyboardButton("CSS", callback_data=f"c|{url}"), + InlineKeyboardButton("Images", callback_data=f"i|{url}") ], [ - InlineKeyboardButton("XML", callback_data=f"x|{url[:50]}"), - InlineKeyboardButton("Video", callback_data=f"v|{url[:50]}"), - InlineKeyboardButton("JavaScript", callback_data=f"j|{url[:50]}") + InlineKeyboardButton("XML", callback_data=f"x|{url}"), + InlineKeyboardButton("Video", callback_data=f"v|{url}"), + InlineKeyboardButton("JavaScript", callback_data=f"j|{url}") ] ] ) @@ -86,10 +86,7 @@ async def webdl(_, m): @Bot.on_callback_query() async def callback_query_handler(bot, update: CallbackQuery): data = update.data - component, url_fragment = data.split('|', 1) - - # Reconstruct the URL - url = update.message.reply_to_message.text.split()[0] + component, url = data.split('|', 1) imgFlg = component == 'i' linkFlg = component == 'c' From 0ecca274a20e0f3998ddee33179cf118456b2109 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:26:30 +0530 Subject: [PATCH 45/66] Update web_dl.py --- web_dl.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/web_dl.py b/web_dl.py index e1cfefa..0e53aa7 100644 --- a/web_dl.py +++ b/web_dl.py @@ -9,7 +9,7 @@ class urlDownloader(object): """Download the webpage components based on the input URL.""" - def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, file_size_limit=None, max_retries=3, auth=None): + def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=False, xmlFlg=False, file_size_limit=None, max_retries=3, auth=None): self.soup = None self.imgFlg = imgFlg self.linkFlg = linkFlg @@ -20,7 +20,6 @@ def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xml self.max_retries = max_retries self.auth = auth self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') - self.videoType = ('mp4', 'webm', 
'ogg') self.session = requests.Session() self.summary = { 'images': 0, @@ -50,8 +49,7 @@ def savePage(self, url, pagefolder='page'): self._soupfindnSave(url, pagefolder, tag2find='xml', inner='src', category='xmls') with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: file.write(self.soup.prettify('utf-8')) - summary = (f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, " - f"{self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls.") + summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls." return True, summary except Exception as e: print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) From 3bb448747f5d96bee81eaab5fa0feee38b7e1358 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:28:29 +0530 Subject: [PATCH 46/66] Update main.py From 26ed465b55ee4d1d2acdd8328be1403b5f6222c2 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:33:21 +0530 Subject: [PATCH 47/66] Update web_dl.py --- web_dl.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/web_dl.py b/web_dl.py index 0e53aa7..731d76a 100644 --- a/web_dl.py +++ b/web_dl.py @@ -9,7 +9,7 @@ class urlDownloader(object): """Download the webpage components based on the input URL.""" - def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=False, xmlFlg=False, file_size_limit=None, max_retries=3, auth=None): + def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, file_size_limit=None, max_retries=3, auth=None): self.soup = None self.imgFlg = imgFlg self.linkFlg = linkFlg @@ -20,6 +20,7 @@ def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=False, xm self.max_retries = max_retries self.auth = auth self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') + self.videoType = ('mp4', 'webm', 'ogg') self.session = requests.Session() self.summary = { 'images': 0, @@ -49,8 +50,8 @@ def savePage(self, url, pagefolder='page'): self._soupfindnSave(url, pagefolder, tag2find='xml', inner='src', category='xmls') with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: file.write(self.soup.prettify('utf-8')) - summary = f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, {self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls." 
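
Background on the h|{url} and url[:50] experiments in the surrounding patches: Telegram caps callback_data at 64 bytes, so a full URL often does not fit, and truncating it corrupts the link. A common workaround, sketched as an assumption rather than what this bot settles on, is to hand the client a short token and keep the URL server-side:

    import uuid

    PENDING = {}  # token -> url; in-memory, so lost on restart

    def make_callback(component, url):
        token = uuid.uuid4().hex[:8]
        PENDING[token] = url
        return f"{component}|{token}"  # e.g. "i|9f3a1c2b", well under 64 bytes

    def resolve_callback(data):
        component, token = data.split('|', 1)
        return component, PENDING.pop(token, None)  # None if the token is stale
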
- return True, summary + summary = (f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, " + f"{self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls.") except Exception as e: print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) return False, None From 6d5e14f529f0fc254eb6b2dff2b826dcddaa444f Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:37:59 +0530 Subject: [PATCH 48/66] Update web_dl.py --- web_dl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/web_dl.py b/web_dl.py index 731d76a..e1cfefa 100644 --- a/web_dl.py +++ b/web_dl.py @@ -52,6 +52,7 @@ def savePage(self, url, pagefolder='page'): file.write(self.soup.prettify('utf-8')) summary = (f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, " f"{self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls.") + return True, summary except Exception as e: print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) return False, None From 12e655cb589ec43a862f50017425cafec78f0356 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:42:42 +0530 Subject: [PATCH 49/66] Update main.py --- main.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index ec94e87..8038486 100644 --- a/main.py +++ b/main.py @@ -5,7 +5,7 @@ from pyrogram import Client, filters from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery from web_dl import urlDownloader -from auth import add_credentials, get_credentials +from auth import add_credentials, get_credentials, remove_credentials, get_all_credentials import asyncio # Bot configuration using environment variables @@ -29,6 +29,8 @@ 'https://www.google.com img,css,script' Use /auth username:password to add your authentication credentials. +Use /remove_auth to remove your authentication credentials. +Use /view_auth to view your stored authentication credentials. 
""" START_BTN = InlineKeyboardMarkup( @@ -56,7 +58,23 @@ async def auth(bot, update): add_credentials(update.from_user.id, username, password) await update.reply_text("Credentials saved successfully.") -@Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth')) +@Bot.on_message(filters.command(["remove_auth"])) +async def remove_auth(bot, update): + success = remove_credentials(update.from_user.id) + if success: + await update.reply_text("Credentials removed successfully.") + else: + await update.reply_text("No credentials found to remove.") + +@Bot.on_message(filters.command(["view_auth"])) +async def view_auth(bot, update): + creds = get_credentials(update.from_user.id) + if creds: + await update.reply_text(f"Your credentials:\nUsername: {creds['username']}\nPassword: {creds['password']}") + else: + await update.reply_text("No credentials found.") + +@Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth|/remove_auth|/view_auth')) async def webdl(_, m): url = m.text.strip() @@ -77,7 +95,7 @@ async def webdl(_, m): [ InlineKeyboardButton("XML", callback_data=f"x|{url}"), InlineKeyboardButton("Video", callback_data=f"v|{url}"), - InlineKeyboardButton("JavaScript", callback_data=f"j|{url}") + InlineKeyboardButton("JS", callback_data=f"j|{url}") ] ] ) From 63476d3caf7ea28c6137c4f35ba5e4346d59bac1 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:44:24 +0530 Subject: [PATCH 50/66] Update main.py --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 8038486..4d04155 100644 --- a/main.py +++ b/main.py @@ -5,7 +5,7 @@ from pyrogram import Client, filters from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery from web_dl import urlDownloader -from auth import add_credentials, get_credentials, remove_credentials, get_all_credentials +from auth import add_credentials, get_credentials, remove_credentials import asyncio # Bot configuration using environment variables From 2b14828789581f3dbecb5ef1e4b3d5259dfadd5c Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:45:26 +0530 Subject: [PATCH 51/66] Update web_dl.py --- web_dl.py | 88 +++++++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 45 deletions(-) diff --git a/web_dl.py b/web_dl.py index e1cfefa..b770a8b 100644 --- a/web_dl.py +++ b/web_dl.py @@ -54,51 +54,49 @@ def savePage(self, url, pagefolder='page'): f"{self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls.") return True, summary except Exception as e: - print(f"> savePage(): Create files failed: {str(e)}.", file=sys.stderr) - return False, None - - def _download_file(self, fileurl, filepath): - """Download a file with retry mechanism.""" - for attempt in range(self.max_retries): - try: - filebin = self.session.get(fileurl, stream=True, auth=self.auth) - filebin.raise_for_status() - if self.file_size_limit and int(filebin.headers.get('content-length', 0)) > self.file_size_limit: - print(f"File {fileurl} exceeds the size limit.", file=sys.stderr) - return False - with open(filepath, 'wb') as file: - for chunk in filebin.iter_content(chunk_size=8192): - if chunk: - file.write(chunk) - print(f"Successfully downloaded {fileurl} to {filepath}") # Debug statement - return True - except requests.RequestException as exc: - print(f"Attempt {attempt + 1} 
failed for {fileurl}: {exc}", file=sys.stderr) - return False + print(f"> savePage(): Create page error: {str(e)}") + return False, str(e) def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category='images'): - """Saves on specified pagefolder all tag2find objects.""" - pagefolder = os.path.join(pagefolder, tag2find) - if not os.path.exists(pagefolder): - os.mkdir(pagefolder) - elements = self.soup.findAll(tag2find) - if not elements: - print(f"No {tag2find} elements found.", file=sys.stderr) - with ThreadPoolExecutor(max_workers=4) as executor: + """Find and save the components from the soup object.""" + folder = os.path.join(pagefolder, category) + if not os.path.exists(folder): + os.mkdir(folder) + with ThreadPoolExecutor(max_workers=10) as executor: futures = [] - for res in tqdm(elements, desc=f"Downloading {tag2find}"): - if not res.has_attr(inner): - continue - filename = re.sub(r'\W+', '.', os.path.basename(res[inner])) - if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)): - filename += '.html' - fileurl = urljoin(url, res.get(inner)) - filepath = os.path.join(pagefolder, filename) - res[inner] = os.path.join(os.path.basename(pagefolder), filename) - if not os.path.isfile(filepath): - print(f"Downloading {fileurl} to {filepath}") # Debug statement - futures.append(executor.submit(self._download_file, fileurl, filepath)) - for future in futures: - if future.result(): - self.summary[category] += 1 - print(f"Completed downloading {tag2find} elements. Total: {self.summary[category]}") # Debug statement + for tag in self.soup.find_all(tag2find): + try: + turl = tag.get(inner) + if turl is None: + continue + turl = turl.split('?')[0] + filename = os.path.basename(turl).strip().replace(" ", "_") + if len(filename) > 25: + filename = filename[-25:] + savepath = os.path.join(folder, filename) + if not turl.startswith("http"): + turl = urljoin(url, turl) + futures.append(executor.submit(self._download_file, turl, savepath, category)) + except Exception as e: + print(f"> _soupfindnSave(): Inner exception: {str(e)}") + for future in tqdm(futures, desc=f"Downloading {category}"): + try: + future.result() + except Exception as e: + print(f"> _soupfindnSave(): Future exception: {str(e)}") + + def _download_file(self, url, savepath, category): + """Download a file from a URL to a local path.""" + try: + headers = {"User-Agent": "Mozilla/5.0"} + response = self.session.get(url, headers=headers, stream=True, auth=self.auth) + response.raise_for_status() + if self.file_size_limit and int(response.headers.get('content-length', 0)) > self.file_size_limit: + print(f"Skipping {url} due to size limit.") + return + with open(savepath, 'wb') as file: + for chunk in response.iter_content(1024): + file.write(chunk) + self.summary[category] += 1 + except Exception as e: + print(f"> _download_file(): Download error for {url}: {str(e)}") From cb6ac1b2d13fb42039a81378cc04734ca0a1efaf Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:46:01 +0530 Subject: [PATCH 52/66] Update auth.py --- auth.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/auth.py b/auth.py index a4ea522..7f55383 100644 --- a/auth.py +++ b/auth.py @@ -1,5 +1,3 @@ -# auth.py - import os import json @@ -17,7 +15,7 @@ def save_auth_data(data): def add_credentials(user_id, username, password): data = load_auth_data() - data[user_id] = {'username': username, 'password': password} + data[str(user_id)] = 
{'username': username, 'password': password} save_auth_data(data) def get_credentials(user_id): @@ -29,3 +27,5 @@ def remove_credentials(user_id): if str(user_id) in data: del data[str(user_id)] save_auth_data(data) + return True + return False From 61c55d7cebe960e13e7adfcd6d19daa1bf25be67 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:51:35 +0530 Subject: [PATCH 53/66] Update main.py --- main.py | 41 +++++++++-------------------------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/main.py b/main.py index 4d04155..8670917 100644 --- a/main.py +++ b/main.py @@ -88,14 +88,14 @@ async def webdl(_, m): keyboard = InlineKeyboardMarkup( [ [ - InlineKeyboardButton("HTML", callback_data=f"h|{url}"), - InlineKeyboardButton("CSS", callback_data=f"c|{url}"), - InlineKeyboardButton("Images", callback_data=f"i|{url}") + InlineKeyboardButton("HTML", callback_data=f"h|{url[:50]}"), + InlineKeyboardButton("CSS", callback_data=f"c|{url[:50]}"), + InlineKeyboardButton("Images", callback_data=f"i|{url[:50]}") ], [ - InlineKeyboardButton("XML", callback_data=f"x|{url}"), - InlineKeyboardButton("Video", callback_data=f"v|{url}"), - InlineKeyboardButton("JS", callback_data=f"j|{url}") + InlineKeyboardButton("XML", callback_data=f"x|{url[:50]}"), + InlineKeyboardButton("Video", callback_data=f"v|{url[:50]}"), + InlineKeyboardButton("JS", callback_data=f"j|{url[:50]}") ] ] ) @@ -122,23 +122,15 @@ async def callback_query_handler(bot, update: CallbackQuery): if not res: return await update.message.reply('Something went wrong!') + zip_filename = f"{name}.zip" shutil.make_archive(name, 'zip', base_dir=dir) - await update.message.reply_document(name+'.zip', caption=summary) + await update.message.reply_document(zip_filename, caption=summary) shutil.rmtree(dir) - os.remove(name+'.zip') + os.remove(zip_filename) print("Download completed successfully!") # Debug statement -def parse_components(text): - components = text.split() - imgFlg = 'img' in components - linkFlg = 'css' in components - scriptFlg = 'script' in components - videoFlg = 'video' in components - xmlFlg = 'xml' in components - return imgFlg, linkFlg, scriptFlg, videoFlg, xmlFlg - def is_valid_url(url): try: response = requests.head(url, timeout=5) @@ -146,19 +138,4 @@ def is_valid_url(url): except requests.RequestException: return False -async def send_progress(msg, chat_id, initial_text): - try: - for i in range(10): - await asyncio.sleep(1) - try: - await Bot.edit_message_text(chat_id=chat_id, message_id=msg.id, text=f"{initial_text}\nProgress: {i*10}%") - except Exception as e: - if "MESSAGE_ID_INVALID" in str(e): - print(f"Message ID invalid: {e}", file=sys.stderr) - break - print(f"Error updating progress: {e}", file=sys.stderr) - continue - except Exception as e: - print(f"Error in send_progress loop: {e}", file=sys.stderr) - Bot.run() From 5dac34cdbf7ed4e7cfaaa1aa27dbb24abab27ae4 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 11:52:13 +0530 Subject: [PATCH 54/66] Update web_dl.py From 84fd81f4c87ebcbb13de156147d30131ef3d2f74 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 12:06:31 +0530 Subject: [PATCH 55/66] Update web_dl.py --- web_dl.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/web_dl.py b/web_dl.py index b770a8b..54caf38 100644 --- a/web_dl.py +++ b/web_dl.py @@ -1,14 
+1,12 @@ import os import re -import sys import requests from urllib.parse import urljoin from bs4 import BeautifulSoup from tqdm import tqdm from concurrent.futures import ThreadPoolExecutor -class urlDownloader(object): - """Download the webpage components based on the input URL.""" +class urlDownloader: def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, file_size_limit=None, max_retries=3, auth=None): self.soup = None self.imgFlg = imgFlg @@ -31,7 +29,6 @@ def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xml } def savePage(self, url, pagefolder='page'): - """Save the web page components based on the input URL and dir name.""" try: response = self.session.get(url, auth=self.auth) response.raise_for_status() @@ -58,7 +55,6 @@ def savePage(self, url, pagefolder='page'): return False, str(e) def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category='images'): - """Find and save the components from the soup object.""" folder = os.path.join(pagefolder, category) if not os.path.exists(folder): os.mkdir(folder) @@ -86,7 +82,6 @@ def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category= print(f"> _soupfindnSave(): Future exception: {str(e)}") def _download_file(self, url, savepath, category): - """Download a file from a URL to a local path.""" try: headers = {"User-Agent": "Mozilla/5.0"} response = self.session.get(url, headers=headers, stream=True, auth=self.auth) From c8e4b7f784d65452744eeab88a62b697b0af77b6 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 12:07:07 +0530 Subject: [PATCH 56/66] Update main.py --- main.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/main.py b/main.py index 8670917..3264f37 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,4 @@ import os -import sys import shutil import requests from pyrogram import Client, filters @@ -39,7 +38,7 @@ ]] ) -@Bot.on_message(filters.command(["start"])) +@Bot.on_message(filters.command("start")) async def start(bot, update): text = START_TXT.format(update.from_user.mention) reply_markup = START_BTN @@ -49,7 +48,7 @@ async def start(bot, update): reply_markup=reply_markup ) -@Bot.on_message(filters.command(["auth"])) +@Bot.on_message(filters.command("auth")) async def auth(bot, update): if len(update.command) != 2 or ':' not in update.command[1]: return await update.reply_text("Please send your username and password in the format 'username:password'") @@ -58,7 +57,7 @@ async def auth(bot, update): add_credentials(update.from_user.id, username, password) await update.reply_text("Credentials saved successfully.") -@Bot.on_message(filters.command(["remove_auth"])) +@Bot.on_message(filters.command("remove_auth")) async def remove_auth(bot, update): success = remove_credentials(update.from_user.id) if success: @@ -66,7 +65,7 @@ async def remove_auth(bot, update): else: await update.reply_text("No credentials found to remove.") -@Bot.on_message(filters.command(["view_auth"])) +@Bot.on_message(filters.command("view_auth")) async def view_auth(bot, update): creds = get_credentials(update.from_user.id) if creds: @@ -74,7 +73,7 @@ async def view_auth(bot, update): else: await update.reply_text("No credentials found.") -@Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth|/remove_auth|/view_auth')) +@Bot.on_message(filters.private & filters.text & ~filters.command(["start", "auth", "remove_auth", 
"view_auth"])) async def webdl(_, m): url = m.text.strip() @@ -88,14 +87,14 @@ async def webdl(_, m): keyboard = InlineKeyboardMarkup( [ [ - InlineKeyboardButton("HTML", callback_data=f"h|{url[:50]}"), - InlineKeyboardButton("CSS", callback_data=f"c|{url[:50]}"), - InlineKeyboardButton("Images", callback_data=f"i|{url[:50]}") + InlineKeyboardButton("HTML", callback_data=f"h|{url}"), + InlineKeyboardButton("CSS", callback_data=f"c|{url}"), + InlineKeyboardButton("Images", callback_data=f"i|{url}") ], [ - InlineKeyboardButton("XML", callback_data=f"x|{url[:50]}"), - InlineKeyboardButton("Video", callback_data=f"v|{url[:50]}"), - InlineKeyboardButton("JS", callback_data=f"j|{url[:50]}") + InlineKeyboardButton("XML", callback_data=f"x|{url}"), + InlineKeyboardButton("Video", callback_data=f"v|{url}"), + InlineKeyboardButton("JS", callback_data=f"j|{url}") ] ] ) From 59df0d52c8e358fd931d10b2d19d5b76f9b464d0 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 15:03:03 +0530 Subject: [PATCH 57/66] Update main.py --- main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 3264f37..39ea77d 100644 --- a/main.py +++ b/main.py @@ -110,13 +110,14 @@ async def callback_query_handler(bot, update: CallbackQuery): scriptFlg = component == 'j' videoFlg = component == 'v' xmlFlg = component == 'x' + htmlFlg = component == 'h' name = dir = str(update.message.chat.id) if not os.path.isdir(dir): os.makedirs(dir) auth = get_credentials(update.from_user.id) - obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, videoFlg=videoFlg, xmlFlg=xmlFlg, file_size_limit=10*1024*1024, auth=auth) + obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, videoFlg=videoFlg, xmlFlg=xmlFlg, htmlFlg=htmlFlg, file_size_limit=10*1024*1024, auth=auth) res, summary = obj.savePage(url, dir) if not res: return await update.message.reply('Something went wrong!') From faf4e344914191f4ca6ade06c23c5d3eab92ca78 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 15:04:07 +0530 Subject: [PATCH 58/66] Update web_dl.py --- web_dl.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/web_dl.py b/web_dl.py index 54caf38..c96b92d 100644 --- a/web_dl.py +++ b/web_dl.py @@ -1,5 +1,4 @@ import os -import re import requests from urllib.parse import urljoin from bs4 import BeautifulSoup @@ -7,13 +6,14 @@ from concurrent.futures import ThreadPoolExecutor class urlDownloader: - def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, file_size_limit=None, max_retries=3, auth=None): + def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, htmlFlg=False, file_size_limit=None, max_retries=3, auth=None): self.soup = None self.imgFlg = imgFlg self.linkFlg = linkFlg self.scriptFlg = scriptFlg self.videoFlg = videoFlg self.xmlFlg = xmlFlg + self.htmlFlg = htmlFlg self.file_size_limit = file_size_limit self.max_retries = max_retries self.auth = auth @@ -25,7 +25,8 @@ def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xml 'links': 0, 'scripts': 0, 'videos': 0, - 'xmls': 0 + 'xmls': 0, + 'htmls': 0 } def savePage(self, url, pagefolder='page'): @@ -45,10 +46,11 @@ def savePage(self, url, pagefolder='page'): self._soupfindnSave(url, pagefolder, tag2find='video', inner='src', category='videos') if self.xmlFlg: 
self._soupfindnSave(url, pagefolder, tag2find='xml', inner='src', category='xmls') - with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: - file.write(self.soup.prettify('utf-8')) - summary = (f"Downloaded: {self.summary['images']} images, {self.summary['links']} links, " - f"{self.summary['scripts']} scripts, {self.summary['videos']} videos, {self.summary['xmls']} xmls.") + if self.htmlFlg: + with open(os.path.join(pagefolder, 'page.html'), 'wb') as file: + file.write(self.soup.prettify('utf-8')) + self.summary['htmls'] += 1 + summary = self.generate_summary() return True, summary except Exception as e: print(f"> savePage(): Create page error: {str(e)}") @@ -95,3 +97,7 @@ def _download_file(self, url, savepath, category): self.summary[category] += 1 except Exception as e: print(f"> _download_file(): Download error for {url}: {str(e)}") + + def generate_summary(self): + components = [f"{count} {name}" for name, count in self.summary.items() if count > 0] + return f"Downloaded: {', '.join(components)}." From c1b72e63d36cf8e6cbcee6d204af95c3c040e5d4 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 15:11:52 +0530 Subject: [PATCH 59/66] Update main.py --- main.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/main.py b/main.py index 39ea77d..8d20463 100644 --- a/main.py +++ b/main.py @@ -27,8 +27,8 @@ Send any URL, optionally with the components you want to download. For example: 'https://www.google.com img,css,script' -Use /auth username:password to add your authentication credentials. -Use /remove_auth to remove your authentication credentials. +Use /auth website_url username:password to add your authentication credentials. +Use /remove_auth website_url to remove your authentication credentials. Use /view_auth to view your stored authentication credentials. 
""" @@ -50,16 +50,21 @@ async def start(bot, update): @Bot.on_message(filters.command("auth")) async def auth(bot, update): - if len(update.command) != 2 or ':' not in update.command[1]: - return await update.reply_text("Please send your username and password in the format 'username:password'") + if len(update.command) != 3 or ':' not in update.command[2]: + return await update.reply_text("Please send your website URL and credentials in the format 'website_url username:password'") - username, password = update.command[1].split(":", 1) - add_credentials(update.from_user.id, username, password) + website, credentials = update.command[1], update.command[2] + username, password = credentials.split(":", 1) + add_credentials(update.from_user.id, website, username, password) await update.reply_text("Credentials saved successfully.") @Bot.on_message(filters.command("remove_auth")) async def remove_auth(bot, update): - success = remove_credentials(update.from_user.id) + if len(update.command) != 2: + return await update.reply_text("Please send the website URL in the format 'website_url'") + + website = update.command[1] + success = remove_credentials(update.from_user.id, website) if success: await update.reply_text("Credentials removed successfully.") else: @@ -69,7 +74,8 @@ async def remove_auth(bot, update): async def view_auth(bot, update): creds = get_credentials(update.from_user.id) if creds: - await update.reply_text(f"Your credentials:\nUsername: {creds['username']}\nPassword: {creds['password']}") + cred_list = [f"Website: {website}\nUsername: {details['username']}\nPassword: {details['password']}" for website, details in creds.items()] + await update.reply_text("\n\n".join(cred_list)) else: await update.reply_text("No credentials found.") From 07f11010e0bae5ef23f0387948f983e633063779 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 15:13:16 +0530 Subject: [PATCH 60/66] Update auth.py --- auth.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/auth.py b/auth.py index 7f55383..4e9ded9 100644 --- a/auth.py +++ b/auth.py @@ -13,19 +13,21 @@ def save_auth_data(data): with open(AUTH_FILE, 'w') as file: json.dump(data, file, indent=4) -def add_credentials(user_id, username, password): +def add_credentials(user_id, website, username, password): data = load_auth_data() - data[str(user_id)] = {'username': username, 'password': password} + if str(user_id) not in data: + data[str(user_id)] = {} + data[str(user_id)][website] = {'username': username, 'password': password} save_auth_data(data) def get_credentials(user_id): data = load_auth_data() - return data.get(str(user_id)) + return data.get(str(user_id), {}) -def remove_credentials(user_id): +def remove_credentials(user_id, website): data = load_auth_data() - if str(user_id) in data: - del data[str(user_id)] + if str(user_id) in data and website in data[str(user_id)]: + del data[str(user_id)][website] save_auth_data(data) return True return False From a27a4017bc08fc0f6b589f3fa0309214a982a90d Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 15:14:11 +0530 Subject: [PATCH 61/66] Update web_dl.py From bd11e2843b2388dcce9a42469e6083a930d2f1f6 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 16:18:00 +0530 Subject: [PATCH 62/66] Update main.py --- main.py | 21 ++++++++++++++++----- 1 file changed, 16 
insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 8d20463..aa63dc1 100644 --- a/main.py +++ b/main.py @@ -139,9 +139,20 @@ async def callback_query_handler(bot, update: CallbackQuery): def is_valid_url(url): try: - response = requests.head(url, timeout=5) - return response.status_code == 200 - except requests.RequestException: - return False - + headers = {"User-Agent": "Mozilla/5.0"} + response = requests.head(url, headers=headers, timeout=10, allow_redirects=True) + if response.status_code == 200: + return True + print(f"HEAD request failed with status code: {response.status_code}") + print(f"Response headers: {response.headers}") + # Fallback to GET request if HEAD fails + response = requests.get(url, headers=headers, timeout=10, allow_redirects=True) + if response.status_code == 200: + return True + print(f"GET request failed with status code: {response.status_code}") + print(f"Response headers: {response.headers}") + except requests.RequestException as e: + print(f"Request exception: {e}") + return False + Bot.run() From 41dff1a89880dcdddfc25c28ea11c75018fc7b59 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 16:23:36 +0530 Subject: [PATCH 63/66] Update main.py --- main.py | 67 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/main.py b/main.py index aa63dc1..8075859 100644 --- a/main.py +++ b/main.py @@ -5,7 +5,6 @@ from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery from web_dl import urlDownloader from auth import add_credentials, get_credentials, remove_credentials -import asyncio # Bot configuration using environment variables BOT_TOKEN = os.environ.get("BOT_TOKEN") @@ -27,8 +26,8 @@ Send any URL, optionally with the components you want to download. For example: 'https://www.google.com img,css,script' -Use /auth website_url username:password to add your authentication credentials. -Use /remove_auth website_url to remove your authentication credentials. +Use /auth username:password to add your authentication credentials. +Use /remove_auth to remove your authentication credentials. Use /view_auth to view your stored authentication credentials. 
""" @@ -38,7 +37,7 @@ ]] ) -@Bot.on_message(filters.command("start")) +@Bot.on_message(filters.command(["start"])) async def start(bot, update): text = START_TXT.format(update.from_user.mention) reply_markup = START_BTN @@ -48,38 +47,32 @@ async def start(bot, update): reply_markup=reply_markup ) -@Bot.on_message(filters.command("auth")) +@Bot.on_message(filters.command(["auth"])) async def auth(bot, update): - if len(update.command) != 3 or ':' not in update.command[2]: - return await update.reply_text("Please send your website URL and credentials in the format 'website_url username:password'") + if len(update.command) != 2 or ':' not in update.command[1]: + return await update.reply_text("Please send your username and password in the format 'username:password'") - website, credentials = update.command[1], update.command[2] - username, password = credentials.split(":", 1) - add_credentials(update.from_user.id, website, username, password) + username, password = update.command[1].split(":", 1) + add_credentials(update.from_user.id, username, password) await update.reply_text("Credentials saved successfully.") -@Bot.on_message(filters.command("remove_auth")) +@Bot.on_message(filters.command(["remove_auth"])) async def remove_auth(bot, update): - if len(update.command) != 2: - return await update.reply_text("Please send the website URL in the format 'website_url'") - - website = update.command[1] - success = remove_credentials(update.from_user.id, website) + success = remove_credentials(update.from_user.id) if success: await update.reply_text("Credentials removed successfully.") else: await update.reply_text("No credentials found to remove.") -@Bot.on_message(filters.command("view_auth")) +@Bot.on_message(filters.command(["view_auth"])) async def view_auth(bot, update): creds = get_credentials(update.from_user.id) if creds: - cred_list = [f"Website: {website}\nUsername: {details['username']}\nPassword: {details['password']}" for website, details in creds.items()] - await update.reply_text("\n\n".join(cred_list)) + await update.reply_text(f"Your credentials:\nUsername: {creds['username']}\nPassword: {creds['password']}") else: await update.reply_text("No credentials found.") -@Bot.on_message(filters.private & filters.text & ~filters.command(["start", "auth", "remove_auth", "view_auth"])) +@Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth|/remove_auth|/view_auth')) async def webdl(_, m): url = m.text.strip() @@ -93,14 +86,14 @@ async def webdl(_, m): keyboard = InlineKeyboardMarkup( [ [ - InlineKeyboardButton("HTML", callback_data=f"h|{url}"), - InlineKeyboardButton("CSS", callback_data=f"c|{url}"), - InlineKeyboardButton("Images", callback_data=f"i|{url}") + InlineKeyboardButton("HTML", callback_data=f"h|{url[:50]}"), + InlineKeyboardButton("CSS", callback_data=f"c|{url[:50]}"), + InlineKeyboardButton("Images", callback_data=f"i|{url[:50]}") ], [ - InlineKeyboardButton("XML", callback_data=f"x|{url}"), - InlineKeyboardButton("Video", callback_data=f"v|{url}"), - InlineKeyboardButton("JS", callback_data=f"j|{url}") + InlineKeyboardButton("XML", callback_data=f"x|{url[:50]}"), + InlineKeyboardButton("Video", callback_data=f"v|{url[:50]}"), + InlineKeyboardButton("JS", callback_data=f"j|{url[:50]}") ] ] ) @@ -116,24 +109,34 @@ async def callback_query_handler(bot, update: CallbackQuery): scriptFlg = component == 'j' videoFlg = component == 'v' xmlFlg = component == 'x' - htmlFlg = component == 'h' name = dir = str(update.message.chat.id) if not os.path.isdir(dir): 
         os.makedirs(dir)
 
     auth = get_credentials(update.from_user.id)
-    obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, videoFlg=videoFlg, xmlFlg=xmlFlg, htmlFlg=htmlFlg, file_size_limit=10*1024*1024, auth=auth)
+    obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, videoFlg=videoFlg, xmlFlg=xmlFlg, file_size_limit=10*1024*1024, auth=auth)
     res, summary = obj.savePage(url, dir)
     if not res:
         return await update.message.reply('Something went wrong!')
 
     zip_filename = f"{name}.zip"
     shutil.make_archive(name, 'zip', base_dir=dir)
-    await update.message.reply_document(zip_filename, caption=summary)
-    shutil.rmtree(dir)
-    os.remove(zip_filename)
+    try:
+        await update.message.reply_document(zip_filename, caption=summary)
+    except Exception as e:
+        print(f"Failed to send document: {e}")
+
+    try:
+        shutil.rmtree(dir)
+    except Exception as e:
+        print(f"Failed to remove directory {dir}: {e}")
+
+    try:
+        os.remove(zip_filename)
+    except Exception as e:
+        print(f"Failed to remove zip file {zip_filename}: {e}")
 
     print("Download completed successfully!")  # Debug statement
 
@@ -154,5 +157,5 @@ def is_valid_url(url):
     except requests.RequestException as e:
         print(f"Request exception: {e}")
     return False
-    
+
 Bot.run()
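Note: the url[:50] truncation in the buttons above exists because Telegram caps callback_data at 64 bytes, but truncation silently corrupts longer URLs before they reach the callback handler. A hypothetical alternative (none of these names appear in the patch) keeps a short token in the button and the full URL in memory:

    import uuid

    URL_CACHE = {}  # token -> full URL; illustrative in-memory store

    def make_callback_data(component, url):
        token = uuid.uuid4().hex[:16]   # 18 bytes with prefix and pipe, well under 64
        URL_CACHE[token] = url
        return f"{component}|{token}"

    def resolve_callback_data(data):
        component, token = data.split("|", 1)
        return component, URL_CACHE[token]  # KeyError if the bot restarted meanwhile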
From 7fb50aa96510a822fe7cc8fd970a321a9606be53 Mon Sep 17 00:00:00 2001
From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com>
Date: Sat, 25 May 2024 16:24:06 +0530
Subject: [PATCH 64/66] Update web_dl.py

---
 web_dl.py | 87 +++++++++++++++++++++++++------------------------------
 1 file changed, 39 insertions(+), 48 deletions(-)

diff --git a/web_dl.py b/web_dl.py
index c96b92d..31a2bb4 100644
--- a/web_dl.py
+++ b/web_dl.py
@@ -1,19 +1,20 @@
 import os
+import re
 import requests
 from urllib.parse import urljoin
 from bs4 import BeautifulSoup
 from tqdm import tqdm
 from concurrent.futures import ThreadPoolExecutor
 
-class urlDownloader:
-    def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, htmlFlg=False, file_size_limit=None, max_retries=3, auth=None):
+class urlDownloader(object):
+    """Download the webpage components based on the input URL."""
+    def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, file_size_limit=None, max_retries=3, auth=None):
         self.soup = None
         self.imgFlg = imgFlg
         self.linkFlg = linkFlg
         self.scriptFlg = scriptFlg
         self.videoFlg = videoFlg
         self.xmlFlg = xmlFlg
-        self.htmlFlg = htmlFlg
         self.file_size_limit = file_size_limit
         self.max_retries = max_retries
         self.auth = auth
@@ -25,11 +26,11 @@ def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xml
             'links': 0,
             'scripts': 0,
             'videos': 0,
-            'xmls': 0,
-            'htmls': 0
+            'xmls': 0
         }
 
     def savePage(self, url, pagefolder='page'):
+        """Save the web page components based on the input URL and dir name."""
         try:
             response = self.session.get(url, auth=self.auth)
             response.raise_for_status()
@@ -46,58 +47,48 @@ def savePage(self, url, pagefolder='page'):
                 self._soupfindnSave(url, pagefolder, tag2find='video', inner='src', category='videos')
             if self.xmlFlg:
                 self._soupfindnSave(url, pagefolder, tag2find='xml', inner='src', category='xmls')
-            if self.htmlFlg:
-                with open(os.path.join(pagefolder, 'page.html'), 'wb') as file:
-                    file.write(self.soup.prettify('utf-8'))
-                self.summary['htmls'] += 1
-            summary = self.generate_summary()
-            return True, summary
+            with open(os.path.join(pagefolder, 'index.html'), 'w', encoding='utf-8') as f:
+                f.write(self.soup.prettify())
+            summary_text = "\n".join([f"{k}: {v}" for k, v in self.summary.items()])
+            return True, summary_text
         except Exception as e:
-            print(f"> savePage(): Create page error: {str(e)}")
-            return False, str(e)
+            print(f"Error saving page: {e}")
+            return False, ""
 
     def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src', category='images'):
+        """Find and save specific elements in the soup."""
+        tags = self.soup.find_all(tag2find)
+        print(f"Found {len(tags)} {category} tags")  # Debug statement
+        urls = [tag.get(inner) for tag in tags]
+        urls = [urljoin(url, u) for u in urls]
+        urls = list(set(urls))
+        self.summary[category] += len(urls)
         folder = os.path.join(pagefolder, category)
         if not os.path.exists(folder):
            os.mkdir(folder)
         with ThreadPoolExecutor(max_workers=10) as executor:
-            futures = []
-            for tag in self.soup.find_all(tag2find):
-                try:
-                    turl = tag.get(inner)
-                    if turl is None:
-                        continue
-                    turl = turl.split('?')[0]
-                    filename = os.path.basename(turl).strip().replace(" ", "_")
-                    if len(filename) > 25:
-                        filename = filename[-25:]
-                    savepath = os.path.join(folder, filename)
-                    if not turl.startswith("http"):
-                        turl = urljoin(url, turl)
-                    futures.append(executor.submit(self._download_file, turl, savepath, category))
-                except Exception as e:
-                    print(f"> _soupfindnSave(): Inner exception: {str(e)}")
-            for future in tqdm(futures, desc=f"Downloading {category}"):
-                try:
-                    future.result()
-                except Exception as e:
-                    print(f"> _soupfindnSave(): Future exception: {str(e)}")
+            for u in urls:
+                executor.submit(self._savefile, folder, u)
 
-    def _download_file(self, url, savepath, category):
+    def _savefile(self, folder, fileurl):
+        """Save the file content from the URL to the given folder."""
+        if not fileurl:
+            return
+        name = re.sub(r'\W+', '', os.path.basename(fileurl))
+        filename = os.path.join(folder, name)
+        print(f"Downloading {fileurl} to {filename}")  # Debug statement
         try:
-            headers = {"User-Agent": "Mozilla/5.0"}
-            response = self.session.get(url, headers=headers, stream=True, auth=self.auth)
+            response = self.session.get(fileurl, stream=True, auth=self.auth)
             response.raise_for_status()
-            if self.file_size_limit and int(response.headers.get('content-length', 0)) > self.file_size_limit:
-                print(f"Skipping {url} due to size limit.")
+            content_length = response.headers.get('Content-Length')
+            if content_length and self.file_size_limit and int(content_length) > self.file_size_limit:
+                print(f"Skipping {fileurl}, file size {content_length} exceeds limit {self.file_size_limit}")
                 return
-            with open(savepath, 'wb') as file:
-                for chunk in response.iter_content(1024):
-                    file.write(chunk)
-            self.summary[category] += 1
+            with open(filename, 'wb') as f:
+                for chunk in tqdm(response.iter_content(chunk_size=1024)):
+                    if chunk:
+                        f.write(chunk)
         except Exception as e:
-            print(f"> _download_file(): Download error for {url}: {str(e)}")
-
-    def generate_summary(self):
-        components = [f"{count} {name}" for name, count in self.summary.items() if count > 0]
-        return f"Downloaded: {', '.join(components)}."
+            print(f"Error downloading {fileurl}: {e}")
+            if os.path.exists(filename):
+                os.remove(filename)
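Note: in the rewritten _soupfindnSave above, executor.submit() results are never collected, so any exception that escapes _savefile disappears silently. A sketch of one way to surface them; download_all is a hypothetical helper, not part of the patch:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def download_all(downloader, folder, urls, max_workers=10):
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(downloader._savefile, folder, u): u for u in urls}
            for future in as_completed(futures):
                try:
                    future.result()  # re-raises anything _savefile did not catch itself
                except Exception as exc:
                    print(f"{futures[future]} failed: {exc}")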
+ print(f"Error downloading {fileurl}: {e}") + if os.path.exists(filename): + os.remove(filename) From f3d4e0552b96e087bc0ddb1d99d43575c5bd1379 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 16:31:01 +0530 Subject: [PATCH 65/66] Update main.py --- main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 8075859..ed520d5 100644 --- a/main.py +++ b/main.py @@ -109,13 +109,14 @@ async def callback_query_handler(bot, update: CallbackQuery): scriptFlg = component == 'j' videoFlg = component == 'v' xmlFlg = component == 'x' + htmlFlg = component == 'h' # Adding HTML flag here name = dir = str(update.message.chat.id) if not os.path.isdir(dir): os.makedirs(dir) auth = get_credentials(update.from_user.id) - obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, videoFlg=videoFlg, xmlFlg=xmlFlg, file_size_limit=10*1024*1024, auth=auth) + obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg, videoFlg=videoFlg, xmlFlg=xmlFlg, htmlFlg=htmlFlg, file_size_limit=10*1024*1024, auth=auth) res, summary = obj.savePage(url, dir) if not res: return await update.message.reply('Something went wrong!') From 208bd25f918280fb64c68d68857dc5273d05f154 Mon Sep 17 00:00:00 2001 From: Md Matin Ashraf <91468485+Ashrafmdmatin41@users.noreply.github.com> Date: Sat, 25 May 2024 16:31:37 +0530 Subject: [PATCH 66/66] Update web_dl.py --- web_dl.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/web_dl.py b/web_dl.py index 31a2bb4..81445d8 100644 --- a/web_dl.py +++ b/web_dl.py @@ -1,5 +1,6 @@ import os import re +import sys import requests from urllib.parse import urljoin from bs4 import BeautifulSoup @@ -8,26 +9,28 @@ class urlDownloader(object): """Download the webpage components based on the input URL.""" - def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, file_size_limit=None, max_retries=3, auth=None): + def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True, xmlFlg=True, htmlFlg=True, file_size_limit=None, max_retries=3, auth=None): self.soup = None self.imgFlg = imgFlg self.linkFlg = linkFlg self.scriptFlg = scriptFlg self.videoFlg = videoFlg self.xmlFlg = xmlFlg + self.htmlFlg = htmlFlg self.file_size_limit = file_size_limit self.max_retries = max_retries self.auth = auth self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js') self.videoType = ('mp4', 'webm', 'ogg') - self.session = requests.Session() self.summary = { 'images': 0, 'links': 0, 'scripts': 0, 'videos': 0, - 'xmls': 0 + 'xmls': 0, + 'htmls': 0 } + self.session = requests.Session() def savePage(self, url, pagefolder='page'): """Save the web page components based on the input URL and dir name.""" @@ -47,6 +50,8 @@ def savePage(self, url, pagefolder='page'): self._soupfindnSave(url, pagefolder, tag2find='video', inner='src', category='videos') if self.xmlFlg: self._soupfindnSave(url, pagefolder, tag2find='xml', inner='src', category='xmls') + if self.htmlFlg: + self._soupfindnSave(url, pagefolder, tag2find='html', inner='src', category='htmls') with open(os.path.join(pagefolder, 'index.html'), 'w', encoding='utf-8') as f: f.write(self.soup.prettify()) summary_text = "\n".join([f"{k}: {v}" for k, v in self.summary.items()])