Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
86031e1
Update main.py
TelegramBotFather May 24, 2024
a944b10
Update web_dl.py
TelegramBotFather May 24, 2024
7ac4154
Update web_dl.py
TelegramBotFather May 24, 2024
9c0ae8d
Update web_dl.py
TelegramBotFather May 25, 2024
3775d70
Update requirements.txt
TelegramBotFather May 25, 2024
498e3bb
Update web_dl.py
TelegramBotFather May 25, 2024
654276b
Update requirements.txt
TelegramBotFather May 25, 2024
5cee0c3
Update web_dl.py
TelegramBotFather May 25, 2024
fa0e3d9
Update main.py
TelegramBotFather May 25, 2024
1f4f57b
Update main.py
TelegramBotFather May 25, 2024
c7d76a6
Update web_dl.py
TelegramBotFather May 25, 2024
59ebbfc
Update main.py
TelegramBotFather May 25, 2024
d216f9b
Create auth.py
TelegramBotFather May 25, 2024
025cf5d
Update main.py
TelegramBotFather May 25, 2024
bbefcaf
Update main.py
TelegramBotFather May 25, 2024
89e6de7
Update main.py
TelegramBotFather May 25, 2024
8dff917
Update main.py
TelegramBotFather May 25, 2024
8844de0
Update main.py
TelegramBotFather May 25, 2024
5c671bf
Update main.py
TelegramBotFather May 25, 2024
1e52015
Update main.py
TelegramBotFather May 25, 2024
e7fbd27
Update web_dl.py
TelegramBotFather May 25, 2024
b6d509b
Update main.py
TelegramBotFather May 25, 2024
7afa77f
Update web_dl.py
TelegramBotFather May 25, 2024
ca9b363
Update main.py
TelegramBotFather May 25, 2024
7ce16eb
Update web_dl.py
TelegramBotFather May 25, 2024
df858ba
Update main.py
TelegramBotFather May 25, 2024
df90085
Update web_dl.py
TelegramBotFather May 25, 2024
b9bb481
Update main.py
TelegramBotFather May 25, 2024
d8dc8a4
Update main.py
TelegramBotFather May 25, 2024
8bf8a40
Update main.py
TelegramBotFather May 25, 2024
8eeaa05
Update web_dl.py
TelegramBotFather May 25, 2024
2b8ec67
Update main.py
TelegramBotFather May 25, 2024
87ac101
Update web_dl.py
TelegramBotFather May 25, 2024
a4aa730
Update main.py
TelegramBotFather May 25, 2024
0e3df0c
Update main.py
TelegramBotFather May 25, 2024
a5d3f2f
Update main.py
TelegramBotFather May 25, 2024
dc1de97
Update main.py
TelegramBotFather May 25, 2024
630622a
Update main.py
TelegramBotFather May 25, 2024
69337d1
Update web_dl.py
TelegramBotFather May 25, 2024
efe5cd8
Update web_dl.py
TelegramBotFather May 25, 2024
3ad6480
Update main.py
TelegramBotFather May 25, 2024
67c0ea3
Update main.py
TelegramBotFather May 25, 2024
32a0f9b
Update main.py
TelegramBotFather May 25, 2024
7fdf7eb
Update main.py
TelegramBotFather May 25, 2024
0ecca27
Update web_dl.py
TelegramBotFather May 25, 2024
3bb4487
Update main.py
TelegramBotFather May 25, 2024
26ed465
Update web_dl.py
TelegramBotFather May 25, 2024
6d5e14f
Update web_dl.py
TelegramBotFather May 25, 2024
12e655c
Update main.py
TelegramBotFather May 25, 2024
63476d3
Update main.py
TelegramBotFather May 25, 2024
2b14828
Update web_dl.py
TelegramBotFather May 25, 2024
cb6ac1b
Update auth.py
TelegramBotFather May 25, 2024
61c55d7
Update main.py
TelegramBotFather May 25, 2024
5dac34c
Update web_dl.py
TelegramBotFather May 25, 2024
84fd81f
Update web_dl.py
TelegramBotFather May 25, 2024
c8e4b7f
Update main.py
TelegramBotFather May 25, 2024
59df0d5
Update main.py
TelegramBotFather May 25, 2024
faf4e34
Update web_dl.py
TelegramBotFather May 25, 2024
c1b72e6
Update main.py
TelegramBotFather May 25, 2024
07f1101
Update auth.py
TelegramBotFather May 25, 2024
a27a401
Update web_dl.py
TelegramBotFather May 25, 2024
bd11e28
Update main.py
TelegramBotFather May 25, 2024
41dff1a
Update main.py
TelegramBotFather May 25, 2024
7fb50aa
Update web_dl.py
TelegramBotFather May 25, 2024
f3d4e05
Update main.py
TelegramBotFather May 25, 2024
208bd25
Update web_dl.py
TelegramBotFather May 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions auth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
import json

AUTH_FILE = 'auth.json'


def load_auth_data():
    """Return the on-disk credential store as a dict, or {} if absent/unreadable.

    The original implementation crashed with JSONDecodeError when auth.json
    existed but was empty or corrupt; treat that as an empty store instead.
    """
    if os.path.exists(AUTH_FILE):
        try:
            with open(AUTH_FILE, 'r') as file:
                return json.load(file)
        except (ValueError, OSError) as exc:
            # A corrupt/unreadable store should not crash the bot; start fresh.
            print(f"Warning: could not read {AUTH_FILE}: {exc}")
    return {}


def save_auth_data(data):
    """Persist the credential store as pretty-printed JSON."""
    with open(AUTH_FILE, 'w') as file:
        json.dump(data, file, indent=4)


def add_credentials(user_id, website, username, password):
    """Store username/password for one website under the given user id.

    NOTE(review): credentials are written to disk as plaintext JSON —
    consider encrypting them or at least restricting file permissions.
    """
    data = load_auth_data()
    user_key = str(user_id)
    data.setdefault(user_key, {})[website] = {'username': username, 'password': password}
    save_auth_data(data)


def get_credentials(user_id):
    """Return {website: {'username': ..., 'password': ...}} for the user ({} if none)."""
    return load_auth_data().get(str(user_id), {})


def remove_credentials(user_id, website):
    """Delete one website's credentials; return True iff something was removed."""
    data = load_auth_data()
    user_key = str(user_id)
    if user_key in data and website in data[user_key]:
        del data[user_key][website]
        if not data[user_key]:
            # Drop the now-empty per-user dict instead of leaving a stale key.
            del data[user_key]
        save_auth_data(data)
        return True
    return False
147 changes: 118 additions & 29 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,41 @@
import os
import shutil
from web_dl import urlDownloader
import requests
from pyrogram import Client, filters
from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton
from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
from web_dl import urlDownloader
from auth import add_credentials, get_credentials, remove_credentials

# Bot configuration using environment variables
BOT_TOKEN = os.environ.get("BOT_TOKEN")
API_ID = os.environ.get("API_ID")
API_HASH = os.environ.get("API_HASH")

# Pyrogram client for the bot; credentials come from environment variables.
# (The scraped diff left duplicate old-style `bot_token = ` kwargs and a
# duplicated bracket pair in START_BTN; this is the reconstructed new side.)
Bot = Client(
    "WebDL-Bot",
    bot_token=BOT_TOKEN,
    api_id=API_ID,
    api_hash=API_HASH
)

# /start greeting; {} is filled with the user's mention by the start handler.
START_TXT = """
Hi {}, I am Web Downloader Bot.

I can download all the components (.html, .css, img, xml, video, javascript..) from URLs.

Send any URL, optionally with the components you want to download. For example:
'https://www.google.com img,css,script'

Use /auth username:password to add your authentication credentials.
Use /remove_auth to remove your authentication credentials.
Use /view_auth to view your stored authentication credentials.
"""

# Single inline button pointing at the project source.
START_BTN = InlineKeyboardMarkup(
    [[
        InlineKeyboardButton('Source Code', url='https://github.com/samadii/WebDownloaderBot'),
    ]]
)

@Bot.on_message(filters.command(["start"]))
async def start(bot, update):
Expand All @@ -41,33 +47,116 @@ async def start(bot, update):
reply_markup=reply_markup
)




@Bot.on_message(filters.private & filters.text & ~filters.regex('/start'))
@Bot.on_message(filters.command(["auth"]))
async def auth(bot, update):
    """Handle /auth username:password — store HTTP credentials for this user.

    Bug fix: the original called add_credentials(user_id, username, password),
    but add_credentials takes (user_id, website, username, password), so every
    /auth raised TypeError. Credentials are now stored under the 'default'
    website key, matching that signature.
    """
    if len(update.command) != 2 or ':' not in update.command[1]:
        return await update.reply_text("Please send your username and password in the format 'username:password'")

    # Split only on the first ':' so passwords containing ':' survive intact.
    username, password = update.command[1].split(":", 1)
    add_credentials(update.from_user.id, 'default', username, password)
    await update.reply_text("Credentials saved successfully.")

@Bot.on_message(filters.command(["remove_auth"]))
async def remove_auth(bot, update):
    """Handle /remove_auth — delete all credentials stored for this user.

    Bug fix: the original called remove_credentials(user_id) without the
    required 'website' argument, raising TypeError. Since the command carries
    no website, remove every stored website for the user.
    """
    stored = get_credentials(update.from_user.id)
    removed = False
    for website in list(stored):
        removed = remove_credentials(update.from_user.id, website) or removed
    if removed:
        await update.reply_text("Credentials removed successfully.")
    else:
        await update.reply_text("No credentials found to remove.")

@Bot.on_message(filters.command(["view_auth"]))
async def view_auth(bot, update):
    """Handle /view_auth — show the user's stored credentials.

    Bug fix: get_credentials returns {website: {'username', 'password'}}, but
    the original indexed creds['username'] directly, raising KeyError whenever
    credentials existed. Iterate the per-website entries instead.

    NOTE(review): this echoes passwords back in plaintext chat messages —
    consider masking the password.
    """
    creds = get_credentials(update.from_user.id)
    if creds:
        lines = [
            f"{site}: username={info.get('username')}, password={info.get('password')}"
            for site, info in creds.items()
        ]
        await update.reply_text("Your credentials:\n" + "\n".join(lines))
    else:
        await update.reply_text("No credentials found.")

@Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth|/remove_auth|/view_auth'))
async def webdl(_, m):
    """Entry point for plain-text private messages: treat the text as a URL,
    validate it, and offer inline buttons to pick which component to download.
    """
    url = m.text.strip()

    if not url.startswith('http'):
        return await m.reply("The URL must start with 'http' or 'https'")

    # NOTE(review): is_valid_url performs blocking `requests` calls inside an
    # async handler, which stalls the event loop while the remote host
    # responds — consider offloading via asyncio.to_thread.
    if not is_valid_url(url):
        return await m.reply("The URL is invalid or inaccessible")

    # Each button encodes '<component-letter>|<url>' in callback_data,
    # presumably truncated to fit Telegram's callback_data size limit.
    # NOTE(review): url[:50] silently corrupts URLs longer than 50 chars — the
    # callback handler will then download the truncated URL. A token->URL
    # lookup table would be safer, but needs a matching change in
    # callback_query_handler.
    keyboard = InlineKeyboardMarkup(
        [
            [
                InlineKeyboardButton("HTML", callback_data=f"h|{url[:50]}"),
                InlineKeyboardButton("CSS", callback_data=f"c|{url[:50]}"),
                InlineKeyboardButton("Images", callback_data=f"i|{url[:50]}")
            ],
            [
                InlineKeyboardButton("XML", callback_data=f"x|{url[:50]}"),
                InlineKeyboardButton("Video", callback_data=f"v|{url[:50]}"),
                InlineKeyboardButton("JS", callback_data=f"j|{url[:50]}")
            ]
        ]
    )
    await m.reply("Please select which components to download:", reply_markup=keyboard)

if not m.text.startswith('http'):
return await m.reply("the URL must start with 'http' or 'https'")
@Bot.on_callback_query()
async def callback_query_handler(bot, update: CallbackQuery):
    """Download the selected component set for the URL in the callback data.

    callback_data layout: '<component>|<url>' where component is one of
    h/c/i/x/v/j (html / css / images / xml / video / js), as produced by webdl.

    Fixes relative to the original: removed leftover diff residue from the old
    message handler, renamed `dir` (shadowed the builtin), cleaned up the
    working directory even when savePage fails, and converted the stored
    credential dict into the (user, password) tuple that requests' auth=
    parameter expects.
    """
    component, url = update.data.split('|', 1)

    imgFlg = component == 'i'
    linkFlg = component == 'c'
    scriptFlg = component == 'j'
    videoFlg = component == 'v'
    xmlFlg = component == 'x'
    htmlFlg = component == 'h'

    # One working directory per chat; the same name is reused for the zip.
    name = workdir = str(update.message.chat.id)
    os.makedirs(workdir, exist_ok=True)

    # get_credentials returns {website: {'username', 'password'}}; requests'
    # auth= expects a (user, password) tuple, so use the first stored entry.
    stored = get_credentials(update.from_user.id)
    auth = None
    if stored:
        entry = next(iter(stored.values()))
        auth = (entry.get('username'), entry.get('password'))

    obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg,
                        videoFlg=videoFlg, xmlFlg=xmlFlg, htmlFlg=htmlFlg,
                        file_size_limit=10 * 1024 * 1024, auth=auth)
    try:
        res, summary = obj.savePage(url, workdir)
        if not res:
            return await update.message.reply('Something went wrong!')

        zip_filename = f"{name}.zip"
        shutil.make_archive(name, 'zip', base_dir=workdir)
        try:
            await update.message.reply_document(zip_filename, caption=summary)
        except Exception as e:
            print(f"Failed to send document: {e}")
        try:
            os.remove(zip_filename)
        except OSError as e:
            print(f"Failed to remove zip file {zip_filename}: {e}")
    finally:
        # Always remove the per-chat working directory, even on failure.
        shutil.rmtree(workdir, ignore_errors=True)

def is_valid_url(url):
    """Return True if the URL answers with a successful HTTP status.

    Tries a cheap HEAD first; many servers reject HEAD (403/405), so fall back
    to a GET. Improvements over the original: accept any 2xx status via
    Response.ok instead of only exactly 200, and stream the GET fallback so the
    response body is never downloaded just to inspect the status line.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
        if response.ok:
            return True
        print(f"HEAD request failed with status code: {response.status_code}")
        # stream=True defers the body; close() releases the connection without
        # reading it.
        response = requests.get(url, headers=headers, timeout=10,
                                allow_redirects=True, stream=True)
        ok = response.ok
        response.close()
        if ok:
            return True
        print(f"GET request failed with status code: {response.status_code}")
    except requests.RequestException as e:
        print(f"Request exception: {e}")
    return False

Bot.run()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ requests
lxml
urllib3
bs4
tqdm
136 changes: 83 additions & 53 deletions web_dl.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,99 @@
import os, sys
import os
import re
import sys
import requests
from urllib.parse import urljoin, urlparse
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor


#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
class urlDownloader(object):
    """Download the components of a web page (images, css, scripts, videos, ...)."""

    def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True,
                 xmlFlg=True, htmlFlg=True, file_size_limit=None, max_retries=3,
                 auth=None):
        """Configure which asset categories to fetch.

        Args:
            imgFlg/linkFlg/scriptFlg/videoFlg/xmlFlg/htmlFlg: per-category toggles.
            file_size_limit: skip downloads whose Content-Length exceeds this
                many bytes (None = unlimited).
            max_retries: kept for API compatibility; not used by the current
                download path.
            auth: forwarded to requests — expects a (user, password) tuple.

        Note: the scraped diff contained a duplicate, conflicting old-style
        linkType assignment; only the current one is kept here.
        """
        self.soup = None
        self.imgFlg = imgFlg
        self.linkFlg = linkFlg
        self.scriptFlg = scriptFlg
        self.videoFlg = videoFlg
        self.xmlFlg = xmlFlg
        self.htmlFlg = htmlFlg
        self.file_size_limit = file_size_limit
        self.max_retries = max_retries
        self.auth = auth
        # Extensions recognized on <link> hrefs and direct asset links.
        self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js')
        self.videoType = ('mp4', 'webm', 'ogg')
        # Per-category counters reported back to the user after a crawl.
        self.summary = {
            'images': 0,
            'links': 0,
            'scripts': 0,
            'videos': 0,
            'xmls': 0,
            'htmls': 0,
        }
        self.session = requests.Session()

#-----------------------------------------------------------------------------

def savePage(self, url, pagefolder='page'):
    """Fetch *url* and save the enabled component categories under *pagefolder*.

    Returns:
        (True, summary_text) on success, where summary_text lists per-category
        counts; (False, "") on any failure.

    Reconstructed from garbled diff output that interleaved the old and new
    implementations (both the un-authenticated GET and the page.html write
    were still present alongside their replacements).
    """
    try:
        response = self.session.get(url, auth=self.auth)
        response.raise_for_status()
        self.soup = BeautifulSoup(response.text, features="lxml")
        os.makedirs(pagefolder, exist_ok=True)
        # (tag, attribute, category, enabled) table drives the crawl.
        jobs = (
            ('img', 'src', 'images', self.imgFlg),
            ('link', 'href', 'links', self.linkFlg),
            ('script', 'src', 'scripts', self.scriptFlg),
            ('video', 'src', 'videos', self.videoFlg),
            ('xml', 'src', 'xmls', self.xmlFlg),
            ('html', 'src', 'htmls', self.htmlFlg),
        )
        for tag, attr, category, enabled in jobs:
            if enabled:
                self._soupfindnSave(url, pagefolder, tag2find=tag, inner=attr, category=category)
        # Write the page itself last.
        with open(os.path.join(pagefolder, 'index.html'), 'w', encoding='utf-8') as f:
            f.write(self.soup.prettify())
        summary_text = "\n".join(f"{k}: {v}" for k, v in self.summary.items())
        return True, summary_text
    except Exception as e:
        print(f"Error saving page: {e}")
        return False, ""

#-----------------------------------------------------------------------------
def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'):
""" Saves on specified pagefolder all tag2find objects. """
pagefolder = os.path.join(pagefolder, tag2find)
if not os.path.exists(pagefolder): os.mkdir(pagefolder)
for res in self.soup.findAll(tag2find): # images, css, etc..
try:
if not res.has_attr(inner): continue # check if inner tag (file object) exists
# clean special chars such as '@, # ? <>'
filename = re.sub('\W+', '.', os.path.basename(res[inner]))
# print("> filename:", filename)
# Added the '.html' for the html file in the href
if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)):
filename += '.html'
fileurl = urljoin(url, res.get(inner))
filepath = os.path.join(pagefolder, filename)
# rename html ref so can move html and folder of files anywhere
res[inner] = os.path.join(os.path.basename(pagefolder), filename)
# create the file.
if not os.path.isfile(filepath):
with open(filepath, 'wb') as file:
filebin = self.session.get(fileurl)
if len(filebin.content) > 0: # filter the empty file(imge not found)
file.write(filebin.content)
except Exception as exc:
print(exc, file=sys.stderr)
def _soupfindnSave(self, url, pagefolder='page', tag2find='img', inner='src', category='images'):
    """Collect every *tag2find* element's *inner* URL and download the unique
    set into pagefolder/category using a thread pool.

    Bug fix: tags lacking the attribute yielded None, and urljoin(base, None)
    returns the base URL itself (urljoin returns base for a falsy url), so the
    page was spuriously re-downloaded into every category and the summary
    counts were inflated. Missing attributes are now filtered out first.
    """
    tags = self.soup.find_all(tag2find)
    print(f"Found {len(tags)} {category} tags")
    raw_urls = (tag.get(inner) for tag in tags)
    # De-duplicate; sorted() makes the download order deterministic.
    urls = sorted({urljoin(url, u) for u in raw_urls if u})
    self.summary[category] += len(urls)
    folder = os.path.join(pagefolder, category)
    os.makedirs(folder, exist_ok=True)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for u in urls:
            executor.submit(self._savefile, folder, u)

def _savefile(self, folder, fileurl):
    """Stream one file from *fileurl* into *folder*, honoring file_size_limit.

    Fixes relative to the original: the debug print contained a garbled
    "(unknown)" placeholder instead of the target filename; the name was built
    with re.sub(r'\\W+', '') which stripped '.' so extensions were lost
    ('a.css' -> 'acss') and distinct assets could collide; query strings are
    now cut off before deriving the name; and the per-chunk tqdm bar (which
    produced interleaved noise from 10 worker threads) was removed.
    """
    if not fileurl:
        return
    # Drop query/fragment, then keep word chars plus dots and dashes so the
    # file extension survives.
    basename = os.path.basename(fileurl.split('?', 1)[0].split('#', 1)[0])
    name = re.sub(r'[^\w.-]+', '', basename) or 'file'
    filename = os.path.join(folder, name)
    print(f"Downloading {fileurl} to {filename}")
    try:
        response = self.session.get(fileurl, stream=True, auth=self.auth)
        response.raise_for_status()
        content_length = response.headers.get('Content-Length')
        if content_length and self.file_size_limit and int(content_length) > self.file_size_limit:
            print(f"Skipping {fileurl}, file size {content_length} exceeds limit {self.file_size_limit}")
            return
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    except Exception as e:
        print(f"Error downloading {fileurl}: {e}")
        # Remove a partially-written file so later runs don't see truncated data.
        if os.path.exists(filename):
            os.remove(filename)