Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
86031e1
Update main.py
TelegramBotFather May 24, 2024
a944b10
Update web_dl.py
TelegramBotFather May 24, 2024
7ac4154
Update web_dl.py
TelegramBotFather May 24, 2024
9c0ae8d
Update web_dl.py
TelegramBotFather May 25, 2024
3775d70
Update requirements.txt
TelegramBotFather May 25, 2024
498e3bb
Update web_dl.py
TelegramBotFather May 25, 2024
654276b
Update requirements.txt
TelegramBotFather May 25, 2024
5cee0c3
Update web_dl.py
TelegramBotFather May 25, 2024
fa0e3d9
Update main.py
TelegramBotFather May 25, 2024
1f4f57b
Update main.py
TelegramBotFather May 25, 2024
c7d76a6
Update web_dl.py
TelegramBotFather May 25, 2024
59ebbfc
Update main.py
TelegramBotFather May 25, 2024
d216f9b
Create auth.py
TelegramBotFather May 25, 2024
025cf5d
Update main.py
TelegramBotFather May 25, 2024
bbefcaf
Update main.py
TelegramBotFather May 25, 2024
89e6de7
Update main.py
TelegramBotFather May 25, 2024
8dff917
Update main.py
TelegramBotFather May 25, 2024
8844de0
Update main.py
TelegramBotFather May 25, 2024
5c671bf
Update main.py
TelegramBotFather May 25, 2024
1e52015
Update main.py
TelegramBotFather May 25, 2024
e7fbd27
Update web_dl.py
TelegramBotFather May 25, 2024
b6d509b
Update main.py
TelegramBotFather May 25, 2024
7afa77f
Update web_dl.py
TelegramBotFather May 25, 2024
ca9b363
Update main.py
TelegramBotFather May 25, 2024
7ce16eb
Update web_dl.py
TelegramBotFather May 25, 2024
df858ba
Update main.py
TelegramBotFather May 25, 2024
df90085
Update web_dl.py
TelegramBotFather May 25, 2024
b9bb481
Update main.py
TelegramBotFather May 25, 2024
d8dc8a4
Update main.py
TelegramBotFather May 25, 2024
8bf8a40
Update main.py
TelegramBotFather May 25, 2024
8eeaa05
Update web_dl.py
TelegramBotFather May 25, 2024
2b8ec67
Update main.py
TelegramBotFather May 25, 2024
87ac101
Update web_dl.py
TelegramBotFather May 25, 2024
a4aa730
Update main.py
TelegramBotFather May 25, 2024
0e3df0c
Update main.py
TelegramBotFather May 25, 2024
a5d3f2f
Update main.py
TelegramBotFather May 25, 2024
dc1de97
Update main.py
TelegramBotFather May 25, 2024
630622a
Update main.py
TelegramBotFather May 25, 2024
69337d1
Update web_dl.py
TelegramBotFather May 25, 2024
efe5cd8
Update web_dl.py
TelegramBotFather May 25, 2024
3ad6480
Update main.py
TelegramBotFather May 25, 2024
67c0ea3
Update main.py
TelegramBotFather May 25, 2024
32a0f9b
Update main.py
TelegramBotFather May 25, 2024
7fdf7eb
Update main.py
TelegramBotFather May 25, 2024
0ecca27
Update web_dl.py
TelegramBotFather May 25, 2024
3bb4487
Update main.py
TelegramBotFather May 25, 2024
26ed465
Update web_dl.py
TelegramBotFather May 25, 2024
6d5e14f
Update web_dl.py
TelegramBotFather May 25, 2024
12e655c
Update main.py
TelegramBotFather May 25, 2024
63476d3
Update main.py
TelegramBotFather May 25, 2024
2b14828
Update web_dl.py
TelegramBotFather May 25, 2024
cb6ac1b
Update auth.py
TelegramBotFather May 25, 2024
61c55d7
Update main.py
TelegramBotFather May 25, 2024
5dac34c
Update web_dl.py
TelegramBotFather May 25, 2024
84fd81f
Update web_dl.py
TelegramBotFather May 25, 2024
c8e4b7f
Update main.py
TelegramBotFather May 25, 2024
59df0d5
Update main.py
TelegramBotFather May 25, 2024
faf4e34
Update web_dl.py
TelegramBotFather May 25, 2024
c1b72e6
Update main.py
TelegramBotFather May 25, 2024
07f1101
Update auth.py
TelegramBotFather May 25, 2024
a27a401
Update web_dl.py
TelegramBotFather May 25, 2024
bd11e28
Update main.py
TelegramBotFather May 25, 2024
41dff1a
Update main.py
TelegramBotFather May 25, 2024
7fb50aa
Update web_dl.py
TelegramBotFather May 25, 2024
f3d4e05
Update main.py
TelegramBotFather May 25, 2024
208bd25
Update web_dl.py
TelegramBotFather May 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions auth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
import json

AUTH_FILE = 'auth.json'


def load_auth_data():
    """Return the on-disk credential store as a dict, or {} if absent/unreadable.

    The original implementation crashed with JSONDecodeError when auth.json
    existed but was empty or corrupt; treat that as an empty store instead.
    """
    if os.path.exists(AUTH_FILE):
        try:
            with open(AUTH_FILE, 'r') as file:
                return json.load(file)
        except (ValueError, OSError) as exc:
            # A corrupt/unreadable store should not crash the bot; start fresh.
            print(f"Warning: could not read {AUTH_FILE}: {exc}")
    return {}


def save_auth_data(data):
    """Persist the credential store as pretty-printed JSON."""
    with open(AUTH_FILE, 'w') as file:
        json.dump(data, file, indent=4)


def add_credentials(user_id, website, username, password):
    """Store username/password for one website under the given user id.

    NOTE(review): credentials are written to disk as plaintext JSON —
    consider encrypting them or at least restricting file permissions.
    """
    data = load_auth_data()
    user_key = str(user_id)
    data.setdefault(user_key, {})[website] = {'username': username, 'password': password}
    save_auth_data(data)


def get_credentials(user_id):
    """Return {website: {'username': ..., 'password': ...}} for the user ({} if none)."""
    return load_auth_data().get(str(user_id), {})


def remove_credentials(user_id, website):
    """Delete one website's credentials; return True iff something was removed."""
    data = load_auth_data()
    user_key = str(user_id)
    if user_key in data and website in data[user_key]:
        del data[user_key][website]
        if not data[user_key]:
            # Drop the now-empty per-user dict instead of leaving a stale key.
            del data[user_key]
        save_auth_data(data)
        return True
    return False
147 changes: 118 additions & 29 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,41 @@
import os
import shutil
from web_dl import urlDownloader
import requests
from pyrogram import Client, filters
from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton
from pyrogram.types import InlineKeyboardMarkup, InlineKeyboardButton, CallbackQuery
from web_dl import urlDownloader
from auth import add_credentials, get_credentials, remove_credentials

# Bot configuration using environment variables
BOT_TOKEN = os.environ.get("BOT_TOKEN")
API_ID = os.environ.get("API_ID")
API_HASH = os.environ.get("API_HASH")

# Pyrogram client for the bot; credentials come from environment variables.
# (The scraped diff left duplicate old-style `bot_token = ` kwargs and a
# duplicated bracket pair in START_BTN; this is the reconstructed new side.)
Bot = Client(
    "WebDL-Bot",
    bot_token=BOT_TOKEN,
    api_id=API_ID,
    api_hash=API_HASH
)

# /start greeting; {} is filled with the user's mention by the start handler.
START_TXT = """
Hi {}, I am Web Downloader Bot.

I can download all the components (.html, .css, img, xml, video, javascript..) from URLs.

Send any URL, optionally with the components you want to download. For example:
'https://www.google.com img,css,script'

Use /auth username:password to add your authentication credentials.
Use /remove_auth to remove your authentication credentials.
Use /view_auth to view your stored authentication credentials.
"""

# Single inline button pointing at the project source.
START_BTN = InlineKeyboardMarkup(
    [[
        InlineKeyboardButton('Source Code', url='https://github.com/samadii/WebDownloaderBot'),
    ]]
)

@Bot.on_message(filters.command(["start"]))
async def start(bot, update):
Expand All @@ -41,33 +47,116 @@ async def start(bot, update):
reply_markup=reply_markup
)




@Bot.on_message(filters.private & filters.text & ~filters.regex('/start'))
@Bot.on_message(filters.command(["auth"]))
async def auth(bot, update):
    """Handle /auth username:password — store HTTP credentials for this user.

    Bug fix: the original called add_credentials(user_id, username, password),
    but add_credentials takes (user_id, website, username, password), so every
    /auth raised TypeError. Credentials are now stored under the 'default'
    website key, matching that signature.
    """
    if len(update.command) != 2 or ':' not in update.command[1]:
        return await update.reply_text("Please send your username and password in the format 'username:password'")

    # Split only on the first ':' so passwords containing ':' survive intact.
    username, password = update.command[1].split(":", 1)
    add_credentials(update.from_user.id, 'default', username, password)
    await update.reply_text("Credentials saved successfully.")

@Bot.on_message(filters.command(["remove_auth"]))
async def remove_auth(bot, update):
    """Handle /remove_auth — delete all credentials stored for this user.

    Bug fix: the original called remove_credentials(user_id) without the
    required 'website' argument, raising TypeError. Since the command carries
    no website, remove every stored website for the user.
    """
    stored = get_credentials(update.from_user.id)
    removed = False
    for website in list(stored):
        removed = remove_credentials(update.from_user.id, website) or removed
    if removed:
        await update.reply_text("Credentials removed successfully.")
    else:
        await update.reply_text("No credentials found to remove.")

@Bot.on_message(filters.command(["view_auth"]))
async def view_auth(bot, update):
    """Handle /view_auth — show the user's stored credentials.

    Bug fix: get_credentials returns {website: {'username', 'password'}}, but
    the original indexed creds['username'] directly, raising KeyError whenever
    credentials existed. Iterate the per-website entries instead.

    NOTE(review): this echoes passwords back in plaintext chat messages —
    consider masking the password.
    """
    creds = get_credentials(update.from_user.id)
    if creds:
        lines = [
            f"{site}: username={info.get('username')}, password={info.get('password')}"
            for site, info in creds.items()
        ]
        await update.reply_text("Your credentials:\n" + "\n".join(lines))
    else:
        await update.reply_text("No credentials found.")

@Bot.on_message(filters.private & filters.text & ~filters.regex('/start|/auth|/remove_auth|/view_auth'))
async def webdl(_, m):
    """Entry point for plain-text private messages: treat the text as a URL,
    validate it, and offer inline buttons to pick which component to download.
    """
    url = m.text.strip()

    if not url.startswith('http'):
        return await m.reply("The URL must start with 'http' or 'https'")

    # NOTE(review): is_valid_url performs blocking `requests` calls inside an
    # async handler, which stalls the event loop while the remote host
    # responds — consider offloading via asyncio.to_thread.
    if not is_valid_url(url):
        return await m.reply("The URL is invalid or inaccessible")

    # Each button encodes '<component-letter>|<url>' in callback_data,
    # presumably truncated to fit Telegram's callback_data size limit.
    # NOTE(review): url[:50] silently corrupts URLs longer than 50 chars — the
    # callback handler will then download the truncated URL. A token->URL
    # lookup table would be safer, but needs a matching change in
    # callback_query_handler.
    keyboard = InlineKeyboardMarkup(
        [
            [
                InlineKeyboardButton("HTML", callback_data=f"h|{url[:50]}"),
                InlineKeyboardButton("CSS", callback_data=f"c|{url[:50]}"),
                InlineKeyboardButton("Images", callback_data=f"i|{url[:50]}")
            ],
            [
                InlineKeyboardButton("XML", callback_data=f"x|{url[:50]}"),
                InlineKeyboardButton("Video", callback_data=f"v|{url[:50]}"),
                InlineKeyboardButton("JS", callback_data=f"j|{url[:50]}")
            ]
        ]
    )
    await m.reply("Please select which components to download:", reply_markup=keyboard)

if not m.text.startswith('http'):
return await m.reply("the URL must start with 'http' or 'https'")
@Bot.on_callback_query()
async def callback_query_handler(bot, update: CallbackQuery):
    """Download the selected component set for the URL in the callback data.

    callback_data layout: '<component>|<url>' where component is one of
    h/c/i/x/v/j (html / css / images / xml / video / js), as produced by webdl.

    Fixes relative to the original: removed leftover diff residue from the old
    message handler, renamed `dir` (shadowed the builtin), cleaned up the
    working directory even when savePage fails, and converted the stored
    credential dict into the (user, password) tuple that requests' auth=
    parameter expects.
    """
    component, url = update.data.split('|', 1)

    imgFlg = component == 'i'
    linkFlg = component == 'c'
    scriptFlg = component == 'j'
    videoFlg = component == 'v'
    xmlFlg = component == 'x'
    htmlFlg = component == 'h'

    # One working directory per chat; the same name is reused for the zip.
    name = workdir = str(update.message.chat.id)
    os.makedirs(workdir, exist_ok=True)

    # get_credentials returns {website: {'username', 'password'}}; requests'
    # auth= expects a (user, password) tuple, so use the first stored entry.
    stored = get_credentials(update.from_user.id)
    auth = None
    if stored:
        entry = next(iter(stored.values()))
        auth = (entry.get('username'), entry.get('password'))

    obj = urlDownloader(imgFlg=imgFlg, linkFlg=linkFlg, scriptFlg=scriptFlg,
                        videoFlg=videoFlg, xmlFlg=xmlFlg, htmlFlg=htmlFlg,
                        file_size_limit=10 * 1024 * 1024, auth=auth)
    try:
        res, summary = obj.savePage(url, workdir)
        if not res:
            return await update.message.reply('Something went wrong!')

        zip_filename = f"{name}.zip"
        shutil.make_archive(name, 'zip', base_dir=workdir)
        try:
            await update.message.reply_document(zip_filename, caption=summary)
        except Exception as e:
            print(f"Failed to send document: {e}")
        try:
            os.remove(zip_filename)
        except OSError as e:
            print(f"Failed to remove zip file {zip_filename}: {e}")
    finally:
        # Always remove the per-chat working directory, even on failure.
        shutil.rmtree(workdir, ignore_errors=True)

def is_valid_url(url):
    """Return True if the URL answers with a successful HTTP status.

    Tries a cheap HEAD first; many servers reject HEAD (403/405), so fall back
    to a GET. Improvements over the original: accept any 2xx status via
    Response.ok instead of only exactly 200, and stream the GET fallback so the
    response body is never downloaded just to inspect the status line.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
        if response.ok:
            return True
        print(f"HEAD request failed with status code: {response.status_code}")
        # stream=True defers the body; close() releases the connection without
        # reading it.
        response = requests.get(url, headers=headers, timeout=10,
                                allow_redirects=True, stream=True)
        ok = response.ok
        response.close()
        if ok:
            return True
        print(f"GET request failed with status code: {response.status_code}")
    except requests.RequestException as e:
        print(f"Request exception: {e}")
    return False

Bot.run()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ requests
lxml
urllib3
bs4
tqdm
136 changes: 83 additions & 53 deletions web_dl.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,99 @@
import os, sys
import os
import re
import sys
import requests
from urllib.parse import urljoin, urlparse
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor


#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
class urlDownloader(object):
    """Download the components of a web page (images, css, scripts, videos, ...)."""

    def __init__(self, imgFlg=True, linkFlg=True, scriptFlg=True, videoFlg=True,
                 xmlFlg=True, htmlFlg=True, file_size_limit=None, max_retries=3,
                 auth=None):
        """Configure which asset categories to fetch.

        Args:
            imgFlg/linkFlg/scriptFlg/videoFlg/xmlFlg/htmlFlg: per-category toggles.
            file_size_limit: skip downloads whose Content-Length exceeds this
                many bytes (None = unlimited).
            max_retries: kept for API compatibility; not used by the current
                download path.
            auth: forwarded to requests — expects a (user, password) tuple.

        Note: the scraped diff contained a duplicate, conflicting old-style
        linkType assignment; only the current one is kept here.
        """
        self.soup = None
        self.imgFlg = imgFlg
        self.linkFlg = linkFlg
        self.scriptFlg = scriptFlg
        self.videoFlg = videoFlg
        self.xmlFlg = xmlFlg
        self.htmlFlg = htmlFlg
        self.file_size_limit = file_size_limit
        self.max_retries = max_retries
        self.auth = auth
        # Extensions recognized on <link> hrefs and direct asset links.
        self.linkType = ('css', 'png', 'ico', 'jpg', 'jpeg', 'mov', 'ogg', 'gif', 'xml', 'js')
        self.videoType = ('mp4', 'webm', 'ogg')
        # Per-category counters reported back to the user after a crawl.
        self.summary = {
            'images': 0,
            'links': 0,
            'scripts': 0,
            'videos': 0,
            'xmls': 0,
            'htmls': 0,
        }
        self.session = requests.Session()

#-----------------------------------------------------------------------------

def savePage(self, url, pagefolder='page'):
    """Fetch *url* and save the enabled component categories under *pagefolder*.

    Returns:
        (True, summary_text) on success, where summary_text lists per-category
        counts; (False, "") on any failure.

    Reconstructed from garbled diff output that interleaved the old and new
    implementations (both the un-authenticated GET and the page.html write
    were still present alongside their replacements).
    """
    try:
        response = self.session.get(url, auth=self.auth)
        response.raise_for_status()
        self.soup = BeautifulSoup(response.text, features="lxml")
        os.makedirs(pagefolder, exist_ok=True)
        # (tag, attribute, category, enabled) table drives the crawl.
        jobs = (
            ('img', 'src', 'images', self.imgFlg),
            ('link', 'href', 'links', self.linkFlg),
            ('script', 'src', 'scripts', self.scriptFlg),
            ('video', 'src', 'videos', self.videoFlg),
            ('xml', 'src', 'xmls', self.xmlFlg),
            ('html', 'src', 'htmls', self.htmlFlg),
        )
        for tag, attr, category, enabled in jobs:
            if enabled:
                self._soupfindnSave(url, pagefolder, tag2find=tag, inner=attr, category=category)
        # Write the page itself last.
        with open(os.path.join(pagefolder, 'index.html'), 'w', encoding='utf-8') as f:
            f.write(self.soup.prettify())
        summary_text = "\n".join(f"{k}: {v}" for k, v in self.summary.items())
        return True, summary_text
    except Exception as e:
        print(f"Error saving page: {e}")
        return False, ""

#-----------------------------------------------------------------------------
def _soupfindnSave(self, url, pagefolder, tag2find='img', inner='src'):
""" Saves on specified pagefolder all tag2find objects. """
pagefolder = os.path.join(pagefolder, tag2find)
if not os.path.exists(pagefolder): os.mkdir(pagefolder)
for res in self.soup.findAll(tag2find): # images, css, etc..
try:
if not res.has_attr(inner): continue # check if inner tag (file object) exists
# clean special chars such as '@, # ? <>'
filename = re.sub('\W+', '.', os.path.basename(res[inner]))
# print("> filename:", filename)
# Added the '.html' for the html file in the href
if tag2find == 'link' and (not any(ext in filename for ext in self.linkType)):
filename += '.html'
fileurl = urljoin(url, res.get(inner))
filepath = os.path.join(pagefolder, filename)
# rename html ref so can move html and folder of files anywhere
res[inner] = os.path.join(os.path.basename(pagefolder), filename)
# create the file.
if not os.path.isfile(filepath):
with open(filepath, 'wb') as file:
filebin = self.session.get(fileurl)
if len(filebin.content) > 0: # filter the empty file(imge not found)
file.write(filebin.content)
except Exception as exc:
print(exc, file=sys.stderr)
def _soupfindnSave(self, url, pagefolder='page', tag2find='img', inner='src', category='images'):
    """Collect every *tag2find* element's *inner* URL and download the unique
    set into pagefolder/category using a thread pool.

    Bug fix: tags lacking the attribute yielded None, and urljoin(base, None)
    returns the base URL itself (urljoin returns base for a falsy url), so the
    page was spuriously re-downloaded into every category and the summary
    counts were inflated. Missing attributes are now filtered out first.
    """
    tags = self.soup.find_all(tag2find)
    print(f"Found {len(tags)} {category} tags")
    raw_urls = (tag.get(inner) for tag in tags)
    # De-duplicate; sorted() makes the download order deterministic.
    urls = sorted({urljoin(url, u) for u in raw_urls if u})
    self.summary[category] += len(urls)
    folder = os.path.join(pagefolder, category)
    os.makedirs(folder, exist_ok=True)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for u in urls:
            executor.submit(self._savefile, folder, u)

def _savefile(self, folder, fileurl):
    """Stream one file from *fileurl* into *folder*, honoring file_size_limit.

    Fixes relative to the original: the debug print contained a garbled
    "(unknown)" placeholder instead of the target filename; the name was built
    with re.sub(r'\\W+', '') which stripped '.' so extensions were lost
    ('a.css' -> 'acss') and distinct assets could collide; query strings are
    now cut off before deriving the name; and the per-chunk tqdm bar (which
    produced interleaved noise from 10 worker threads) was removed.
    """
    if not fileurl:
        return
    # Drop query/fragment, then keep word chars plus dots and dashes so the
    # file extension survives.
    basename = os.path.basename(fileurl.split('?', 1)[0].split('#', 1)[0])
    name = re.sub(r'[^\w.-]+', '', basename) or 'file'
    filename = os.path.join(folder, name)
    print(f"Downloading {fileurl} to {filename}")
    try:
        response = self.session.get(fileurl, stream=True, auth=self.auth)
        response.raise_for_status()
        content_length = response.headers.get('Content-Length')
        if content_length and self.file_size_limit and int(content_length) > self.file_size_limit:
            print(f"Skipping {fileurl}, file size {content_length} exceeds limit {self.file_size_limit}")
            return
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    except Exception as e:
        print(f"Error downloading {fileurl}: {e}")
        # Remove a partially-written file so later runs don't see truncated data.
        if os.path.exists(filename):
            os.remove(filename)