From c520fb3a8d64a6dde29d6f8d269213f9a14fbd6e Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sat, 27 Dec 2025 18:39:38 +0100 Subject: [PATCH 01/11] add setup.py --- config.py | 2 -- data/README.md | 3 --- src/substack2markdown/__init__.py | 0 .../substack2markdown/assets/author_template.html | 0 .../substack2markdown/assets}/css/essay-styles.css | 0 .../substack2markdown/assets}/css/style.css | 0 .../substack2markdown/assets}/images/screenshot.png | Bin .../substack2markdown/assets}/js/populate-essays.js | 0 .../substack2markdown/substack_scraper.py | 0 substack_html_pages/README.md | 3 --- 10 files changed, 8 deletions(-) delete mode 100644 config.py delete mode 100644 data/README.md create mode 100644 src/substack2markdown/__init__.py rename author_template.html => src/substack2markdown/assets/author_template.html (100%) rename {assets => src/substack2markdown/assets}/css/essay-styles.css (100%) rename {assets => src/substack2markdown/assets}/css/style.css (100%) rename {assets => src/substack2markdown/assets}/images/screenshot.png (100%) rename {assets => src/substack2markdown/assets}/js/populate-essays.js (100%) rename substack_scraper.py => src/substack2markdown/substack_scraper.py (100%) delete mode 100644 substack_html_pages/README.md diff --git a/config.py b/config.py deleted file mode 100644 index 8fc6bff2..00000000 --- a/config.py +++ /dev/null @@ -1,2 +0,0 @@ -EMAIL = "your-email@domain.com" -PASSWORD = "your-password" diff --git a/data/README.md b/data/README.md deleted file mode 100644 index 27476ca6..00000000 --- a/data/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory will be used to store `.json` files for each writer -containing metadata that is used to populate a `.html` file for that -author. \ No newline at end of file diff --git a/src/substack2markdown/__init__.py b/src/substack2markdown/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/author_template.html b/src/substack2markdown/assets/author_template.html similarity index 100% rename from author_template.html rename to src/substack2markdown/assets/author_template.html diff --git a/assets/css/essay-styles.css b/src/substack2markdown/assets/css/essay-styles.css similarity index 100% rename from assets/css/essay-styles.css rename to src/substack2markdown/assets/css/essay-styles.css diff --git a/assets/css/style.css b/src/substack2markdown/assets/css/style.css similarity index 100% rename from assets/css/style.css rename to src/substack2markdown/assets/css/style.css diff --git a/assets/images/screenshot.png b/src/substack2markdown/assets/images/screenshot.png similarity index 100% rename from assets/images/screenshot.png rename to src/substack2markdown/assets/images/screenshot.png diff --git a/assets/js/populate-essays.js b/src/substack2markdown/assets/js/populate-essays.js similarity index 100% rename from assets/js/populate-essays.js rename to src/substack2markdown/assets/js/populate-essays.js diff --git a/substack_scraper.py b/src/substack2markdown/substack_scraper.py similarity index 100% rename from substack_scraper.py rename to src/substack2markdown/substack_scraper.py diff --git a/substack_html_pages/README.md b/substack_html_pages/README.md deleted file mode 100644 index 0931cf8e..00000000 --- a/substack_html_pages/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory will be used to store `.html` files for each writer that will enable you -to browse and sort the downloaded markdown files for a given writer. One `.html` file -will be created for each writer. 
\ No newline at end of file From c73855e0a8aa17b916863c895acf8d6e741a2cf3 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sat, 27 Dec 2025 19:05:33 +0100 Subject: [PATCH 02/11] add parameters: config email password --- src/substack2markdown/substack_scraper.py | 42 ++++++++++++++++++----- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 126d260d..734dc553 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -21,7 +21,6 @@ from selenium.common.exceptions import SessionNotCreatedException from selenium.webdriver.chrome.service import Service from urllib.parse import urlparse -from config import EMAIL, PASSWORD USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown @@ -70,9 +69,10 @@ def generate_html_file(author_name: str) -> None: class BaseSubstackScraper(ABC): - def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str): + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): if not base_substack_url.endswith("/"): base_substack_url += "/" + self.args = args self.base_substack_url: str = base_substack_url self.writer_name: str = extract_main_part(base_substack_url) @@ -371,8 +371,8 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: class SubstackScraper(BaseSubstackScraper): - def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str): - super().__init__(base_substack_url, md_save_dir, html_save_dir) + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): + super().__init__(args, base_substack_url, md_save_dir, html_save_dir) def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: """ @@ -392,6 +392,7 @@ def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: class PremiumSubstackScraper(BaseSubstackScraper): def __init__( self, + args, base_substack_url: str, md_save_dir: str, html_save_dir: str, @@ -400,7 +401,7 @@ def __init__( edge_driver_path: str = '', user_agent: str = '' ) -> None: - super().__init__(base_substack_url, md_save_dir, html_save_dir) + super().__init__(args, base_substack_url, md_save_dir, html_save_dir) options = EdgeOptions() if headless: @@ -459,8 +460,8 @@ def login(self) -> None: # Email and password email = self.driver.find_element(By.NAME, "email") password = self.driver.find_element(By.NAME, "password") - email.send_keys(EMAIL) - password.send_keys(PASSWORD) + email.send_keys(self.args.email) + password.send_keys(self.args.password) # Find the submit button and click it. submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button") @@ -494,6 +495,15 @@ def get_url_soup(self, url: str) -> BeautifulSoup: def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Scrape a Substack site.") + parser.add_argument( + "--config", type=str, help="JSON config file with email and password." + ) + parser.add_argument( + "--email", type=str, help="Login E-Mail." + ) + parser.add_argument( + "--password", type=str, help="Login password." + ) parser.add_argument( "-u", "--url", type=str, help="The base URL of the Substack site to scrape." 
) @@ -556,17 +566,29 @@ def main(): if args.html_directory is None: args.html_directory = BASE_HTML_DIR + if args.config: + with open(args.config) as f: + config = json.load(f) + args.email = config["email"] + args.password = config["password"] + # TODO more + + assert args.email + assert args.password + if args.url: if args.premium: scraper = PremiumSubstackScraper( - args.url, + args=args, + base_substack_url=args.url, headless=args.headless, md_save_dir=args.directory, html_save_dir=args.html_directory ) else: scraper = SubstackScraper( - args.url, + args=args, + base_substack_url=args.url, md_save_dir=args.directory, html_save_dir=args.html_directory ) @@ -575,6 +597,7 @@ def main(): else: # Use the hardcoded values at the top of the file if USE_PREMIUM: scraper = PremiumSubstackScraper( + args=args, base_substack_url=BASE_SUBSTACK_URL, md_save_dir=args.directory, html_save_dir=args.html_directory, @@ -583,6 +606,7 @@ def main(): ) else: scraper = SubstackScraper( + args=args, base_substack_url=BASE_SUBSTACK_URL, md_save_dir=args.directory, html_save_dir=args.html_directory From 15fc25c71720c2afba4b3bafceabb91b1ed672bb Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 07:55:10 +0100 Subject: [PATCH 03/11] add parameters: assets-dir author-template --- src/substack2markdown/substack_scraper.py | 73 +++++++++++------------ 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 734dc553..297f0937 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -26,6 +26,7 @@ BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files +ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts @@ -37,12 +38,12 @@ def extract_main_part(url: str) -> str: # present -def generate_html_file(author_name: str) -> None: +def generate_html_file(args, author_name: str) -> None: """ Generates a HTML file for the given author. 
""" - if not os.path.exists(BASE_HTML_DIR): - os.makedirs(BASE_HTML_DIR) + if not os.path.exists(args.html_directory): + os.makedirs(args.html_directory) # Read JSON data json_path = os.path.join(JSON_DATA_DIR, f'{author_name}.json') @@ -52,7 +53,7 @@ def generate_html_file(author_name: str) -> None: # Convert JSON data to a JSON string for embedding embedded_json_data = json.dumps(essays_data, ensure_ascii=False, indent=4) - with open(HTML_TEMPLATE, 'r', encoding='utf-8') as file: + with open(args.author_template, 'r', encoding='utf-8') as file: html_template = file.read() # Insert the JSON string into the script tag in the HTML template @@ -63,7 +64,7 @@ def generate_html_file(author_name: str) -> None: html_with_author = html_with_data.replace('author_name', author_name) # Write the modified HTML to a new file - html_output_path = os.path.join(BASE_HTML_DIR, f'{author_name}.html') + html_output_path = os.path.join(args.html_directory, f'{author_name}.html') with open(html_output_path, 'w', encoding='utf-8') as file: file.write(html_with_author) @@ -193,7 +194,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None: # Calculate the relative path from the HTML file to the CSS file html_dir = os.path.dirname(filepath) - css_path = os.path.relpath("./assets/css/essay-styles.css", html_dir) + css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir) css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths html_content = f""" @@ -367,7 +368,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break self.save_essays_data_to_json(essays_data=essays_data) - generate_html_file(author_name=self.writer_name) + generate_html_file(self.args, author_name=self.writer_name) class SubstackScraper(BaseSubstackScraper): @@ -505,14 +506,22 @@ def parse_args() -> argparse.Namespace: "--password", type=str, help="Login password." ) parser.add_argument( - "-u", "--url", type=str, help="The base URL of the Substack site to scrape." + "-u", + "--url", # args.url + type=str, + default=BASE_SUBSTACK_URL, + help="The base URL of the Substack site to scrape." ) parser.add_argument( - "-d", "--directory", type=str, help="The directory to save scraped posts." + "-d", + "--directory", # args.directory + type=str, + default=BASE_MD_DIR, + help="The directory to save scraped posts." ) parser.add_argument( "-n", - "--number", + "--number", # args.number type=int, default=0, help="The number of posts to scrape. If 0 or not provided, all posts will be scraped.", @@ -523,6 +532,15 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Include -p in command to use the Premium Substack Scraper with selenium.", ) + parser.add_argument( + "--assets-dir", # args.assets_dir + default=ASSETS_DIR, + help=f"Path to assets directory. Default: {ASSETS_DIR!r}", + ) + parser.add_argument( + "--author-template", # args.author_template + help=f"Path to author_template.html. Default: {repr('{assets_dir}/' + HTML_TEMPLATE)}", + ) parser.add_argument( "--headless", action="store_true", @@ -549,9 +567,10 @@ def parse_args() -> argparse.Namespace: "passing captcha in headless mode", ) parser.add_argument( - "--html-directory", + "--html-directory", # args.html_directory type=str, - help="The directory to save scraped posts as HTML files.", + default=BASE_HTML_DIR, + help=f"The directory to save scraped posts as HTML files. 
Default: {BASE_HTML_DIR!r}", ) return parser.parse_args() @@ -560,12 +579,6 @@ def parse_args() -> argparse.Namespace: def main(): args = parse_args() - if args.directory is None: - args.directory = BASE_MD_DIR - - if args.html_directory is None: - args.html_directory = BASE_HTML_DIR - if args.config: with open(args.config) as f: config = json.load(f) @@ -576,7 +589,10 @@ def main(): assert args.email assert args.password - if args.url: + if not args.author_template: + args.author_template = args.assets_dir + "/" + HTML_TEMPLATE + + if True: if args.premium: scraper = PremiumSubstackScraper( args=args, @@ -594,25 +610,6 @@ def main(): ) scraper.scrape_posts(args.number) - else: # Use the hardcoded values at the top of the file - if USE_PREMIUM: - scraper = PremiumSubstackScraper( - args=args, - base_substack_url=BASE_SUBSTACK_URL, - md_save_dir=args.directory, - html_save_dir=args.html_directory, - edge_path=args.edge_path, - edge_driver_path=args.edge_driver_path - ) - else: - scraper = SubstackScraper( - args=args, - base_substack_url=BASE_SUBSTACK_URL, - md_save_dir=args.directory, - html_save_dir=args.html_directory - ) - scraper.scrape_posts(num_posts_to_scrape=NUM_POSTS_TO_SCRAPE) - if __name__ == "__main__": main() From 8d7676f947cfad94cf32eb013ed12f32c2bc90f8 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 10:05:02 +0100 Subject: [PATCH 04/11] use selenium_driverless --- requirements.txt | 3 +- src/substack2markdown/substack_scraper.py | 205 ++++++++++++---------- 2 files changed, 113 insertions(+), 95 deletions(-) diff --git a/requirements.txt b/requirements.txt index c58926a7..af704d1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ bs4==0.0.1 html2text==2020.1.16 requests==2.31.0 -selenium==4.16.0 +selenium-driverless tqdm==4.66.1 -webdriver_manager==4.0.1 Markdown==3.6 diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 297f0937..0f22adc5 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -4,7 +4,9 @@ from abc import ABC, abstractmethod from typing import List, Optional, Tuple from time import sleep - +import asyncio +import atexit +import signal import html2text import markdown @@ -14,12 +16,8 @@ from tqdm import tqdm from xml.etree import ElementTree as ET -from selenium import webdriver -from selenium.webdriver.common.by import By -from webdriver_manager.microsoft import EdgeChromiumDriverManager -from selenium.webdriver.edge.options import Options as EdgeOptions -from selenium.common.exceptions import SessionNotCreatedException -from selenium.webdriver.chrome.service import Service +from selenium_driverless import webdriver +from selenium_driverless.types.by import By from urllib.parse import urlparse USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts @@ -70,6 +68,15 @@ def generate_html_file(args, author_name: str) -> None: class BaseSubstackScraper(ABC): + def __await__(self): + return self._async_init().__await__() + + async def __aenter__(self): + return await self + + async def __aexit__(self, exc_type, exc, tb): + pass + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): if not base_substack_url.endswith("/"): base_substack_url += "/" @@ -92,6 +99,10 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = 
self.get_all_post_urls() + async def _async_init(self): + self._loop = asyncio.get_running_loop() + return self + def get_all_post_urls(self) -> List[str]: """ Attempts to fetch URLs from sitemap.xml, falling back to feed.xml if necessary. @@ -326,7 +337,7 @@ def save_essays_data_to_json(self, essays_data: list) -> None: with open(json_path, 'w', encoding='utf-8') as f: json.dump(essays_data, f, ensure_ascii=False, indent=4) - def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: + async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: """ Iterates over all posts and saves them as markdown and html files """ @@ -340,8 +351,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: md_filepath = os.path.join(self.md_save_dir, md_filename) html_filepath = os.path.join(self.html_save_dir, html_filename) - if not os.path.exists(md_filepath): - soup = self.get_url_soup(url) + soup = await self.get_url_soup(url) if soup is None: total += 1 continue @@ -398,100 +408,109 @@ def __init__( md_save_dir: str, html_save_dir: str, headless: bool = False, - edge_path: str = '', - edge_driver_path: str = '', + chromium_path: str = '', user_agent: str = '' ) -> None: super().__init__(args, base_substack_url, md_save_dir, html_save_dir) - options = EdgeOptions() + self.driver = None + + def exit_handler(signum, frame): + print() + print(f"exit_handler: received signal {signum}") + try: + asyncio.get_event_loop().create_task(self._cleanup_sync()) + except Exception: + pass + raise SystemExit(0) + + signal.signal(signal.SIGINT, exit_handler) + signal.signal(signal.SIGTERM, exit_handler) + + atexit.register(self._cleanup_sync) + + options = webdriver.ChromeOptions() + self.chrome_options = options if headless: - # modern headless flag (works better with recent Edge/Chromium) + # modern headless flag (works better with recent Chromium) options.add_argument("--headless=new") - if edge_path: - options.binary_location = edge_path + if chromium_path: + options.binary_location = chromium_path if user_agent: options.add_argument(f"user-agent={user_agent}") - - if isinstance(options, EdgeOptions): - os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://msedgedriver.microsoft.com") - elif isinstance(options, ChromeOptions): - os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://chromedriver.storage.googleapis.com") - - self.driver = None + async def _async_init(self): + self._loop = asyncio.get_running_loop() - # 1) Prefer an explicit driver path (manual download) - if edge_driver_path and os.path.exists(edge_driver_path): - service = Service(executable_path=edge_driver_path) - self.driver = webdriver.Edge(service=service, options=options) - else: - # 2) Try webdriver_manager (needs network/DNS) - try: - service = Service(EdgeChromiumDriverManager().install()) - self.driver = webdriver.Edge(service=service, options=options) - except Exception as e: - print("webdriver_manager could not download msedgedriver (network/DNS). 
Falling back to Selenium Manager.") - # 3) Selenium Manager fallback (still needs network; but avoids webdriver_manager) + await self._start_driver() + await self.login() + return self + + async def _start_driver(self): + self.driver = await webdriver.Chrome(options=self.chrome_options) + + async def __aexit__(self, exc_type, exc, tb): + await self.close() + + async def close(self) -> None: + if self.driver: + await self.driver.quit() + + def _cleanup_sync(self): + try: + if not self.driver: + return + proc = self.driver._process + if proc and proc.poll() is None: + proc.terminate() try: - # IMPORTANT: ensure no stale driver in PATH (e.g. C:\Windows\msedgedriver.exe v138) - self.driver = webdriver.Edge(options=options) - except SessionNotCreatedException as se: - raise RuntimeError( - "Selenium Manager fallback failed due to driver/browser mismatch.\n" - "Fix by either: (a) removing stale msedgedriver in PATH (e.g. C:\\Windows\\msedgedriver.exe) and replace with a fresh one downloaded from https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver, " - "or (b) pass --edge-driver-path to a manually downloaded driver that matches your Edge version." - ) from se + proc.wait(timeout=1) + except Exception: + proc.kill() + except Exception as exc: + print("_cleanup_sync failed:", exc) + + async def login(self): + await self.driver.get("https://substack.com/sign-in") + await asyncio.sleep(2) + + signin = await self.driver.find_element( + By.XPATH, "//a[contains(@class,'login-option')]" + ) + await signin.click() - self.login() + await asyncio.sleep(2) - def login(self) -> None: - """ - This method logs into Substack using Selenium - """ - self.driver.get("https://substack.com/sign-in") - sleep(3) + email = await self.driver.find_element(By.NAME, "email") + password = await self.driver.find_element(By.NAME, "password") - signin_with_password = self.driver.find_element( - By.XPATH, "//a[@class='login-option substack-login__login-option']" + await email.send_keys(self.args.email) + await password.send_keys(self.args.password) + + submit = await self.driver.find_element( + By.XPATH, "//*[@id='substack-login']//form//button" ) - signin_with_password.click() - sleep(3) - - # Email and password - email = self.driver.find_element(By.NAME, "email") - password = self.driver.find_element(By.NAME, "password") - email.send_keys(self.args.email) - password.send_keys(self.args.password) - - # Find the submit button and click it. - submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button") - submit.click() - sleep(30) # Wait for the page to load - - if self.is_login_failed(): - raise Exception( - "Warning: Login unsuccessful. Please check your email and password, or your account status.\n" - "Use the non-premium scraper for the non-paid posts. \n" - "If running headless, run non-headlessly to see if blocked by Captcha." - ) + await submit.click() - def is_login_failed(self) -> bool: + await asyncio.sleep(8) + + if await self.is_login_failed(): + raise RuntimeError("Substack login failed") + + async def is_login_failed(self): """ Check for the presence of the 'error-container' to indicate a failed login attempt. 
""" - error_container = self.driver.find_elements(By.ID, 'error-container') - return len(error_container) > 0 and error_container[0].is_displayed() + elements = await self.driver.find_elements(By.ID, "error-container") + return bool(elements) - def get_url_soup(self, url: str) -> BeautifulSoup: + async def get_url_soup(self, url: str): """ Gets soup from URL using logged in selenium driver """ - try: - self.driver.get(url) - return BeautifulSoup(self.driver.page_source, "html.parser") - except Exception as e: - raise ValueError(f"Error fetching page: {e}") from e + await self.driver.get(url) + html = await self.driver.page_source + return BeautifulSoup(html, "html.parser") def parse_args() -> argparse.Namespace: @@ -548,16 +567,10 @@ def parse_args() -> argparse.Namespace: "Scraper.", ) parser.add_argument( - "--edge-path", + "--chromium-path", # args.chromium_path type=str, default="", - help='Optional: The path to the Edge browser executable (i.e. "path_to_msedge.exe").', - ) - parser.add_argument( - "--edge-driver-path", - type=str, - default="", - help='Optional: The path to the Edge WebDriver executable (i.e. "path_to_msedgedriver.exe").', + help='Optional: The path to the Chromium browser executable (i.e. "path/to/chromium").', ) parser.add_argument( "--user-agent", @@ -576,7 +589,7 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def main(): +async def async_main(): args = parse_args() if args.config: @@ -594,7 +607,7 @@ def main(): if True: if args.premium: - scraper = PremiumSubstackScraper( + scraper = await PremiumSubstackScraper( args=args, base_substack_url=args.url, headless=args.headless, @@ -602,13 +615,19 @@ def main(): html_save_dir=args.html_directory ) else: - scraper = SubstackScraper( + scraper = await SubstackScraper( args=args, base_substack_url=args.url, md_save_dir=args.directory, html_save_dir=args.html_directory ) - scraper.scrape_posts(args.number) + + await scraper.scrape_posts(args.number) + await scraper.close() + + +def main(): + asyncio.run(async_main()) if __name__ == "__main__": From 4af8b45d9a477df427a352b5038ea14d983451ce Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 10:07:52 +0100 Subject: [PATCH 05/11] replace existing files --- src/substack2markdown/substack_scraper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 0f22adc5..91c3f2ac 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -178,7 +178,8 @@ def save_to_file(filepath: str, content: str) -> None: if not isinstance(content, str): raise ValueError("content must be a string") - if os.path.exists(filepath): + # if os.path.exists(filepath): + if False: print(f"File already exists: {filepath}") return @@ -351,6 +352,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: md_filepath = os.path.join(self.md_save_dir, md_filename) html_filepath = os.path.join(self.html_save_dir, html_filename) + # if not os.path.exists(md_filepath): + if True: soup = await self.get_url_soup(url) if soup is None: total += 1 From 4b8598af5affd08af4a56e1329f47959f520777c Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 11:15:06 +0100 Subject: [PATCH 06/11] fixup: assets-dir --- src/substack2markdown/substack_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 
91c3f2ac..fc3bdabf 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -206,7 +206,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None: # Calculate the relative path from the HTML file to the CSS file html_dir = os.path.dirname(filepath) - css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir) + css_path = os.path.relpath(self.args.assets_dir + "/css/essay-styles.css", html_dir) css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths html_content = f""" From 5811bb5de77a0302e619936afaa7fbe67d45b307 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 12:34:02 +0100 Subject: [PATCH 07/11] download images based on https://github.com/timf34/Substack2Markdown/pull/26 --- src/substack2markdown/substack_scraper.py | 186 +++++++++++++++++++++- 1 file changed, 185 insertions(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index fc3bdabf..e2cc62ae 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -1,6 +1,13 @@ import argparse import json import os +import io +import re +import base64 +import hashlib +import mimetypes +from pathlib import Path +from urllib.parse import urlparse, unquote from abc import ABC, abstractmethod from typing import List, Optional, Tuple from time import sleep @@ -18,18 +25,54 @@ from selenium_driverless import webdriver from selenium_driverless.types.by import By -from urllib.parse import urlparse USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files +BASE_IMAGE_DIR: str = "substack_images" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts +def count_images_in_markdown(md_content: str) -> int: + """Count number of Substack CDN image URLs in markdown content.""" + # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) + # regex lookahead: match "...)" but not "...)]" suffix + pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)(?=[^\]]|$)') + matches = re.findall(pattern, md_content) + return len(matches) + + +def sanitize_image_filename(url: str) -> str: + """Create a safe filename from URL or content.""" + # Extract original filename from CDN URL + if "substackcdn.com" in url: + # Get the actual image URL after the CDN parameters + original_url = unquote(url.split("/https%3A%2F%2F")[1]) + filename = original_url.split("/")[-1] + else: + filename = url.split("/")[-1] + + # Remove invalid characters + filename = re.sub(r'[<>:"/\\|?*]', '', filename) + + # If filename is too long or empty, create hash-based name + if len(filename) > 100 or not filename: + hash_object = hashlib.md5(url.encode()) + ext = mimetypes.guess_extension(requests.head(url).headers.get('content-type', '')) or '.jpg' + filename = f"{hash_object.hexdigest()}{ext}" + + return filename + + +def get_post_slug(url: str) -> str: + match = re.search(r'/p/([^/]+)', url) + return 
match.group(1) if match else 'unknown_post' + + def extract_main_part(url: str) -> str: parts = urlparse(url).netloc.split('.') # Parse the URL to get the netloc, and split on '.' return parts[1] if parts[0] == 'www' else parts[0] # Return the main part of the domain, while ignoring 'www' if @@ -96,6 +139,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir os.makedirs(self.html_save_dir) print(f"Created html directory {self.html_save_dir}") + if not self.args.no_images: + os.makedirs(self.args.image_directory, exist_ok=True) + self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -359,6 +405,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: total += 1 continue title, subtitle, like_count, date, md = self.extract_post_data(soup) + + if not self.args.no_images: + total_images = count_images_in_markdown(md) + post_slug = get_post_slug(url) + with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: + md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + self.save_to_file(md_filepath, md) # Convert markdown to HTML and save @@ -383,6 +436,56 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_essays_data_to_json(essays_data=essays_data) generate_html_file(self.args, author_name=self.writer_name) + async def download_image( + self, + url: str, + save_path: Path, + pbar: Optional[tqdm] = None + ) -> Optional[str]: + """Download image from URL and save to path.""" + try: + response = requests.get(url, stream=True) + if response.status_code == 200: + save_path.parent.mkdir(parents=True, exist_ok=True) + with open(save_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + if pbar: + pbar.update(1) + return str(save_path) + except Exception as exc: + if pbar: + pbar.write(f"Error downloading image {url}: {str(exc)}") + # raise exc # debug + return None + + async def process_markdown_images( + self, + md_content: str, + author: str, + post_slug: str, + pbar=None + ) -> str: + """Process markdown content to download images and update references.""" + image_dir = Path(self.args.image_directory) / author / post_slug + # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) + pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)') + buf = io.StringIO() + last_end = 0 + for match in pattern.finditer(md_content): + buf.write(md_content[last_end:match.start()]) + url = match.group(0).strip("()") + filename = sanitize_image_filename(url) + save_path = image_dir / filename + if not save_path.exists(): + await self.download_image(url, save_path, pbar) + rel_path = os.path.relpath(save_path, Path(self.args.directory) / author) + buf.write(f"({rel_path})") + last_end = match.end() + buf.write(md_content[last_end:]) + return buf.getvalue() + class SubstackScraper(BaseSubstackScraper): def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): @@ -515,6 +618,76 @@ async def get_url_soup(self, url: str): html = await self.driver.page_source return BeautifulSoup(html, "html.parser") + async def download_image_FIXME( + self, + url: str, + save_path: Path, + pbar: Optional[tqdm] = None + ) -> Optional[str]: + """Download image using selenium_driverless""" + + # NOTE for now this works with the default "def download_image" + + # WONTFIX "fetch" fails due to CORS policy 
+ + # WONTFIX "canvas" does not return the original image bytes + + # we could fetch images with CDP Network.getResponseBody + # but that requires lots of boilerplate code + # fix: use https://github.com/milahu/aiohttp_chromium + + try: + # Execute JS fetch inside browser + result = await self.driver.execute_async_script( + """ + const url = arguments[0]; + const callback = arguments[arguments.length - 1]; + + const img = new Image(); + img.crossOrigin = 'Anonymous'; // try to avoid CORS issues + img.onload = () => { + try { + const canvas = document.createElement('canvas'); + canvas.width = img.width; + canvas.height = img.height; + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0); + const dataUrl = canvas.toDataURL('image/png'); // returns "data:image/png;base64,..." + const base64 = dataUrl.split(',')[1]; // strip prefix + callback({data: base64}); + } catch (err) { + callback({error: err.message, stack: err.stack}); + } + }; + img.onerror = (err) => { + callback({error: 'Image load error', stack: err.toString()}); + }; + img.src = url; + """, + url + ) + + if isinstance(result, dict) and "error" in result: + raise RuntimeError(f"{result['error']}\nJS stack:\n{result['stack']}") + + # Decode base64 to bytes + image_bytes = base64.b64decode(result) + + save_path.parent.mkdir(parents=True, exist_ok=True) + with open(save_path, "wb") as f: + f.write(image_bytes) + + if pbar: + pbar.update(1) + + return str(save_path) + + except Exception as exc: + if pbar: + pbar.write(f"Error downloading image {url}: {exc}") + # raise exc # debug + return None + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Scrape a Substack site.") @@ -588,6 +761,17 @@ def parse_args() -> argparse.Namespace: default=BASE_HTML_DIR, help=f"The directory to save scraped posts as HTML files. Default: {BASE_HTML_DIR!r}", ) + parser.add_argument( + "--image-directory", # args.image_directory + type=str, + default=BASE_IMAGE_DIR, + help=f"The directory to save scraped image files. 
Default: {BASE_IMAGE_DIR!r}", + ) + parser.add_argument( + "--no-images", # args.no_images + action="store_true", + help=f"Do not download images.", + ) return parser.parse_args() From 153746f5b76d664526db9580259d3be6aa4bcd59 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 18:38:46 +0100 Subject: [PATCH 08/11] download comments fix https://github.com/timf34/Substack2Markdown/issues/3 --- src/substack2markdown/substack_scraper.py | 129 ++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index e2cc62ae..d2f2ad3a 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -363,6 +363,98 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st return title, subtitle, like_count, date, md_content + async def get_window_preloads(self, soup): + # all comments are stored in javascript + # + # only some comments are rendered in html + # with buttons to "Expand full comment" and "Load More" + # see also + # https://www.selfpublife.com/p/automatically-expand-all-substack-comments + window_preloads = None + for script_element in soup.select("script"): + script_text = script_element.text.strip() + if not script_text.startswith("window._preloads"): + continue + # pos1 = re.search(r'window._preloads\s*=\s*JSON\.parse\(', script_text).span()[1] + pos1 = script_text.find("(") + 1 + pos2 = script_text.rfind(")") + window_preloads = json.loads(json.loads(script_text[pos1:pos2])) + break + assert window_preloads, f"not found at {url!r}" + return window_preloads + + def count_comments(self, comments_preloads): + + def count_comments_inner(comment): + res = 1 + for child_comment in comment["children"]: + res += count_comments_inner(child_comment) + return res + + res = 0 + for comment in comments_preloads["initialComments"]: + res += count_comments_inner(comment) + return res + + def render_comments_html(self, comments_preloads): + + def render_comment_body(body): + body = body.strip() + body = "

" + body + "

" + body = body.replace("\n", "

\n

") + # TODO more? + return body + + def render_comments_html_inner(comment, buf): + assert comment["type"] == "comment", f'unexpected comment type: {comment["type"]!r}' + buf.write(f'

\n') + buf.write(f'\n') + + # NOTE user IDs are constant, user handles are variable + # when i change my user handle + # then other users can use my old user handle + buf.write(f'') + buf.write(comment["name"]) # human-readable username + buf.write('\n') + + other_pub = comment["metadata"].get("author_on_other_pub") + if other_pub: + # NOTE publication handles are quasi-constant: + # when i change my publication handle + # then other users cannot use my old publication handle + # NOTE "Changing your publication's subdomain + # does not automatically set up a redirect from the old subdomain to the new one." + buf.write(f'(') + buf.write(other_pub["name"]) + buf.write(')\n') + + buf.write(comment["date"] + '\n') # "2025-05-17T06:51:39.485Z" + + for reaction, reaction_count in comment["reactions"].items(): + if reaction_count == 0: continue + buf.write(reaction + str(reaction_count) + '\n') # "❤123" + # buf.write(str(reaction_count) + reaction + '\n') # "123❤" + + buf.write('\n') + + buf.write('
\n') + buf.write('\n') + buf.write(render_comment_body(comment["body"]) + '\n') + + for child_comment in comment["children"]: + buf.write('\n') + render_comments_html_inner(child_comment, buf) + buf.write('
\n') + + buf.write('
\n') + buf.write('\n') + + buf = io.StringIO() + # NOTE the name "initial" is misleading. all comments are stored in this array + # NOTE comments are sorted by likes + for comment in comments_preloads["initialComments"]: + render_comments_html_inner(comment, buf) + return buf.getvalue() @abstractmethod def get_url_soup(self, url: str) -> str: @@ -412,6 +504,37 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + comments_html = None + comments_num = None + if not self.args.no_comments: + comments_url = url + "/comments" + # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test + comments_soup = await self.get_url_soup(comments_url) + comments_preloads = await self.get_window_preloads(comments_soup) + if 0: + # debug + # TODO add option to write the original "preloads" data to json files + with open("comments_preloads.json", "w") as f: + json.dump(comments_preloads, f, indent=2) + raise 5 + comments_num = self.count_comments(comments_preloads) + if comments_num > 0: + comments_html = self.render_comments_html(comments_preloads) + comments_html = ( + '\n\n' + + '
\n' + + # this can collide with other elements with id="comments" + # '
\n' + + '
\n' + + '

Comments

\n' + + '
\n' + + f'{comments_num} comments\n' + + comments_html + '\n' + + '
' + '
' + ) + md += comments_html + self.save_to_file(md_filepath, md) # Convert markdown to HTML and save @@ -422,6 +545,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "title": title, "subtitle": subtitle, "like_count": like_count, + "comment_count": comments_num, "date": date, "file_link": md_filepath, "html_link": html_filepath @@ -772,6 +896,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help=f"Do not download images.", ) + parser.add_argument( + "--no-comments", # args.no_comments + action="store_true", + help=f"Do not download comments.", + ) return parser.parse_args() From 591fa86b49e6f38d802c741e3d8b8f10365c2cf8 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 22:37:11 +0100 Subject: [PATCH 09/11] handle removed comments --- src/substack2markdown/substack_scraper.py | 32 ++++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index d2f2ad3a..27890458 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -413,9 +413,19 @@ def render_comments_html_inner(comment, buf): # NOTE user IDs are constant, user handles are variable # when i change my user handle # then other users can use my old user handle - buf.write(f'') - buf.write(comment["name"]) # human-readable username - buf.write('\n') + if not comment["user_id"] is None: + buf.write(f'') + + if not comment["name"] is None: + buf.write(comment["name"]) # human-readable username + else: + # Comment removed + buf.write("null") + + if not comment["user_id"] is None: + buf.write('\n') + else: + buf.write('\n') other_pub = comment["metadata"].get("author_on_other_pub") if other_pub: @@ -439,7 +449,21 @@ def render_comments_html_inner(comment, buf): buf.write('
\n') buf.write('\n') - buf.write(render_comment_body(comment["body"]) + '\n') + + if comment["body"] is None: + # Comment removed + status = comment.get("status") + if status is None: + buf.write('(Comment removed)\n') + else: + # "moderator_removed", ... + buf.write('(status:' + status + ')\n') + # TODO comment["bans"] + # TODO comment["suppressed"] + # TODO comment["user_banned"] + # TODO comment["user_banned_for_comment"] + else: + buf.write(render_comment_body(comment["body"]) + '\n') for child_comment in comment["children"]: buf.write('\n') From 1458d78ea4828be288c73abdf5462641047813b2 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 22:37:48 +0100 Subject: [PATCH 10/11] add debug comment --- src/substack2markdown/substack_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 27890458..2a1c2727 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -578,6 +578,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: print(f"File already exists: {md_filepath}") except Exception as e: print(f"Error scraping post: {e}") + # raise e # debug count += 1 if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break From 07e4c1d1581d81d63ff6ac7624da03fa513047d7 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 23:26:21 +0100 Subject: [PATCH 11/11] write JSON files --- src/substack2markdown/substack_scraper.py | 35 +++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 2a1c2727..d31ca8dd 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -31,6 +31,7 @@ BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files BASE_IMAGE_DIR: str = "substack_images" +BASE_JSON_DIR: str = "substack_json" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" @@ -132,6 +133,8 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir self.md_save_dir: str = md_save_dir self.html_save_dir: str = f"{html_save_dir}/{self.writer_name}" + self.args.json_directory += f"/{self.writer_name}" + if not os.path.exists(md_save_dir): os.makedirs(md_save_dir) print(f"Created md directory {md_save_dir}") @@ -142,6 +145,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir if not self.args.no_images: os.makedirs(self.args.image_directory, exist_ok=True) + if not self.args.no_json: + os.makedirs(self.args.json_directory, exist_ok=True) + self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -535,12 +541,11 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test comments_soup = await self.get_url_soup(comments_url) comments_preloads = await self.get_window_preloads(comments_soup) - if 0: - # debug - # TODO add option to write the original "preloads" data to json files - with open("comments_preloads.json", "w") as f: - json.dump(comments_preloads, f, indent=2) - raise 5 + if not 
self.args.no_json: + json_filename = self.get_filename_from_url(url, filetype=".comments.json") + json_filepath = os.path.join(self.args.json_directory, json_filename) + _json = json.dumps(comments_preloads, ensure_ascii=False, separators=(',', ':')) + self.save_to_file(json_filepath, _json) comments_num = self.count_comments(comments_preloads) if comments_num > 0: comments_html = self.render_comments_html(comments_preloads) @@ -561,6 +566,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_to_file(md_filepath, md) + if not self.args.no_json: + post_preloads = await self.get_window_preloads(soup) + json_filename = self.get_filename_from_url(url, filetype=".post.json") + json_filepath = os.path.join(self.args.json_directory, json_filename) + _json = json.dumps(post_preloads, ensure_ascii=False, separators=(',', ':')) + self.save_to_file(json_filepath, _json) + # Convert markdown to HTML and save html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) @@ -916,6 +928,12 @@ def parse_args() -> argparse.Namespace: default=BASE_IMAGE_DIR, help=f"The directory to save scraped image files. Default: {BASE_IMAGE_DIR!r}", ) + parser.add_argument( + "--json-directory", # args.json_directory + type=str, + default=BASE_JSON_DIR, + help=f"The directory to save scraped JSON files. Default: {BASE_JSON_DIR!r}", + ) parser.add_argument( "--no-images", # args.no_images action="store_true", @@ -926,6 +944,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help=f"Do not download comments.", ) + parser.add_argument( + "--no-json", # args.no_json + action="store_true", + help=f"Do not write JSON files.", + ) return parser.parse_args()
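
For reference, a minimal sketch of how the command-line options introduced in this series might be used together. The config file name, the substack URL, and the output paths below are illustrative placeholders (not values taken from the patches), and the package is assumed to be installed so that the module can be run with "python -m".

    # Illustrative sketch only; all paths, URLs, and credentials are placeholders.
    import json
    import subprocess

    # --config (patch 02) expects a JSON file with "email" and "password" keys.
    config = {"email": "your-email@domain.com", "password": "your-password"}
    with open("substack-login.json", "w") as f:
        json.dump(config, f)

    # Flags as defined in parse_args(); --premium selects the
    # selenium_driverless based PremiumSubstackScraper (patch 04).
    subprocess.run([
        "python", "-m", "substack2markdown.substack_scraper",
        "--config", "substack-login.json",
        "--url", "https://example.substack.com/",
        "--premium",
        "--headless",
        "--number", "3",                            # scrape only the first 3 posts
        "--directory", "substack_md_files",         # markdown output
        "--html-directory", "substack_html_pages",  # per-post HTML output
        "--image-directory", "substack_images",     # downloaded images (patch 07)
        "--json-directory", "substack_json",        # raw window._preloads JSON (patch 11)
    ])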