From c520fb3a8d64a6dde29d6f8d269213f9a14fbd6e Mon Sep 17 00:00:00 2001
From: Milan Hauth
Date: Sat, 27 Dec 2025 18:39:38 +0100
Subject: [PATCH 01/11] add setup.py
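
Move the code into a src/ package layout so it can be installed with pip and
so the HTML/CSS/JS assets ship inside the package instead of living at the
repo root. The hard-coded credentials in config.py are removed; the next
patch adds command-line options instead.

A minimal sketch of a setup.py for this layout (package name, version and
metadata below are illustrative, not copied from this patch):

    from setuptools import setup, find_packages

    setup(
        name="substack2markdown",
        version="0.0.1",
        package_dir={"": "src"},
        packages=find_packages(where="src"),
        # ship the bundled template and static assets
        package_data={"substack2markdown": [
            "assets/author_template.html",
            "assets/css/*",
            "assets/images/*",
            "assets/js/*",
        ]},
        install_requires=["bs4", "html2text", "requests", "tqdm", "Markdown"],
        entry_points={
            "console_scripts": [
                "substack2markdown = substack2markdown.substack_scraper:main",
            ],
        },
    )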
---
config.py | 2 --
data/README.md | 3 ---
src/substack2markdown/__init__.py | 0
.../substack2markdown/assets/author_template.html | 0
.../substack2markdown/assets}/css/essay-styles.css | 0
.../substack2markdown/assets}/css/style.css | 0
.../substack2markdown/assets}/images/screenshot.png | Bin
.../substack2markdown/assets}/js/populate-essays.js | 0
.../substack2markdown/substack_scraper.py | 0
substack_html_pages/README.md | 3 ---
10 files changed, 8 deletions(-)
delete mode 100644 config.py
delete mode 100644 data/README.md
create mode 100644 src/substack2markdown/__init__.py
rename author_template.html => src/substack2markdown/assets/author_template.html (100%)
rename {assets => src/substack2markdown/assets}/css/essay-styles.css (100%)
rename {assets => src/substack2markdown/assets}/css/style.css (100%)
rename {assets => src/substack2markdown/assets}/images/screenshot.png (100%)
rename {assets => src/substack2markdown/assets}/js/populate-essays.js (100%)
rename substack_scraper.py => src/substack2markdown/substack_scraper.py (100%)
delete mode 100644 substack_html_pages/README.md
diff --git a/config.py b/config.py
deleted file mode 100644
index 8fc6bff2..00000000
--- a/config.py
+++ /dev/null
@@ -1,2 +0,0 @@
-EMAIL = "your-email@domain.com"
-PASSWORD = "your-password"
diff --git a/data/README.md b/data/README.md
deleted file mode 100644
index 27476ca6..00000000
--- a/data/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This directory will be used to store `.json` files for each writer
-containing metadata that is used to populate a `.html` file for that
-author.
\ No newline at end of file
diff --git a/src/substack2markdown/__init__.py b/src/substack2markdown/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/author_template.html b/src/substack2markdown/assets/author_template.html
similarity index 100%
rename from author_template.html
rename to src/substack2markdown/assets/author_template.html
diff --git a/assets/css/essay-styles.css b/src/substack2markdown/assets/css/essay-styles.css
similarity index 100%
rename from assets/css/essay-styles.css
rename to src/substack2markdown/assets/css/essay-styles.css
diff --git a/assets/css/style.css b/src/substack2markdown/assets/css/style.css
similarity index 100%
rename from assets/css/style.css
rename to src/substack2markdown/assets/css/style.css
diff --git a/assets/images/screenshot.png b/src/substack2markdown/assets/images/screenshot.png
similarity index 100%
rename from assets/images/screenshot.png
rename to src/substack2markdown/assets/images/screenshot.png
diff --git a/assets/js/populate-essays.js b/src/substack2markdown/assets/js/populate-essays.js
similarity index 100%
rename from assets/js/populate-essays.js
rename to src/substack2markdown/assets/js/populate-essays.js
diff --git a/substack_scraper.py b/src/substack2markdown/substack_scraper.py
similarity index 100%
rename from substack_scraper.py
rename to src/substack2markdown/substack_scraper.py
diff --git a/substack_html_pages/README.md b/substack_html_pages/README.md
deleted file mode 100644
index 0931cf8e..00000000
--- a/substack_html_pages/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-This directory will be used to store `.html` files for each writer that will enable you
-to browse and sort the downloaded markdown files for a given writer. One `.html` file
-will be created for each writer.
\ No newline at end of file
From c73855e0a8aa17b916863c895acf8d6e741a2cf3 Mon Sep 17 00:00:00 2001
From: Milan Hauth
Date: Sat, 27 Dec 2025 19:05:33 +0100
Subject: [PATCH 02/11] add parameters: config email password
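
Replace the deleted config.py with command-line options: --email and
--password can be passed directly, or --config can point to a JSON file that
currently provides "email" and "password" (more keys may follow, see the
TODO below).

Sketch of a config file that --config accepts (the file name and the values
are examples only):

    import json

    # contents of e.g. substack-config.json
    config = {
        "email": "your-email@domain.com",
        "password": "your-password",
    }
    with open("substack-config.json", "w") as f:
        json.dump(config, f, indent=2)

    # the scraper then does roughly:
    #   with open(args.config) as f:
    #       config = json.load(f)
    #   args.email = config["email"]
    #   args.password = config["password"]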
---
src/substack2markdown/substack_scraper.py | 42 ++++++++++++++++++-----
1 file changed, 33 insertions(+), 9 deletions(-)
diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py
index 126d260d..734dc553 100644
--- a/src/substack2markdown/substack_scraper.py
+++ b/src/substack2markdown/substack_scraper.py
@@ -21,7 +21,6 @@
from selenium.common.exceptions import SessionNotCreatedException
from selenium.webdriver.chrome.service import Service
from urllib.parse import urlparse
-from config import EMAIL, PASSWORD
USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts
BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown
@@ -70,9 +69,10 @@ def generate_html_file(author_name: str) -> None:
class BaseSubstackScraper(ABC):
- def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str):
+ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str):
if not base_substack_url.endswith("/"):
base_substack_url += "/"
+ self.args = args
self.base_substack_url: str = base_substack_url
self.writer_name: str = extract_main_part(base_substack_url)
@@ -371,8 +371,8 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
class SubstackScraper(BaseSubstackScraper):
- def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str):
- super().__init__(base_substack_url, md_save_dir, html_save_dir)
+ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str):
+ super().__init__(args, base_substack_url, md_save_dir, html_save_dir)
def get_url_soup(self, url: str) -> Optional[BeautifulSoup]:
"""
@@ -392,6 +392,7 @@ def get_url_soup(self, url: str) -> Optional[BeautifulSoup]:
class PremiumSubstackScraper(BaseSubstackScraper):
def __init__(
self,
+ args,
base_substack_url: str,
md_save_dir: str,
html_save_dir: str,
@@ -400,7 +401,7 @@ def __init__(
edge_driver_path: str = '',
user_agent: str = ''
) -> None:
- super().__init__(base_substack_url, md_save_dir, html_save_dir)
+ super().__init__(args, base_substack_url, md_save_dir, html_save_dir)
options = EdgeOptions()
if headless:
@@ -459,8 +460,8 @@ def login(self) -> None:
# Email and password
email = self.driver.find_element(By.NAME, "email")
password = self.driver.find_element(By.NAME, "password")
- email.send_keys(EMAIL)
- password.send_keys(PASSWORD)
+ email.send_keys(self.args.email)
+ password.send_keys(self.args.password)
# Find the submit button and click it.
submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button")
@@ -494,6 +495,15 @@ def get_url_soup(self, url: str) -> BeautifulSoup:
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Scrape a Substack site.")
+ parser.add_argument(
+ "--config", type=str, help="JSON config file with email and password."
+ )
+ parser.add_argument(
+ "--email", type=str, help="Login E-Mail."
+ )
+ parser.add_argument(
+ "--password", type=str, help="Login password."
+ )
parser.add_argument(
"-u", "--url", type=str, help="The base URL of the Substack site to scrape."
)
@@ -556,17 +566,29 @@ def main():
if args.html_directory is None:
args.html_directory = BASE_HTML_DIR
+ if args.config:
+ with open(args.config) as f:
+ config = json.load(f)
+ args.email = config["email"]
+ args.password = config["password"]
+ # TODO more
+
+ assert args.email
+ assert args.password
+
if args.url:
if args.premium:
scraper = PremiumSubstackScraper(
- args.url,
+ args=args,
+ base_substack_url=args.url,
headless=args.headless,
md_save_dir=args.directory,
html_save_dir=args.html_directory
)
else:
scraper = SubstackScraper(
- args.url,
+ args=args,
+ base_substack_url=args.url,
md_save_dir=args.directory,
html_save_dir=args.html_directory
)
@@ -575,6 +597,7 @@ def main():
else: # Use the hardcoded values at the top of the file
if USE_PREMIUM:
scraper = PremiumSubstackScraper(
+ args=args,
base_substack_url=BASE_SUBSTACK_URL,
md_save_dir=args.directory,
html_save_dir=args.html_directory,
@@ -583,6 +606,7 @@ def main():
)
else:
scraper = SubstackScraper(
+ args=args,
base_substack_url=BASE_SUBSTACK_URL,
md_save_dir=args.directory,
html_save_dir=args.html_directory
From 15fc25c71720c2afba4b3bafceabb91b1ed672bb Mon Sep 17 00:00:00 2001
From: Milan Hauth
Date: Sun, 28 Dec 2025 07:55:10 +0100
Subject: [PATCH 03/11] add parameters: assets-dir author-template
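
The bundled assets now live inside the installed package, so the author page
template and the CSS no longer have to sit in the current working directory:
--assets-dir defaults to the package's assets/ folder and --author-template
defaults to author_template.html inside that folder. --url, --directory and
--html-directory also get defaults, which makes the old hard-coded fallback
branch in main() unnecessary.

Sketch of how the template default resolves (the helper name is made up; the
real logic lives in parse_args() and main() below):

    import os

    ASSETS_DIR = os.path.dirname(__file__) + "/assets"   # default for --assets-dir
    HTML_TEMPLATE = "author_template.html"

    def resolve_author_template(assets_dir=ASSETS_DIR, author_template=None):
        # --author-template wins; otherwise fall back to {assets_dir}/author_template.html
        return author_template or assets_dir + "/" + HTML_TEMPLATE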
---
src/substack2markdown/substack_scraper.py | 73 +++++++++++------------
1 file changed, 35 insertions(+), 38 deletions(-)
diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py
index 734dc553..297f0937 100644
--- a/src/substack2markdown/substack_scraper.py
+++ b/src/substack2markdown/substack_scraper.py
@@ -26,6 +26,7 @@
BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown
BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files
BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files
+ASSETS_DIR: str = os.path.dirname(__file__) + "/assets"
HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page
JSON_DATA_DIR: str = "data"
NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts
@@ -37,12 +38,12 @@ def extract_main_part(url: str) -> str:
# present
-def generate_html_file(author_name: str) -> None:
+def generate_html_file(args, author_name: str) -> None:
"""
Generates a HTML file for the given author.
"""
- if not os.path.exists(BASE_HTML_DIR):
- os.makedirs(BASE_HTML_DIR)
+ if not os.path.exists(args.html_directory):
+ os.makedirs(args.html_directory)
# Read JSON data
json_path = os.path.join(JSON_DATA_DIR, f'{author_name}.json')
@@ -52,7 +53,7 @@ def generate_html_file(author_name: str) -> None:
# Convert JSON data to a JSON string for embedding
embedded_json_data = json.dumps(essays_data, ensure_ascii=False, indent=4)
- with open(HTML_TEMPLATE, 'r', encoding='utf-8') as file:
+ with open(args.author_template, 'r', encoding='utf-8') as file:
html_template = file.read()
# Insert the JSON string into the script tag in the HTML template
@@ -63,7 +64,7 @@ def generate_html_file(author_name: str) -> None:
html_with_author = html_with_data.replace('author_name', author_name)
# Write the modified HTML to a new file
- html_output_path = os.path.join(BASE_HTML_DIR, f'{author_name}.html')
+ html_output_path = os.path.join(args.html_directory, f'{author_name}.html')
with open(html_output_path, 'w', encoding='utf-8') as file:
file.write(html_with_author)
@@ -193,7 +194,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None:
# Calculate the relative path from the HTML file to the CSS file
html_dir = os.path.dirname(filepath)
- css_path = os.path.relpath("./assets/css/essay-styles.css", html_dir)
+ css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir)
css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths
html_content = f"""
@@ -367,7 +368,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
if num_posts_to_scrape != 0 and count == num_posts_to_scrape:
break
self.save_essays_data_to_json(essays_data=essays_data)
- generate_html_file(author_name=self.writer_name)
+ generate_html_file(self.args, author_name=self.writer_name)
class SubstackScraper(BaseSubstackScraper):
@@ -505,14 +506,22 @@ def parse_args() -> argparse.Namespace:
"--password", type=str, help="Login password."
)
parser.add_argument(
- "-u", "--url", type=str, help="The base URL of the Substack site to scrape."
+ "-u",
+ "--url", # args.url
+ type=str,
+ default=BASE_SUBSTACK_URL,
+ help="The base URL of the Substack site to scrape."
)
parser.add_argument(
- "-d", "--directory", type=str, help="The directory to save scraped posts."
+ "-d",
+ "--directory", # args.directory
+ type=str,
+ default=BASE_MD_DIR,
+ help="The directory to save scraped posts."
)
parser.add_argument(
"-n",
- "--number",
+ "--number", # args.number
type=int,
default=0,
help="The number of posts to scrape. If 0 or not provided, all posts will be scraped.",
@@ -523,6 +532,15 @@ def parse_args() -> argparse.Namespace:
action="store_true",
help="Include -p in command to use the Premium Substack Scraper with selenium.",
)
+ parser.add_argument(
+ "--assets-dir", # args.assets_dir
+ default=ASSETS_DIR,
+ help=f"Path to assets directory. Default: {ASSETS_DIR!r}",
+ )
+ parser.add_argument(
+ "--author-template", # args.author_template
+ help=f"Path to author_template.html. Default: {repr('{assets_dir}/' + HTML_TEMPLATE)}",
+ )
parser.add_argument(
"--headless",
action="store_true",
@@ -549,9 +567,10 @@ def parse_args() -> argparse.Namespace:
"passing captcha in headless mode",
)
parser.add_argument(
- "--html-directory",
+ "--html-directory", # args.html_directory
type=str,
- help="The directory to save scraped posts as HTML files.",
+ default=BASE_HTML_DIR,
+ help=f"The directory to save scraped posts as HTML files. Default: {BASE_HTML_DIR!r}",
)
return parser.parse_args()
@@ -560,12 +579,6 @@ def parse_args() -> argparse.Namespace:
def main():
args = parse_args()
- if args.directory is None:
- args.directory = BASE_MD_DIR
-
- if args.html_directory is None:
- args.html_directory = BASE_HTML_DIR
-
if args.config:
with open(args.config) as f:
config = json.load(f)
@@ -576,7 +589,10 @@ def main():
assert args.email
assert args.password
- if args.url:
+ if not args.author_template:
+ args.author_template = args.assets_dir + "/" + HTML_TEMPLATE
+
+ if True:
if args.premium:
scraper = PremiumSubstackScraper(
args=args,
@@ -594,25 +610,6 @@ def main():
)
scraper.scrape_posts(args.number)
- else: # Use the hardcoded values at the top of the file
- if USE_PREMIUM:
- scraper = PremiumSubstackScraper(
- args=args,
- base_substack_url=BASE_SUBSTACK_URL,
- md_save_dir=args.directory,
- html_save_dir=args.html_directory,
- edge_path=args.edge_path,
- edge_driver_path=args.edge_driver_path
- )
- else:
- scraper = SubstackScraper(
- args=args,
- base_substack_url=BASE_SUBSTACK_URL,
- md_save_dir=args.directory,
- html_save_dir=args.html_directory
- )
- scraper.scrape_posts(num_posts_to_scrape=NUM_POSTS_TO_SCRAPE)
-
if __name__ == "__main__":
main()
From 8d7676f947cfad94cf32eb013ed12f32c2bc90f8 Mon Sep 17 00:00:00 2001
From: Milan Hauth
Date: Sun, 28 Dec 2025 10:05:02 +0100
Subject: [PATCH 04/11] use selenium_driverless
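
Switch from selenium + webdriver_manager (Edge) to selenium_driverless, which
drives a local Chromium over CDP and needs no separate webdriver binary. The
patch uses its async API, so the scraper classes become awaitable and
scrape_posts()/get_url_soup() turn into coroutines; main() is now a thin
asyncio.run() wrapper around async_main(). --edge-path and --edge-driver-path
are replaced by --chromium-path.

Minimal standalone sketch of the selenium_driverless calls used in this patch
(assumes "pip install selenium-driverless" and a local Chromium; the URL is an
example):

    import asyncio
    from selenium_driverless import webdriver
    from selenium_driverless.types.by import By

    async def demo():
        options = webdriver.ChromeOptions()
        options.add_argument("--headless=new")            # same flag used below
        driver = await webdriver.Chrome(options=options)  # construction is awaited
        try:
            await driver.get("https://example.com")
            html = await driver.page_source               # page_source is awaited too
            errors = await driver.find_elements(By.ID, "error-container")
            print(len(html), len(errors))
        finally:
            await driver.quit()

    asyncio.run(demo())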
---
requirements.txt | 3 +-
src/substack2markdown/substack_scraper.py | 205 ++++++++++++----------
2 files changed, 113 insertions(+), 95 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index c58926a7..af704d1a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
bs4==0.0.1
html2text==2020.1.16
requests==2.31.0
-selenium==4.16.0
+selenium-driverless
tqdm==4.66.1
-webdriver_manager==4.0.1
Markdown==3.6
diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py
index 297f0937..0f22adc5 100644
--- a/src/substack2markdown/substack_scraper.py
+++ b/src/substack2markdown/substack_scraper.py
@@ -4,7 +4,9 @@
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple
from time import sleep
-
+import asyncio
+import atexit
+import signal
import html2text
import markdown
@@ -14,12 +16,8 @@
from tqdm import tqdm
from xml.etree import ElementTree as ET
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from webdriver_manager.microsoft import EdgeChromiumDriverManager
-from selenium.webdriver.edge.options import Options as EdgeOptions
-from selenium.common.exceptions import SessionNotCreatedException
-from selenium.webdriver.chrome.service import Service
+from selenium_driverless import webdriver
+from selenium_driverless.types.by import By
from urllib.parse import urlparse
USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts
@@ -70,6 +68,15 @@ def generate_html_file(args, author_name: str) -> None:
class BaseSubstackScraper(ABC):
+ def __await__(self):
+ return self._async_init().__await__()
+
+ async def __aenter__(self):
+ return await self
+
+ async def __aexit__(self, exc_type, exc, tb):
+ pass
+
def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str):
if not base_substack_url.endswith("/"):
base_substack_url += "/"
@@ -92,6 +99,10 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir
self.keywords: List[str] = ["about", "archive", "podcast"]
self.post_urls: List[str] = self.get_all_post_urls()
+ async def _async_init(self):
+ self._loop = asyncio.get_running_loop()
+ return self
+
def get_all_post_urls(self) -> List[str]:
"""
Attempts to fetch URLs from sitemap.xml, falling back to feed.xml if necessary.
@@ -326,7 +337,7 @@ def save_essays_data_to_json(self, essays_data: list) -> None:
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(essays_data, f, ensure_ascii=False, indent=4)
- def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
+ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
"""
Iterates over all posts and saves them as markdown and html files
"""
@@ -340,8 +351,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
md_filepath = os.path.join(self.md_save_dir, md_filename)
html_filepath = os.path.join(self.html_save_dir, html_filename)
- if not os.path.exists(md_filepath):
- soup = self.get_url_soup(url)
+ soup = await self.get_url_soup(url)
if soup is None:
total += 1
continue
@@ -398,100 +408,109 @@ def __init__(
md_save_dir: str,
html_save_dir: str,
headless: bool = False,
- edge_path: str = '',
- edge_driver_path: str = '',
+ chromium_path: str = '',
user_agent: str = ''
) -> None:
super().__init__(args, base_substack_url, md_save_dir, html_save_dir)
- options = EdgeOptions()
+ self.driver = None
+
+ def exit_handler(signum, frame):
+ print()
+ print(f"exit_handler: received signal {signum}")
+ try:
+ asyncio.get_event_loop().create_task(self._cleanup_sync())
+ except Exception:
+ pass
+ raise SystemExit(0)
+
+ signal.signal(signal.SIGINT, exit_handler)
+ signal.signal(signal.SIGTERM, exit_handler)
+
+ atexit.register(self._cleanup_sync)
+
+ options = webdriver.ChromeOptions()
+ self.chrome_options = options
if headless:
- # modern headless flag (works better with recent Edge/Chromium)
+ # modern headless flag (works better with recent Chromium)
options.add_argument("--headless=new")
- if edge_path:
- options.binary_location = edge_path
+ if chromium_path:
+ options.binary_location = chromium_path
if user_agent:
options.add_argument(f"user-agent={user_agent}")
-
- if isinstance(options, EdgeOptions):
- os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://msedgedriver.microsoft.com")
- elif isinstance(options, ChromeOptions):
- os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://chromedriver.storage.googleapis.com")
-
- self.driver = None
+ async def _async_init(self):
+ self._loop = asyncio.get_running_loop()
- # 1) Prefer an explicit driver path (manual download)
- if edge_driver_path and os.path.exists(edge_driver_path):
- service = Service(executable_path=edge_driver_path)
- self.driver = webdriver.Edge(service=service, options=options)
- else:
- # 2) Try webdriver_manager (needs network/DNS)
- try:
- service = Service(EdgeChromiumDriverManager().install())
- self.driver = webdriver.Edge(service=service, options=options)
- except Exception as e:
- print("webdriver_manager could not download msedgedriver (network/DNS). Falling back to Selenium Manager.")
- # 3) Selenium Manager fallback (still needs network; but avoids webdriver_manager)
+ await self._start_driver()
+ await self.login()
+ return self
+
+ async def _start_driver(self):
+ self.driver = await webdriver.Chrome(options=self.chrome_options)
+
+ async def __aexit__(self, exc_type, exc, tb):
+ await self.close()
+
+ async def close(self) -> None:
+ if self.driver:
+ await self.driver.quit()
+
+ def _cleanup_sync(self):
+ try:
+ if not self.driver:
+ return
+ proc = self.driver._process
+ if proc and proc.poll() is None:
+ proc.terminate()
try:
- # IMPORTANT: ensure no stale driver in PATH (e.g. C:\Windows\msedgedriver.exe v138)
- self.driver = webdriver.Edge(options=options)
- except SessionNotCreatedException as se:
- raise RuntimeError(
- "Selenium Manager fallback failed due to driver/browser mismatch.\n"
- "Fix by either: (a) removing stale msedgedriver in PATH (e.g. C:\\Windows\\msedgedriver.exe) and replace with a fresh one downloaded from https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver, "
- "or (b) pass --edge-driver-path to a manually downloaded driver that matches your Edge version."
- ) from se
+ proc.wait(timeout=1)
+ except Exception:
+ proc.kill()
+ except Exception as exc:
+ print("_cleanup_sync failed:", exc)
+
+ async def login(self):
+ await self.driver.get("https://substack.com/sign-in")
+ await asyncio.sleep(2)
+
+ signin = await self.driver.find_element(
+ By.XPATH, "//a[contains(@class,'login-option')]"
+ )
+ await signin.click()
- self.login()
+ await asyncio.sleep(2)
- def login(self) -> None:
- """
- This method logs into Substack using Selenium
- """
- self.driver.get("https://substack.com/sign-in")
- sleep(3)
+ email = await self.driver.find_element(By.NAME, "email")
+ password = await self.driver.find_element(By.NAME, "password")
- signin_with_password = self.driver.find_element(
- By.XPATH, "//a[@class='login-option substack-login__login-option']"
+ await email.send_keys(self.args.email)
+ await password.send_keys(self.args.password)
+
+ submit = await self.driver.find_element(
+ By.XPATH, "//*[@id='substack-login']//form//button"
)
- signin_with_password.click()
- sleep(3)
-
- # Email and password
- email = self.driver.find_element(By.NAME, "email")
- password = self.driver.find_element(By.NAME, "password")
- email.send_keys(self.args.email)
- password.send_keys(self.args.password)
-
- # Find the submit button and click it.
- submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button")
- submit.click()
- sleep(30) # Wait for the page to load
-
- if self.is_login_failed():
- raise Exception(
- "Warning: Login unsuccessful. Please check your email and password, or your account status.\n"
- "Use the non-premium scraper for the non-paid posts. \n"
- "If running headless, run non-headlessly to see if blocked by Captcha."
- )
+ await submit.click()
- def is_login_failed(self) -> bool:
+ await asyncio.sleep(8)
+
+ if await self.is_login_failed():
+ raise RuntimeError("Substack login failed")
+
+ async def is_login_failed(self):
"""
Check for the presence of the 'error-container' to indicate a failed login attempt.
"""
- error_container = self.driver.find_elements(By.ID, 'error-container')
- return len(error_container) > 0 and error_container[0].is_displayed()
+ elements = await self.driver.find_elements(By.ID, "error-container")
+ return bool(elements)
- def get_url_soup(self, url: str) -> BeautifulSoup:
+ async def get_url_soup(self, url: str):
"""
Gets soup from URL using logged in selenium driver
"""
- try:
- self.driver.get(url)
- return BeautifulSoup(self.driver.page_source, "html.parser")
- except Exception as e:
- raise ValueError(f"Error fetching page: {e}") from e
+ await self.driver.get(url)
+ html = await self.driver.page_source
+ return BeautifulSoup(html, "html.parser")
def parse_args() -> argparse.Namespace:
@@ -548,16 +567,10 @@ def parse_args() -> argparse.Namespace:
"Scraper.",
)
parser.add_argument(
- "--edge-path",
+ "--chromium-path", # args.chromium_path
type=str,
default="",
- help='Optional: The path to the Edge browser executable (i.e. "path_to_msedge.exe").',
- )
- parser.add_argument(
- "--edge-driver-path",
- type=str,
- default="",
- help='Optional: The path to the Edge WebDriver executable (i.e. "path_to_msedgedriver.exe").',
+ help='Optional: The path to the Chromium browser executable (i.e. "path/to/chromium").',
)
parser.add_argument(
"--user-agent",
@@ -576,7 +589,7 @@ def parse_args() -> argparse.Namespace:
return parser.parse_args()
-def main():
+async def async_main():
args = parse_args()
if args.config:
@@ -594,7 +607,7 @@ def main():
if True:
if args.premium:
- scraper = PremiumSubstackScraper(
+ scraper = await PremiumSubstackScraper(
args=args,
base_substack_url=args.url,
headless=args.headless,
@@ -602,13 +615,19 @@ def main():
html_save_dir=args.html_directory
)
else:
- scraper = SubstackScraper(
+ scraper = await SubstackScraper(
args=args,
base_substack_url=args.url,
md_save_dir=args.directory,
html_save_dir=args.html_directory
)
- scraper.scrape_posts(args.number)
+
+ await scraper.scrape_posts(args.number)
+ await scraper.close()
+
+
+def main():
+ asyncio.run(async_main())
if __name__ == "__main__":
From 4af8b45d9a477df427a352b5038ea14d983451ce Mon Sep 17 00:00:00 2001
From: Milan Hauth
Date: Sun, 28 Dec 2025 10:07:52 +0100
Subject: [PATCH 05/11] replace existing files
---
src/substack2markdown/substack_scraper.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py
index 0f22adc5..91c3f2ac 100644
--- a/src/substack2markdown/substack_scraper.py
+++ b/src/substack2markdown/substack_scraper.py
@@ -178,7 +178,8 @@ def save_to_file(filepath: str, content: str) -> None:
if not isinstance(content, str):
raise ValueError("content must be a string")
- if os.path.exists(filepath):
+ # if os.path.exists(filepath):
+ if False:
print(f"File already exists: {filepath}")
return
@@ -351,6 +352,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
md_filepath = os.path.join(self.md_save_dir, md_filename)
html_filepath = os.path.join(self.html_save_dir, html_filename)
+ # if not os.path.exists(md_filepath):
+ if True:
soup = await self.get_url_soup(url)
if soup is None:
total += 1
From 4b8598af5affd08af4a56e1329f47959f520777c Mon Sep 17 00:00:00 2001
From: Milan Hauth
Date: Sun, 28 Dec 2025 11:15:06 +0100
Subject: [PATCH 06/11] fixup: assets-dir
---
src/substack2markdown/substack_scraper.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py
index 91c3f2ac..fc3bdabf 100644
--- a/src/substack2markdown/substack_scraper.py
+++ b/src/substack2markdown/substack_scraper.py
@@ -206,7 +206,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None:
# Calculate the relative path from the HTML file to the CSS file
html_dir = os.path.dirname(filepath)
- css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir)
+ css_path = os.path.relpath(self.args.assets_dir + "/css/essay-styles.css", html_dir)
css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths
html_content = f"""
From 5811bb5de77a0302e619936afaa7fbe67d45b307 Mon Sep 17 00:00:00 2001
From: Milan Hauth
Date: Sun, 28 Dec 2025 12:34:02 +0100
Subject: [PATCH 07/11] download images
based on
https://github.com/timf34/Substack2Markdown/pull/26
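
Rewrite Substack CDN image links in the generated markdown to local files
under --image-directory (default "substack_images"), downloading each image
only once. --no-images skips this step.

Worked example of the filename extraction done by sanitize_image_filename
(the URL is made up but follows the usual
substackcdn.com/image/fetch/<params>/<percent-encoded original> shape):

    from urllib.parse import unquote

    url = ("https://substackcdn.com/image/fetch/w_1456,c_limit,f_auto"
           "/https%3A%2F%2Fbucket.s3.amazonaws.com%2Fpublic%2Fimages%2Fphoto.png")

    original = unquote(url.split("/https%3A%2F%2F")[1])
    # -> "bucket.s3.amazonaws.com/public/images/photo.png"
    filename = original.split("/")[-1]
    # -> "photo.png"
    print(filename)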
---
src/substack2markdown/substack_scraper.py | 186 +++++++++++++++++++++-
1 file changed, 185 insertions(+), 1 deletion(-)
diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py
index fc3bdabf..e2cc62ae 100644
--- a/src/substack2markdown/substack_scraper.py
+++ b/src/substack2markdown/substack_scraper.py
@@ -1,6 +1,13 @@
import argparse
import json
import os
+import io
+import re
+import base64
+import hashlib
+import mimetypes
+from pathlib import Path
+from urllib.parse import urlparse, unquote
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple
from time import sleep
@@ -18,18 +25,54 @@
from selenium_driverless import webdriver
from selenium_driverless.types.by import By
-from urllib.parse import urlparse
USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts
BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown
BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files
BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files
+BASE_IMAGE_DIR: str = "substack_images"
ASSETS_DIR: str = os.path.dirname(__file__) + "/assets"
HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page
JSON_DATA_DIR: str = "data"
NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts
+def count_images_in_markdown(md_content: str) -> int:
+ """Count number of Substack CDN image URLs in markdown content."""
+ # [](https://substackcdn.com/image/fetch/x.png)
+ # regex lookahead: match "...)" but not "...)]" suffix
+ pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)(?=[^\]]|$)')
+ matches = re.findall(pattern, md_content)
+ return len(matches)
+
+
+def sanitize_image_filename(url: str) -> str:
+ """Create a safe filename from URL or content."""
+ # Extract original filename from CDN URL
+ if "substackcdn.com" in url:
+ # Get the actual image URL after the CDN parameters
+ original_url = unquote(url.split("/https%3A%2F%2F")[1])
+ filename = original_url.split("/")[-1]
+ else:
+ filename = url.split("/")[-1]
+
+ # Remove invalid characters
+ filename = re.sub(r'[<>:"/\\|?*]', '', filename)
+
+ # If filename is too long or empty, create hash-based name
+ if len(filename) > 100 or not filename:
+ hash_object = hashlib.md5(url.encode())
+ ext = mimetypes.guess_extension(requests.head(url).headers.get('content-type', '')) or '.jpg'
+ filename = f"{hash_object.hexdigest()}{ext}"
+
+ return filename
+
+
+def get_post_slug(url: str) -> str:
+ match = re.search(r'/p/([^/]+)', url)
+ return match.group(1) if match else 'unknown_post'
+
+
def extract_main_part(url: str) -> str:
parts = urlparse(url).netloc.split('.') # Parse the URL to get the netloc, and split on '.'
return parts[1] if parts[0] == 'www' else parts[0] # Return the main part of the domain, while ignoring 'www' if
@@ -96,6 +139,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir
os.makedirs(self.html_save_dir)
print(f"Created html directory {self.html_save_dir}")
+ if not self.args.no_images:
+ os.makedirs(self.args.image_directory, exist_ok=True)
+
self.keywords: List[str] = ["about", "archive", "podcast"]
self.post_urls: List[str] = self.get_all_post_urls()
@@ -359,6 +405,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
total += 1
continue
title, subtitle, like_count, date, md = self.extract_post_data(soup)
+
+ if not self.args.no_images:
+ total_images = count_images_in_markdown(md)
+ post_slug = get_post_slug(url)
+ with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar:
+ md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar)
+
self.save_to_file(md_filepath, md)
# Convert markdown to HTML and save
@@ -383,6 +436,56 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
self.save_essays_data_to_json(essays_data=essays_data)
generate_html_file(self.args, author_name=self.writer_name)
+ async def download_image(
+ self,
+ url: str,
+ save_path: Path,
+ pbar: Optional[tqdm] = None
+ ) -> Optional[str]:
+ """Download image from URL and save to path."""
+ try:
+ response = requests.get(url, stream=True)
+ if response.status_code == 200:
+ save_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(save_path, 'wb') as f:
+ for chunk in response.iter_content(chunk_size=8192):
+ if chunk:
+ f.write(chunk)
+ if pbar:
+ pbar.update(1)
+ return str(save_path)
+ except Exception as exc:
+ if pbar:
+ pbar.write(f"Error downloading image {url}: {str(exc)}")
+ # raise exc # debug
+ return None
+
+ async def process_markdown_images(
+ self,
+ md_content: str,
+ author: str,
+ post_slug: str,
+ pbar=None
+ ) -> str:
+ """Process markdown content to download images and update references."""
+ image_dir = Path(self.args.image_directory) / author / post_slug
+ # [](https://substackcdn.com/image/fetch/x.png)
+ pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)')
+ buf = io.StringIO()
+ last_end = 0
+ for match in pattern.finditer(md_content):
+ buf.write(md_content[last_end:match.start()])
+ url = match.group(0).strip("()")
+ filename = sanitize_image_filename(url)
+ save_path = image_dir / filename
+ if not save_path.exists():
+ await self.download_image(url, save_path, pbar)
+ rel_path = os.path.relpath(save_path, Path(self.args.directory) / author)
+ buf.write(f"({rel_path})")
+ last_end = match.end()
+ buf.write(md_content[last_end:])
+ return buf.getvalue()
+
class SubstackScraper(BaseSubstackScraper):
def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str):
@@ -515,6 +618,76 @@ async def get_url_soup(self, url: str):
html = await self.driver.page_source
return BeautifulSoup(html, "html.parser")
+ async def download_image_FIXME(
+ self,
+ url: str,
+ save_path: Path,
+ pbar: Optional[tqdm] = None
+ ) -> Optional[str]:
+ """Download image using selenium_driverless"""
+
+ # NOTE for now this works with the default "def download_image"
+
+ # WONTFIX "fetch" fails due to CORS policy
+
+ # WONTFIX "canvas" does not return the original image bytes
+
+ # we could fetch images with CDP Network.getResponseBody
+ # but that requires lots of boilerplate code
+ # fix: use https://github.com/milahu/aiohttp_chromium
+
+ try:
+ # Execute JS fetch inside browser
+ result = await self.driver.execute_async_script(
+ """
+ const url = arguments[0];
+ const callback = arguments[arguments.length - 1];
+
+ const img = new Image();
+ img.crossOrigin = 'Anonymous'; // try to avoid CORS issues
+ img.onload = () => {
+ try {
+ const canvas = document.createElement('canvas');
+ canvas.width = img.width;
+ canvas.height = img.height;
+ const ctx = canvas.getContext('2d');
+ ctx.drawImage(img, 0, 0);
+ const dataUrl = canvas.toDataURL('image/png'); // returns "data:image/png;base64,..."
+ const base64 = dataUrl.split(',')[1]; // strip prefix
+ callback({data: base64});
+ } catch (err) {
+ callback({error: err.message, stack: err.stack});
+ }
+ };
+ img.onerror = (err) => {
+ callback({error: 'Image load error', stack: err.toString()});
+ };
+ img.src = url;
+ """,
+ url
+ )
+
+ if isinstance(result, dict) and "error" in result:
+ raise RuntimeError(f"{result['error']}\nJS stack:\n{result['stack']}")
+
+ # Decode base64 to bytes
+ image_bytes = base64.b64decode(result)
+
+ save_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(save_path, "wb") as f:
+ f.write(image_bytes)
+
+ if pbar:
+ pbar.update(1)
+
+ return str(save_path)
+
+ except Exception as exc:
+ if pbar:
+ pbar.write(f"Error downloading image {url}: {exc}")
+ # raise exc # debug
+ return None
+
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Scrape a Substack site.")
@@ -588,6 +761,17 @@ def parse_args() -> argparse.Namespace:
default=BASE_HTML_DIR,
help=f"The directory to save scraped posts as HTML files. Default: {BASE_HTML_DIR!r}",
)
+ parser.add_argument(
+ "--image-directory", # args.image_directory
+ type=str,
+ default=BASE_IMAGE_DIR,
+ help=f"The directory to save scraped image files. Default: {BASE_IMAGE_DIR!r}",
+ )
+ parser.add_argument(
+ "--no-images", # args.no_images
+ action="store_true",
+ help=f"Do not download images.",
+ )
return parser.parse_args()
From 153746f5b76d664526db9580259d3be6aa4bcd59 Mon Sep 17 00:00:00 2001
From: Milan Hauth
Date: Sun, 28 Dec 2025 18:38:46 +0100
Subject: [PATCH 08/11] download comments
fix https://github.com/timf34/Substack2Markdown/issues/3
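
Substack renders only part of a comment thread as HTML, but the /comments
page embeds the full thread as JSON in a script tag that assigns
window._preloads = JSON.parse("..."). This patch parses that blob, counts the
comments and renders them to HTML for the saved post.

Sketch of the extraction step on a toy page (the real payload is a large
JSON-encoded string, hence the double json.loads):

    import json
    from bs4 import BeautifulSoup

    html = '<script>window._preloads = JSON.parse("{\\"initialComments\\": []}")</script>'
    soup = BeautifulSoup(html, "html.parser")

    for script in soup.select("script"):
        text = script.text.strip()
        if not text.startswith("window._preloads"):
            continue
        inner = text[text.find("(") + 1 : text.rfind(")")]  # argument of JSON.parse
        preloads = json.loads(json.loads(inner))             # -> {"initialComments": []}
        print(preloads)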
---
src/substack2markdown/substack_scraper.py | 129 ++++++++++++++++++++++
1 file changed, 129 insertions(+)
diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py
index e2cc62ae..d2f2ad3a 100644
--- a/src/substack2markdown/substack_scraper.py
+++ b/src/substack2markdown/substack_scraper.py
@@ -363,6 +363,98 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st
return title, subtitle, like_count, date, md_content
+ async def get_window_preloads(self, soup):
+ # all comments are stored in javascript
+ #
+ # only some comments are rendered in html
+ # with buttons to "Expand full comment" and "Load More"
+ # see also
+ # https://www.selfpublife.com/p/automatically-expand-all-substack-comments
+ window_preloads = None
+ for script_element in soup.select("script"):
+ script_text = script_element.text.strip()
+ if not script_text.startswith("window._preloads"):
+ continue
+ # pos1 = re.search(r'window._preloads\s*=\s*JSON\.parse\(', script_text).span()[1]
+ pos1 = script_text.find("(") + 1
+ pos2 = script_text.rfind(")")
+ window_preloads = json.loads(json.loads(script_text[pos1:pos2]))
+ break
+        assert window_preloads, "window._preloads script not found"
+ return window_preloads
+
+ def count_comments(self, comments_preloads):
+
+ def count_comments_inner(comment):
+ res = 1
+ for child_comment in comment["children"]:
+ res += count_comments_inner(child_comment)
+ return res
+
+ res = 0
+ for comment in comments_preloads["initialComments"]:
+ res += count_comments_inner(comment)
+ return res
+
+ def render_comments_html(self, comments_preloads):
+
+ def render_comment_body(body):
+ body = body.strip()
+            body = "<p>" + body + "</p>"
+            body = body.replace("\n", "<br>\n")
+ # TODO more?
+ return body
+
+        def render_comments_html_inner(comment, buf):
+            assert comment["type"] == "comment", f'unexpected comment type: {comment["type"]!r}'
+            buf.write(f'<div class="comment" id="comment-{comment["id"]}">\n')
+            buf.write(render_comment_body(comment["body"]) + '\n')
+            for child_comment in comment["children"]:
+                render_comments_html_inner(child_comment, buf)
+            buf.write('</div>\n')
+
+ buf = io.StringIO()
+ # NOTE the name "initial" is misleading. all comments are stored in this array
+ # NOTE comments are sorted by likes
+ for comment in comments_preloads["initialComments"]:
+ render_comments_html_inner(comment, buf)
+ return buf.getvalue()
@abstractmethod
def get_url_soup(self, url: str) -> str:
@@ -412,6 +504,37 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None:
with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar:
md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar)
+ comments_html = None
+ comments_num = None
+ if not self.args.no_comments:
+ comments_url = url + "/comments"
+ # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test
+ comments_soup = await self.get_url_soup(comments_url)
+ comments_preloads = await self.get_window_preloads(comments_soup)
+ if 0:
+ # debug
+ # TODO add option to write the original "preloads" data to json files
+ with open("comments_preloads.json", "w") as f:
+ json.dump(comments_preloads, f, indent=2)
+ raise 5
+ comments_num = self.count_comments(comments_preloads)
+ if comments_num > 0:
+ comments_html = self.render_comments_html(comments_preloads)
+                        comments_html = (
+                            '\n\n' +
+                            '<div class="comments">\n' +
+                            # this can collide with other elements with id="comments"
+                            # '<h2 id="comments">Comments</h2>\n' +
+                            f'<h2>{comments_num} comments</h2>\n' +
+                            comments_html + '\n' +
+                            '