From c520fb3a8d64a6dde29d6f8d269213f9a14fbd6e Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sat, 27 Dec 2025 18:39:38 +0100 Subject: [PATCH 01/11] add setup.py --- config.py | 2 -- data/README.md | 3 --- src/substack2markdown/__init__.py | 0 .../substack2markdown/assets/author_template.html | 0 .../substack2markdown/assets}/css/essay-styles.css | 0 .../substack2markdown/assets}/css/style.css | 0 .../substack2markdown/assets}/images/screenshot.png | Bin .../substack2markdown/assets}/js/populate-essays.js | 0 .../substack2markdown/substack_scraper.py | 0 substack_html_pages/README.md | 3 --- 10 files changed, 8 deletions(-) delete mode 100644 config.py delete mode 100644 data/README.md create mode 100644 src/substack2markdown/__init__.py rename author_template.html => src/substack2markdown/assets/author_template.html (100%) rename {assets => src/substack2markdown/assets}/css/essay-styles.css (100%) rename {assets => src/substack2markdown/assets}/css/style.css (100%) rename {assets => src/substack2markdown/assets}/images/screenshot.png (100%) rename {assets => src/substack2markdown/assets}/js/populate-essays.js (100%) rename substack_scraper.py => src/substack2markdown/substack_scraper.py (100%) delete mode 100644 substack_html_pages/README.md diff --git a/config.py b/config.py deleted file mode 100644 index 8fc6bff2..00000000 --- a/config.py +++ /dev/null @@ -1,2 +0,0 @@ -EMAIL = "your-email@domain.com" -PASSWORD = "your-password" diff --git a/data/README.md b/data/README.md deleted file mode 100644 index 27476ca6..00000000 --- a/data/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory will be used to store `.json` files for each writer -containing metadata that is used to populate a `.html` file for that -author. \ No newline at end of file diff --git a/src/substack2markdown/__init__.py b/src/substack2markdown/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/author_template.html b/src/substack2markdown/assets/author_template.html similarity index 100% rename from author_template.html rename to src/substack2markdown/assets/author_template.html diff --git a/assets/css/essay-styles.css b/src/substack2markdown/assets/css/essay-styles.css similarity index 100% rename from assets/css/essay-styles.css rename to src/substack2markdown/assets/css/essay-styles.css diff --git a/assets/css/style.css b/src/substack2markdown/assets/css/style.css similarity index 100% rename from assets/css/style.css rename to src/substack2markdown/assets/css/style.css diff --git a/assets/images/screenshot.png b/src/substack2markdown/assets/images/screenshot.png similarity index 100% rename from assets/images/screenshot.png rename to src/substack2markdown/assets/images/screenshot.png diff --git a/assets/js/populate-essays.js b/src/substack2markdown/assets/js/populate-essays.js similarity index 100% rename from assets/js/populate-essays.js rename to src/substack2markdown/assets/js/populate-essays.js diff --git a/substack_scraper.py b/src/substack2markdown/substack_scraper.py similarity index 100% rename from substack_scraper.py rename to src/substack2markdown/substack_scraper.py diff --git a/substack_html_pages/README.md b/substack_html_pages/README.md deleted file mode 100644 index 0931cf8e..00000000 --- a/substack_html_pages/README.md +++ /dev/null @@ -1,3 +0,0 @@ -This directory will be used to store `.html` files for each writer that will enable you -to browse and sort the downloaded markdown files for a given writer. One `.html` file -will be created for each writer. 
\ No newline at end of file From c73855e0a8aa17b916863c895acf8d6e741a2cf3 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sat, 27 Dec 2025 19:05:33 +0100 Subject: [PATCH 02/11] add parameters: config email password --- src/substack2markdown/substack_scraper.py | 42 ++++++++++++++++++----- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 126d260d..734dc553 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -21,7 +21,6 @@ from selenium.common.exceptions import SessionNotCreatedException from selenium.webdriver.chrome.service import Service from urllib.parse import urlparse -from config import EMAIL, PASSWORD USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown @@ -70,9 +69,10 @@ def generate_html_file(author_name: str) -> None: class BaseSubstackScraper(ABC): - def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str): + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): if not base_substack_url.endswith("/"): base_substack_url += "/" + self.args = args self.base_substack_url: str = base_substack_url self.writer_name: str = extract_main_part(base_substack_url) @@ -371,8 +371,8 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: class SubstackScraper(BaseSubstackScraper): - def __init__(self, base_substack_url: str, md_save_dir: str, html_save_dir: str): - super().__init__(base_substack_url, md_save_dir, html_save_dir) + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): + super().__init__(args, base_substack_url, md_save_dir, html_save_dir) def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: """ @@ -392,6 +392,7 @@ def get_url_soup(self, url: str) -> Optional[BeautifulSoup]: class PremiumSubstackScraper(BaseSubstackScraper): def __init__( self, + args, base_substack_url: str, md_save_dir: str, html_save_dir: str, @@ -400,7 +401,7 @@ def __init__( edge_driver_path: str = '', user_agent: str = '' ) -> None: - super().__init__(base_substack_url, md_save_dir, html_save_dir) + super().__init__(args, base_substack_url, md_save_dir, html_save_dir) options = EdgeOptions() if headless: @@ -459,8 +460,8 @@ def login(self) -> None: # Email and password email = self.driver.find_element(By.NAME, "email") password = self.driver.find_element(By.NAME, "password") - email.send_keys(EMAIL) - password.send_keys(PASSWORD) + email.send_keys(self.args.email) + password.send_keys(self.args.password) # Find the submit button and click it. submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button") @@ -494,6 +495,15 @@ def get_url_soup(self, url: str) -> BeautifulSoup: def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Scrape a Substack site.") + parser.add_argument( + "--config", type=str, help="JSON config file with email and password." + ) + parser.add_argument( + "--email", type=str, help="Login E-Mail." + ) + parser.add_argument( + "--password", type=str, help="Login password." + ) parser.add_argument( "-u", "--url", type=str, help="The base URL of the Substack site to scrape." 
) @@ -556,17 +566,29 @@ def main(): if args.html_directory is None: args.html_directory = BASE_HTML_DIR + if args.config: + with open(args.config) as f: + config = json.load(f) + args.email = config["email"] + args.password = config["password"] + # TODO more + + assert args.email + assert args.password + if args.url: if args.premium: scraper = PremiumSubstackScraper( - args.url, + args=args, + base_substack_url=args.url, headless=args.headless, md_save_dir=args.directory, html_save_dir=args.html_directory ) else: scraper = SubstackScraper( - args.url, + args=args, + base_substack_url=args.url, md_save_dir=args.directory, html_save_dir=args.html_directory ) @@ -575,6 +597,7 @@ def main(): else: # Use the hardcoded values at the top of the file if USE_PREMIUM: scraper = PremiumSubstackScraper( + args=args, base_substack_url=BASE_SUBSTACK_URL, md_save_dir=args.directory, html_save_dir=args.html_directory, @@ -583,6 +606,7 @@ def main(): ) else: scraper = SubstackScraper( + args=args, base_substack_url=BASE_SUBSTACK_URL, md_save_dir=args.directory, html_save_dir=args.html_directory From 15fc25c71720c2afba4b3bafceabb91b1ed672bb Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 07:55:10 +0100 Subject: [PATCH 03/11] add parameters: assets-dir author-template --- src/substack2markdown/substack_scraper.py | 73 +++++++++++------------ 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 734dc553..297f0937 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -26,6 +26,7 @@ BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files +ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts @@ -37,12 +38,12 @@ def extract_main_part(url: str) -> str: # present -def generate_html_file(author_name: str) -> None: +def generate_html_file(args, author_name: str) -> None: """ Generates a HTML file for the given author. 
""" - if not os.path.exists(BASE_HTML_DIR): - os.makedirs(BASE_HTML_DIR) + if not os.path.exists(args.html_directory): + os.makedirs(args.html_directory) # Read JSON data json_path = os.path.join(JSON_DATA_DIR, f'{author_name}.json') @@ -52,7 +53,7 @@ def generate_html_file(author_name: str) -> None: # Convert JSON data to a JSON string for embedding embedded_json_data = json.dumps(essays_data, ensure_ascii=False, indent=4) - with open(HTML_TEMPLATE, 'r', encoding='utf-8') as file: + with open(args.author_template, 'r', encoding='utf-8') as file: html_template = file.read() # Insert the JSON string into the script tag in the HTML template @@ -63,7 +64,7 @@ def generate_html_file(author_name: str) -> None: html_with_author = html_with_data.replace('author_name', author_name) # Write the modified HTML to a new file - html_output_path = os.path.join(BASE_HTML_DIR, f'{author_name}.html') + html_output_path = os.path.join(args.html_directory, f'{author_name}.html') with open(html_output_path, 'w', encoding='utf-8') as file: file.write(html_with_author) @@ -193,7 +194,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None: # Calculate the relative path from the HTML file to the CSS file html_dir = os.path.dirname(filepath) - css_path = os.path.relpath("./assets/css/essay-styles.css", html_dir) + css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir) css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths html_content = f""" @@ -367,7 +368,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break self.save_essays_data_to_json(essays_data=essays_data) - generate_html_file(author_name=self.writer_name) + generate_html_file(self.args, author_name=self.writer_name) class SubstackScraper(BaseSubstackScraper): @@ -505,14 +506,22 @@ def parse_args() -> argparse.Namespace: "--password", type=str, help="Login password." ) parser.add_argument( - "-u", "--url", type=str, help="The base URL of the Substack site to scrape." + "-u", + "--url", # args.url + type=str, + default=BASE_SUBSTACK_URL, + help="The base URL of the Substack site to scrape." ) parser.add_argument( - "-d", "--directory", type=str, help="The directory to save scraped posts." + "-d", + "--directory", # args.directory + type=str, + default=BASE_MD_DIR, + help="The directory to save scraped posts." ) parser.add_argument( "-n", - "--number", + "--number", # args.number type=int, default=0, help="The number of posts to scrape. If 0 or not provided, all posts will be scraped.", @@ -523,6 +532,15 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Include -p in command to use the Premium Substack Scraper with selenium.", ) + parser.add_argument( + "--assets-dir", # args.assets_dir + default=ASSETS_DIR, + help=f"Path to assets directory. Default: {ASSETS_DIR!r}", + ) + parser.add_argument( + "--author-template", # args.author_template + help=f"Path to author_template.html. Default: {repr('{assets_dir}/' + HTML_TEMPLATE)}", + ) parser.add_argument( "--headless", action="store_true", @@ -549,9 +567,10 @@ def parse_args() -> argparse.Namespace: "passing captcha in headless mode", ) parser.add_argument( - "--html-directory", + "--html-directory", # args.html_directory type=str, - help="The directory to save scraped posts as HTML files.", + default=BASE_HTML_DIR, + help=f"The directory to save scraped posts as HTML files. 
Default: {BASE_HTML_DIR!r}", ) return parser.parse_args() @@ -560,12 +579,6 @@ def parse_args() -> argparse.Namespace: def main(): args = parse_args() - if args.directory is None: - args.directory = BASE_MD_DIR - - if args.html_directory is None: - args.html_directory = BASE_HTML_DIR - if args.config: with open(args.config) as f: config = json.load(f) @@ -576,7 +589,10 @@ def main(): assert args.email assert args.password - if args.url: + if not args.author_template: + args.author_template = args.assets_dir + "/" + HTML_TEMPLATE + + if True: if args.premium: scraper = PremiumSubstackScraper( args=args, @@ -594,25 +610,6 @@ def main(): ) scraper.scrape_posts(args.number) - else: # Use the hardcoded values at the top of the file - if USE_PREMIUM: - scraper = PremiumSubstackScraper( - args=args, - base_substack_url=BASE_SUBSTACK_URL, - md_save_dir=args.directory, - html_save_dir=args.html_directory, - edge_path=args.edge_path, - edge_driver_path=args.edge_driver_path - ) - else: - scraper = SubstackScraper( - args=args, - base_substack_url=BASE_SUBSTACK_URL, - md_save_dir=args.directory, - html_save_dir=args.html_directory - ) - scraper.scrape_posts(num_posts_to_scrape=NUM_POSTS_TO_SCRAPE) - if __name__ == "__main__": main() From 8d7676f947cfad94cf32eb013ed12f32c2bc90f8 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 10:05:02 +0100 Subject: [PATCH 04/11] use selenium_driverless --- requirements.txt | 3 +- src/substack2markdown/substack_scraper.py | 205 ++++++++++++---------- 2 files changed, 113 insertions(+), 95 deletions(-) diff --git a/requirements.txt b/requirements.txt index c58926a7..af704d1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ bs4==0.0.1 html2text==2020.1.16 requests==2.31.0 -selenium==4.16.0 +selenium-driverless tqdm==4.66.1 -webdriver_manager==4.0.1 Markdown==3.6 diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 297f0937..0f22adc5 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -4,7 +4,9 @@ from abc import ABC, abstractmethod from typing import List, Optional, Tuple from time import sleep - +import asyncio +import atexit +import signal import html2text import markdown @@ -14,12 +16,8 @@ from tqdm import tqdm from xml.etree import ElementTree as ET -from selenium import webdriver -from selenium.webdriver.common.by import By -from webdriver_manager.microsoft import EdgeChromiumDriverManager -from selenium.webdriver.edge.options import Options as EdgeOptions -from selenium.common.exceptions import SessionNotCreatedException -from selenium.webdriver.chrome.service import Service +from selenium_driverless import webdriver +from selenium_driverless.types.by import By from urllib.parse import urlparse USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts @@ -70,6 +68,15 @@ def generate_html_file(args, author_name: str) -> None: class BaseSubstackScraper(ABC): + def __await__(self): + return self._async_init().__await__() + + async def __aenter__(self): + return await self + + async def __aexit__(self, exc_type, exc, tb): + pass + def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): if not base_substack_url.endswith("/"): base_substack_url += "/" @@ -92,6 +99,10 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = 
self.get_all_post_urls() + async def _async_init(self): + self._loop = asyncio.get_running_loop() + return self + def get_all_post_urls(self) -> List[str]: """ Attempts to fetch URLs from sitemap.xml, falling back to feed.xml if necessary. @@ -326,7 +337,7 @@ def save_essays_data_to_json(self, essays_data: list) -> None: with open(json_path, 'w', encoding='utf-8') as f: json.dump(essays_data, f, ensure_ascii=False, indent=4) - def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: + async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: """ Iterates over all posts and saves them as markdown and html files """ @@ -340,8 +351,7 @@ def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: md_filepath = os.path.join(self.md_save_dir, md_filename) html_filepath = os.path.join(self.html_save_dir, html_filename) - if not os.path.exists(md_filepath): - soup = self.get_url_soup(url) + soup = await self.get_url_soup(url) if soup is None: total += 1 continue @@ -398,100 +408,109 @@ def __init__( md_save_dir: str, html_save_dir: str, headless: bool = False, - edge_path: str = '', - edge_driver_path: str = '', + chromium_path: str = '', user_agent: str = '' ) -> None: super().__init__(args, base_substack_url, md_save_dir, html_save_dir) - options = EdgeOptions() + self.driver = None + + def exit_handler(signum, frame): + print() + print(f"exit_handler: received signal {signum}") + try: + asyncio.get_event_loop().create_task(self._cleanup_sync()) + except Exception: + pass + raise SystemExit(0) + + signal.signal(signal.SIGINT, exit_handler) + signal.signal(signal.SIGTERM, exit_handler) + + atexit.register(self._cleanup_sync) + + options = webdriver.ChromeOptions() + self.chrome_options = options if headless: - # modern headless flag (works better with recent Edge/Chromium) + # modern headless flag (works better with recent Chromium) options.add_argument("--headless=new") - if edge_path: - options.binary_location = edge_path + if chromium_path: + options.binary_location = chromium_path if user_agent: options.add_argument(f"user-agent={user_agent}") - - if isinstance(options, EdgeOptions): - os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://msedgedriver.microsoft.com") - elif isinstance(options, ChromeOptions): - os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://chromedriver.storage.googleapis.com") - - self.driver = None + async def _async_init(self): + self._loop = asyncio.get_running_loop() - # 1) Prefer an explicit driver path (manual download) - if edge_driver_path and os.path.exists(edge_driver_path): - service = Service(executable_path=edge_driver_path) - self.driver = webdriver.Edge(service=service, options=options) - else: - # 2) Try webdriver_manager (needs network/DNS) - try: - service = Service(EdgeChromiumDriverManager().install()) - self.driver = webdriver.Edge(service=service, options=options) - except Exception as e: - print("webdriver_manager could not download msedgedriver (network/DNS). 
Falling back to Selenium Manager.") - # 3) Selenium Manager fallback (still needs network; but avoids webdriver_manager) + await self._start_driver() + await self.login() + return self + + async def _start_driver(self): + self.driver = await webdriver.Chrome(options=self.chrome_options) + + async def __aexit__(self, exc_type, exc, tb): + await self.close() + + async def close(self) -> None: + if self.driver: + await self.driver.quit() + + def _cleanup_sync(self): + try: + if not self.driver: + return + proc = self.driver._process + if proc and proc.poll() is None: + proc.terminate() try: - # IMPORTANT: ensure no stale driver in PATH (e.g. C:\Windows\msedgedriver.exe v138) - self.driver = webdriver.Edge(options=options) - except SessionNotCreatedException as se: - raise RuntimeError( - "Selenium Manager fallback failed due to driver/browser mismatch.\n" - "Fix by either: (a) removing stale msedgedriver in PATH (e.g. C:\\Windows\\msedgedriver.exe) and replace with a fresh one downloaded from https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver, " - "or (b) pass --edge-driver-path to a manually downloaded driver that matches your Edge version." - ) from se + proc.wait(timeout=1) + except Exception: + proc.kill() + except Exception as exc: + print("_cleanup_sync failed:", exc) + + async def login(self): + await self.driver.get("https://substack.com/sign-in") + await asyncio.sleep(2) + + signin = await self.driver.find_element( + By.XPATH, "//a[contains(@class,'login-option')]" + ) + await signin.click() - self.login() + await asyncio.sleep(2) - def login(self) -> None: - """ - This method logs into Substack using Selenium - """ - self.driver.get("https://substack.com/sign-in") - sleep(3) + email = await self.driver.find_element(By.NAME, "email") + password = await self.driver.find_element(By.NAME, "password") - signin_with_password = self.driver.find_element( - By.XPATH, "//a[@class='login-option substack-login__login-option']" + await email.send_keys(self.args.email) + await password.send_keys(self.args.password) + + submit = await self.driver.find_element( + By.XPATH, "//*[@id='substack-login']//form//button" ) - signin_with_password.click() - sleep(3) - - # Email and password - email = self.driver.find_element(By.NAME, "email") - password = self.driver.find_element(By.NAME, "password") - email.send_keys(self.args.email) - password.send_keys(self.args.password) - - # Find the submit button and click it. - submit = self.driver.find_element(By.XPATH, "//*[@id=\"substack-login\"]/div[2]/div[2]/form/button") - submit.click() - sleep(30) # Wait for the page to load - - if self.is_login_failed(): - raise Exception( - "Warning: Login unsuccessful. Please check your email and password, or your account status.\n" - "Use the non-premium scraper for the non-paid posts. \n" - "If running headless, run non-headlessly to see if blocked by Captcha." - ) + await submit.click() - def is_login_failed(self) -> bool: + await asyncio.sleep(8) + + if await self.is_login_failed(): + raise RuntimeError("Substack login failed") + + async def is_login_failed(self): """ Check for the presence of the 'error-container' to indicate a failed login attempt. 
""" - error_container = self.driver.find_elements(By.ID, 'error-container') - return len(error_container) > 0 and error_container[0].is_displayed() + elements = await self.driver.find_elements(By.ID, "error-container") + return bool(elements) - def get_url_soup(self, url: str) -> BeautifulSoup: + async def get_url_soup(self, url: str): """ Gets soup from URL using logged in selenium driver """ - try: - self.driver.get(url) - return BeautifulSoup(self.driver.page_source, "html.parser") - except Exception as e: - raise ValueError(f"Error fetching page: {e}") from e + await self.driver.get(url) + html = await self.driver.page_source + return BeautifulSoup(html, "html.parser") def parse_args() -> argparse.Namespace: @@ -548,16 +567,10 @@ def parse_args() -> argparse.Namespace: "Scraper.", ) parser.add_argument( - "--edge-path", + "--chromium-path", # args.chromium_path type=str, default="", - help='Optional: The path to the Edge browser executable (i.e. "path_to_msedge.exe").', - ) - parser.add_argument( - "--edge-driver-path", - type=str, - default="", - help='Optional: The path to the Edge WebDriver executable (i.e. "path_to_msedgedriver.exe").', + help='Optional: The path to the Chromium browser executable (i.e. "path/to/chromium").', ) parser.add_argument( "--user-agent", @@ -576,7 +589,7 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def main(): +async def async_main(): args = parse_args() if args.config: @@ -594,7 +607,7 @@ def main(): if True: if args.premium: - scraper = PremiumSubstackScraper( + scraper = await PremiumSubstackScraper( args=args, base_substack_url=args.url, headless=args.headless, @@ -602,13 +615,19 @@ def main(): html_save_dir=args.html_directory ) else: - scraper = SubstackScraper( + scraper = await SubstackScraper( args=args, base_substack_url=args.url, md_save_dir=args.directory, html_save_dir=args.html_directory ) - scraper.scrape_posts(args.number) + + await scraper.scrape_posts(args.number) + await scraper.close() + + +def main(): + asyncio.run(async_main()) if __name__ == "__main__": From 4af8b45d9a477df427a352b5038ea14d983451ce Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 10:07:52 +0100 Subject: [PATCH 05/11] replace existing files --- src/substack2markdown/substack_scraper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 0f22adc5..91c3f2ac 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -178,7 +178,8 @@ def save_to_file(filepath: str, content: str) -> None: if not isinstance(content, str): raise ValueError("content must be a string") - if os.path.exists(filepath): + # if os.path.exists(filepath): + if False: print(f"File already exists: {filepath}") return @@ -351,6 +352,8 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: md_filepath = os.path.join(self.md_save_dir, md_filename) html_filepath = os.path.join(self.html_save_dir, html_filename) + # if not os.path.exists(md_filepath): + if True: soup = await self.get_url_soup(url) if soup is None: total += 1 From 4b8598af5affd08af4a56e1329f47959f520777c Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 11:15:06 +0100 Subject: [PATCH 06/11] fixup: assets-dir --- src/substack2markdown/substack_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 
91c3f2ac..fc3bdabf 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -206,7 +206,7 @@ def save_to_html_file(self, filepath: str, content: str) -> None: # Calculate the relative path from the HTML file to the CSS file html_dir = os.path.dirname(filepath) - css_path = os.path.relpath(args.assets_dir + "/css/essay-styles.css", html_dir) + css_path = os.path.relpath(self.args.assets_dir + "/css/essay-styles.css", html_dir) css_path = css_path.replace("\\", "/") # Ensure forward slashes for web paths html_content = f""" From 5811bb5de77a0302e619936afaa7fbe67d45b307 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 12:34:02 +0100 Subject: [PATCH 07/11] download images based on https://github.com/timf34/Substack2Markdown/pull/26 --- src/substack2markdown/substack_scraper.py | 186 +++++++++++++++++++++- 1 file changed, 185 insertions(+), 1 deletion(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index fc3bdabf..e2cc62ae 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -1,6 +1,13 @@ import argparse import json import os +import io +import re +import base64 +import hashlib +import mimetypes +from pathlib import Path +from urllib.parse import urlparse, unquote from abc import ABC, abstractmethod from typing import List, Optional, Tuple from time import sleep @@ -18,18 +25,54 @@ from selenium_driverless import webdriver from selenium_driverless.types.by import By -from urllib.parse import urlparse USE_PREMIUM: bool = True # Set to True if you want to login to Substack and convert paid for posts BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/" # Substack you want to convert to markdown BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files +BASE_IMAGE_DIR: str = "substack_images" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" NUM_POSTS_TO_SCRAPE: int = 3 # Set to 0 if you want all posts +def count_images_in_markdown(md_content: str) -> int: + """Count number of Substack CDN image URLs in markdown content.""" + # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) + # regex lookahead: match "...)" but not "...)]" suffix + pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)(?=[^\]]|$)') + matches = re.findall(pattern, md_content) + return len(matches) + + +def sanitize_image_filename(url: str) -> str: + """Create a safe filename from URL or content.""" + # Extract original filename from CDN URL + if "substackcdn.com" in url: + # Get the actual image URL after the CDN parameters + original_url = unquote(url.split("/https%3A%2F%2F")[1]) + filename = original_url.split("/")[-1] + else: + filename = url.split("/")[-1] + + # Remove invalid characters + filename = re.sub(r'[<>:"/\\|?*]', '', filename) + + # If filename is too long or empty, create hash-based name + if len(filename) > 100 or not filename: + hash_object = hashlib.md5(url.encode()) + ext = mimetypes.guess_extension(requests.head(url).headers.get('content-type', '')) or '.jpg' + filename = f"{hash_object.hexdigest()}{ext}" + + return filename + + +def get_post_slug(url: str) -> str: + match = re.search(r'/p/([^/]+)', url) + return 
match.group(1) if match else 'unknown_post' + + def extract_main_part(url: str) -> str: parts = urlparse(url).netloc.split('.') # Parse the URL to get the netloc, and split on '.' return parts[1] if parts[0] == 'www' else parts[0] # Return the main part of the domain, while ignoring 'www' if @@ -96,6 +139,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir os.makedirs(self.html_save_dir) print(f"Created html directory {self.html_save_dir}") + if not self.args.no_images: + os.makedirs(self.args.image_directory, exist_ok=True) + self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -359,6 +405,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: total += 1 continue title, subtitle, like_count, date, md = self.extract_post_data(soup) + + if not self.args.no_images: + total_images = count_images_in_markdown(md) + post_slug = get_post_slug(url) + with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: + md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + self.save_to_file(md_filepath, md) # Convert markdown to HTML and save @@ -383,6 +436,56 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_essays_data_to_json(essays_data=essays_data) generate_html_file(self.args, author_name=self.writer_name) + async def download_image( + self, + url: str, + save_path: Path, + pbar: Optional[tqdm] = None + ) -> Optional[str]: + """Download image from URL and save to path.""" + try: + response = requests.get(url, stream=True) + if response.status_code == 200: + save_path.parent.mkdir(parents=True, exist_ok=True) + with open(save_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + if pbar: + pbar.update(1) + return str(save_path) + except Exception as exc: + if pbar: + pbar.write(f"Error downloading image {url}: {str(exc)}") + # raise exc # debug + return None + + async def process_markdown_images( + self, + md_content: str, + author: str, + post_slug: str, + pbar=None + ) -> str: + """Process markdown content to download images and update references.""" + image_dir = Path(self.args.image_directory) / author / post_slug + # [![](https://substackcdn.com/image/fetch/x.png)](https://substackcdn.com/image/fetch/x.png) + pattern = re.compile(r'\(https://substackcdn\.com/image/fetch/[^\s\)]+\)') + buf = io.StringIO() + last_end = 0 + for match in pattern.finditer(md_content): + buf.write(md_content[last_end:match.start()]) + url = match.group(0).strip("()") + filename = sanitize_image_filename(url) + save_path = image_dir / filename + if not save_path.exists(): + await self.download_image(url, save_path, pbar) + rel_path = os.path.relpath(save_path, Path(self.args.directory) / author) + buf.write(f"({rel_path})") + last_end = match.end() + buf.write(md_content[last_end:]) + return buf.getvalue() + class SubstackScraper(BaseSubstackScraper): def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir: str): @@ -515,6 +618,76 @@ async def get_url_soup(self, url: str): html = await self.driver.page_source return BeautifulSoup(html, "html.parser") + async def download_image_FIXME( + self, + url: str, + save_path: Path, + pbar: Optional[tqdm] = None + ) -> Optional[str]: + """Download image using selenium_driverless""" + + # NOTE for now this works with the default "def download_image" + + # WONTFIX "fetch" fails due to CORS policy 
+ + # WONTFIX "canvas" does not return the original image bytes + + # we could fetch images with CDP Network.getResponseBody + # but that requires lots of boilerplate code + # fix: use https://github.com/milahu/aiohttp_chromium + + try: + # Execute JS fetch inside browser + result = await self.driver.execute_async_script( + """ + const url = arguments[0]; + const callback = arguments[arguments.length - 1]; + + const img = new Image(); + img.crossOrigin = 'Anonymous'; // try to avoid CORS issues + img.onload = () => { + try { + const canvas = document.createElement('canvas'); + canvas.width = img.width; + canvas.height = img.height; + const ctx = canvas.getContext('2d'); + ctx.drawImage(img, 0, 0); + const dataUrl = canvas.toDataURL('image/png'); // returns "data:image/png;base64,..." + const base64 = dataUrl.split(',')[1]; // strip prefix + callback({data: base64}); + } catch (err) { + callback({error: err.message, stack: err.stack}); + } + }; + img.onerror = (err) => { + callback({error: 'Image load error', stack: err.toString()}); + }; + img.src = url; + """, + url + ) + + if isinstance(result, dict) and "error" in result: + raise RuntimeError(f"{result['error']}\nJS stack:\n{result['stack']}") + + # Decode base64 to bytes + image_bytes = base64.b64decode(result) + + save_path.parent.mkdir(parents=True, exist_ok=True) + with open(save_path, "wb") as f: + f.write(image_bytes) + + if pbar: + pbar.update(1) + + return str(save_path) + + except Exception as exc: + if pbar: + pbar.write(f"Error downloading image {url}: {exc}") + # raise exc # debug + return None + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Scrape a Substack site.") @@ -588,6 +761,17 @@ def parse_args() -> argparse.Namespace: default=BASE_HTML_DIR, help=f"The directory to save scraped posts as HTML files. Default: {BASE_HTML_DIR!r}", ) + parser.add_argument( + "--image-directory", # args.image_directory + type=str, + default=BASE_IMAGE_DIR, + help=f"The directory to save scraped image files. 
Default: {BASE_IMAGE_DIR!r}", + ) + parser.add_argument( + "--no-images", # args.no_images + action="store_true", + help=f"Do not download images.", + ) return parser.parse_args() From 153746f5b76d664526db9580259d3be6aa4bcd59 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 18:38:46 +0100 Subject: [PATCH 08/11] download comments fix https://github.com/timf34/Substack2Markdown/issues/3 --- src/substack2markdown/substack_scraper.py | 129 ++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index e2cc62ae..d2f2ad3a 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -363,6 +363,98 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st return title, subtitle, like_count, date, md_content + async def get_window_preloads(self, soup): + # all comments are stored in javascript + # + # only some comments are rendered in html + # with buttons to "Expand full comment" and "Load More" + # see also + # https://www.selfpublife.com/p/automatically-expand-all-substack-comments + window_preloads = None + for script_element in soup.select("script"): + script_text = script_element.text.strip() + if not script_text.startswith("window._preloads"): + continue + # pos1 = re.search(r'window._preloads\s*=\s*JSON\.parse\(', script_text).span()[1] + pos1 = script_text.find("(") + 1 + pos2 = script_text.rfind(")") + window_preloads = json.loads(json.loads(script_text[pos1:pos2])) + break + assert window_preloads, f"not found at {url!r}" + return window_preloads + + def count_comments(self, comments_preloads): + + def count_comments_inner(comment): + res = 1 + for child_comment in comment["children"]: + res += count_comments_inner(child_comment) + return res + + res = 0 + for comment in comments_preloads["initialComments"]: + res += count_comments_inner(comment) + return res + + def render_comments_html(self, comments_preloads): + + def render_comment_body(body): + body = body.strip() + body = "

" + body + "

" + body = body.replace("\n", "

\n

") + # TODO more? + return body + + def render_comments_html_inner(comment, buf): + assert comment["type"] == "comment", f'unexpected comment type: {comment["type"]!r}' + buf.write(f'

\n') + buf.write(f'\n') + + # NOTE user IDs are constant, user handles are variable + # when i change my user handle + # then other users can use my old user handle + buf.write(f'') + buf.write(comment["name"]) # human-readable username + buf.write('\n') + + other_pub = comment["metadata"].get("author_on_other_pub") + if other_pub: + # NOTE publication handles are quasi-constant: + # when i change my publication handle + # then other users cannot use my old publication handle + # NOTE "Changing your publication's subdomain + # does not automatically set up a redirect from the old subdomain to the new one." + buf.write(f'(') + buf.write(other_pub["name"]) + buf.write(')\n') + + buf.write(comment["date"] + '\n') # "2025-05-17T06:51:39.485Z" + + for reaction, reaction_count in comment["reactions"].items(): + if reaction_count == 0: continue + buf.write(reaction + str(reaction_count) + '\n') # "❤123" + # buf.write(str(reaction_count) + reaction + '\n') # "123❤" + + buf.write('\n') + + buf.write('
\n') + buf.write('\n') + buf.write(render_comment_body(comment["body"]) + '\n') + + for child_comment in comment["children"]: + buf.write('\n') + render_comments_html_inner(child_comment, buf) + buf.write('
\n') + + buf.write('
\n') + buf.write('\n') + + buf = io.StringIO() + # NOTE the name "initial" is misleading. all comments are stored in this array + # NOTE comments are sorted by likes + for comment in comments_preloads["initialComments"]: + render_comments_html_inner(comment, buf) + return buf.getvalue() @abstractmethod def get_url_soup(self, url: str) -> str: @@ -412,6 +504,37 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: with tqdm(total=total_images, desc=f"Downloading images for {post_slug}", leave=False) as img_pbar: md = await self.process_markdown_images(md, self.writer_name, post_slug, img_pbar) + comments_html = None + comments_num = None + if not self.args.no_comments: + comments_url = url + "/comments" + # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test + comments_soup = await self.get_url_soup(comments_url) + comments_preloads = await self.get_window_preloads(comments_soup) + if 0: + # debug + # TODO add option to write the original "preloads" data to json files + with open("comments_preloads.json", "w") as f: + json.dump(comments_preloads, f, indent=2) + raise 5 + comments_num = self.count_comments(comments_preloads) + if comments_num > 0: + comments_html = self.render_comments_html(comments_preloads) + comments_html = ( + '\n\n' + + '
\n' + + # this can collide with other elements with id="comments" + # '
\n' + + '
\n' + + '

Comments

\n' + + '
\n' + + f'{comments_num} comments\n' + + comments_html + '\n' + + '
' + '
' + ) + md += comments_html + self.save_to_file(md_filepath, md) # Convert markdown to HTML and save @@ -422,6 +545,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: "title": title, "subtitle": subtitle, "like_count": like_count, + "comment_count": comments_num, "date": date, "file_link": md_filepath, "html_link": html_filepath @@ -772,6 +896,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help=f"Do not download images.", ) + parser.add_argument( + "--no-comments", # args.no_comments + action="store_true", + help=f"Do not download comments.", + ) return parser.parse_args() From 591fa86b49e6f38d802c741e3d8b8f10365c2cf8 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 22:37:11 +0100 Subject: [PATCH 09/11] handle removed comments --- src/substack2markdown/substack_scraper.py | 32 ++++++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index d2f2ad3a..27890458 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -413,9 +413,19 @@ def render_comments_html_inner(comment, buf): # NOTE user IDs are constant, user handles are variable # when i change my user handle # then other users can use my old user handle - buf.write(f'') - buf.write(comment["name"]) # human-readable username - buf.write('\n') + if not comment["user_id"] is None: + buf.write(f'') + + if not comment["name"] is None: + buf.write(comment["name"]) # human-readable username + else: + # Comment removed + buf.write("null") + + if not comment["user_id"] is None: + buf.write('\n') + else: + buf.write('\n') other_pub = comment["metadata"].get("author_on_other_pub") if other_pub: @@ -439,7 +449,21 @@ def render_comments_html_inner(comment, buf): buf.write('
\n') buf.write('\n') - buf.write(render_comment_body(comment["body"]) + '\n') + + if comment["body"] is None: + # Comment removed + status = comment.get("status") + if status is None: + buf.write('(Comment removed)\n') + else: + # "moderator_removed", ... + buf.write('(status:' + status + ')\n') + # TODO comment["bans"] + # TODO comment["suppressed"] + # TODO comment["user_banned"] + # TODO comment["user_banned_for_comment"] + else: + buf.write(render_comment_body(comment["body"]) + '\n') for child_comment in comment["children"]: buf.write('\n') From 1458d78ea4828be288c73abdf5462641047813b2 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 22:37:48 +0100 Subject: [PATCH 10/11] add debug comment --- src/substack2markdown/substack_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 27890458..2a1c2727 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -578,6 +578,7 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: print(f"File already exists: {md_filepath}") except Exception as e: print(f"Error scraping post: {e}") + # raise e # debug count += 1 if num_posts_to_scrape != 0 and count == num_posts_to_scrape: break From 07e4c1d1581d81d63ff6ac7624da03fa513047d7 Mon Sep 17 00:00:00 2001 From: Milan Hauth Date: Sun, 28 Dec 2025 23:26:21 +0100 Subject: [PATCH 11/11] write JSON files --- src/substack2markdown/substack_scraper.py | 35 +++++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/substack2markdown/substack_scraper.py b/src/substack2markdown/substack_scraper.py index 2a1c2727..d31ca8dd 100644 --- a/src/substack2markdown/substack_scraper.py +++ b/src/substack2markdown/substack_scraper.py @@ -31,6 +31,7 @@ BASE_MD_DIR: str = "substack_md_files" # Name of the directory we'll save the .md essay files BASE_HTML_DIR: str = "substack_html_pages" # Name of the directory we'll save the .html essay files BASE_IMAGE_DIR: str = "substack_images" +BASE_JSON_DIR: str = "substack_json" ASSETS_DIR: str = os.path.dirname(__file__) + "/assets" HTML_TEMPLATE: str = "author_template.html" # HTML template to use for the author page JSON_DATA_DIR: str = "data" @@ -132,6 +133,8 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir self.md_save_dir: str = md_save_dir self.html_save_dir: str = f"{html_save_dir}/{self.writer_name}" + self.args.json_directory += f"/{self.writer_name}" + if not os.path.exists(md_save_dir): os.makedirs(md_save_dir) print(f"Created md directory {md_save_dir}") @@ -142,6 +145,9 @@ def __init__(self, args, base_substack_url: str, md_save_dir: str, html_save_dir if not self.args.no_images: os.makedirs(self.args.image_directory, exist_ok=True) + if not self.args.no_json: + os.makedirs(self.args.json_directory, exist_ok=True) + self.keywords: List[str] = ["about", "archive", "podcast"] self.post_urls: List[str] = self.get_all_post_urls() @@ -535,12 +541,11 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: # comments_url = "https://willstorr.substack.com/p/scamming-substack/comments" # test comments_soup = await self.get_url_soup(comments_url) comments_preloads = await self.get_window_preloads(comments_soup) - if 0: - # debug - # TODO add option to write the original "preloads" data to json files - with open("comments_preloads.json", "w") as f: - json.dump(comments_preloads, f, indent=2) - raise 5 + if not 
self.args.no_json: + json_filename = self.get_filename_from_url(url, filetype=".comments.json") + json_filepath = os.path.join(self.args.json_directory, json_filename) + _json = json.dumps(comments_preloads, ensure_ascii=False, separators=(',', ':')) + self.save_to_file(json_filepath, _json) comments_num = self.count_comments(comments_preloads) if comments_num > 0: comments_html = self.render_comments_html(comments_preloads) @@ -561,6 +566,13 @@ async def scrape_posts(self, num_posts_to_scrape: int = 0) -> None: self.save_to_file(md_filepath, md) + if not self.args.no_json: + post_preloads = await self.get_window_preloads(soup) + json_filename = self.get_filename_from_url(url, filetype=".post.json") + json_filepath = os.path.join(self.args.json_directory, json_filename) + _json = json.dumps(post_preloads, ensure_ascii=False, separators=(',', ':')) + self.save_to_file(json_filepath, _json) + # Convert markdown to HTML and save html_content = self.md_to_html(md) self.save_to_html_file(html_filepath, html_content) @@ -916,6 +928,12 @@ def parse_args() -> argparse.Namespace: default=BASE_IMAGE_DIR, help=f"The directory to save scraped image files. Default: {BASE_IMAGE_DIR!r}", ) + parser.add_argument( + "--json-directory", # args.json_directory + type=str, + default=BASE_JSON_DIR, + help=f"The directory to save scraped JSON files. Default: {BASE_JSON_DIR!r}", + ) parser.add_argument( "--no-images", # args.no_images action="store_true", @@ -926,6 +944,11 @@ def parse_args() -> argparse.Namespace: action="store_true", help=f"Do not download comments.", ) + parser.add_argument( + "--no-json", # args.no_json + action="store_true", + help=f"Do not write JSON files.", + ) return parser.parse_args()
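
For reference, a minimal sketch of how the command-line options introduced in this series might be used together. The config file name, the substack URL, and the output paths below are illustrative placeholders (not values taken from the patches), and the package is assumed to be installed so that the module can be run with "python -m".

    # Illustrative sketch only; all paths, URLs, and credentials are placeholders.
    import json
    import subprocess

    # --config (patch 02) expects a JSON file with "email" and "password" keys.
    config = {"email": "your-email@domain.com", "password": "your-password"}
    with open("substack-login.json", "w") as f:
        json.dump(config, f)

    # Flags as defined in parse_args(); --premium selects the
    # selenium_driverless based PremiumSubstackScraper (patch 04).
    subprocess.run([
        "python", "-m", "substack2markdown.substack_scraper",
        "--config", "substack-login.json",
        "--url", "https://example.substack.com/",
        "--premium",
        "--headless",
        "--number", "3",                            # scrape only the first 3 posts
        "--directory", "substack_md_files",         # markdown output
        "--html-directory", "substack_html_pages",  # per-post HTML output
        "--image-directory", "substack_images",     # downloaded images (patch 07)
        "--json-directory", "substack_json",        # raw window._preloads JSON (patch 11)
    ])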