def get_url_soup(self, url: str, max_attemps: int = 5) -> BeautifulSoup:
    """
    Fetch *url* with the logged-in Selenium driver and return its parsed soup.

    Retries with exponential backoff (plus/minus 20% jitter) when Substack
    answers with its rate-limit page instead of real content.

    Args:
        url: Page to fetch.
        max_attemps: Maximum number of fetch attempts before giving up.
            (Parameter name kept as-is — sic — for backward compatibility
            with existing keyword callers; note the missing "t".)

    Returns:
        BeautifulSoup of the successfully fetched page.

    Raises:
        RuntimeError: if the page is still throttled after all attempts,
            or if fetching/parsing fails for any other reason.
    """
    for attempt in range(1, max_attemps + 1):
        try:
            self.driver.get(url)
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            # Substack's rate-limit response is a bare <pre> directly in <body>.
            pre = soup.select_one("body > pre")
            throttled = pre is not None and "too many requests" in pre.text.lower()
        except Exception as e:
            # Chain the cause (`from e`) so the original traceback is preserved.
            raise RuntimeError(f"Error fetching page: {url}. Error: {e}") from e

        if not throttled:
            return soup

        # Raised OUTSIDE the try block: otherwise the generic handler above
        # would catch this RuntimeError and re-wrap it as a fetch error,
        # hiding the real reason for the failure.
        if attempt == max_attemps:
            raise RuntimeError(f"Max attempts reached for URL: {url}. Too many requests.")

        # Exponential backoff with +/-20% jitter to avoid synchronized retries.
        base = 2 ** attempt
        delay = base + random.uniform(-0.2 * base, 0.2 * base)
        print(f"[{attempt}/{max_attemps}] Too many requests. Retrying in {delay:.2f} seconds...")
        sleep(delay)

    # Unreachable with the current loop logic, but kept as a safety net.
    raise RuntimeError(f"Failed to fetch page after {max_attemps} attempts: {url}")