def get_url_soup(self, url: str, max_attemps: int = 5) -> BeautifulSoup:
    """
    Fetch *url* with the logged-in Selenium driver and return its parsed soup.

    Retries with exponential backoff (plus/minus 20% jitter) when Substack
    answers with its rate-limit page instead of real content.

    Args:
        url: Page to fetch.
        max_attemps: Maximum number of fetch attempts before giving up.
            (Parameter name kept as-is — sic — for backward compatibility
            with existing keyword callers; note the missing "t".)

    Returns:
        BeautifulSoup of the successfully fetched page.

    Raises:
        RuntimeError: if the page is still throttled after all attempts,
            or if fetching/parsing fails for any other reason.
    """
    for attempt in range(1, max_attemps + 1):
        try:
            self.driver.get(url)
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            # Substack's rate-limit response is a bare <pre> directly in <body>.
            pre = soup.select_one("body > pre")
            throttled = pre is not None and "too many requests" in pre.text.lower()
        except Exception as e:
            # Chain the cause (`from e`) so the original traceback is preserved.
            raise RuntimeError(f"Error fetching page: {url}. Error: {e}") from e

        if not throttled:
            return soup

        # Raised OUTSIDE the try block: otherwise the generic handler above
        # would catch this RuntimeError and re-wrap it as a fetch error,
        # hiding the real reason for the failure.
        if attempt == max_attemps:
            raise RuntimeError(f"Max attempts reached for URL: {url}. Too many requests.")

        # Exponential backoff with +/-20% jitter to avoid synchronized retries.
        base = 2 ** attempt
        delay = base + random.uniform(-0.2 * base, 0.2 * base)
        print(f"[{attempt}/{max_attemps}] Too many requests. Retrying in {delay:.2f} seconds...")
        sleep(delay)

    # Unreachable with the current loop logic, but kept as a safety net.
    raise RuntimeError(f"Failed to fetch page after {max_attemps} attempts: {url}")