From 1c5fd86b978f689aca9c31b4d7a0a4e27575ae75 Mon Sep 17 00:00:00 2001 From: John Burbridge Date: Tue, 18 Mar 2025 17:28:18 -0700 Subject: [PATCH 1/7] Update repository references from johnburbridge/scraper to spiralhouse/scraper --- README.md | 6 +++--- docs/develop.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f36315b..924d964 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Scraper -[![Python Tests](https://github.com/johnburbridge/scraper/actions/workflows/python-package.yml/badge.svg)](https://github.com/johnburbridge/scraper/actions/workflows/python-package.yml) -[![Coverage](https://codecov.io/gh/johnburbridge/scraper/branch/main/graph/badge.svg)](https://codecov.io/gh/johnburbridge/scraper) +[![Python Tests](https://github.com/spiralhouse/scraper/actions/workflows/python-package.yml/badge.svg)](https://github.com/spiralhouse/scraper/actions/workflows/python-package.yml) +[![Coverage](https://codecov.io/gh/spiralhouse/scraper/branch/main/graph/badge.svg)](https://codecov.io/gh/spiralhouse/scraper) A flexible web crawler that recursively crawls websites, respects robots.txt, and provides various output options. @@ -15,7 +15,7 @@ A flexible web crawler that recursively crawls websites, respects robots.txt, an 1. Clone the repository: ```bash -git clone https://github.com/johnburbridge/scraper.git +git clone https://github.com/spiralhouse/scraper.git cd scraper ``` diff --git a/docs/develop.md b/docs/develop.md index 842eed5..6ef135b 100644 --- a/docs/develop.md +++ b/docs/develop.md @@ -16,7 +16,7 @@ This guide provides instructions for setting up a development environment, runni 1. 
Clone the repository: ```bash -git clone https://github.com/johnburbridge/scraper.git +git clone https://github.com/spiralhouse/scraper.git cd scraper ``` From 2bc814731ebcd4881ac703570ddb248163982085 Mon Sep 17 00:00:00 2001 From: John Burbridge Date: Wed, 19 Mar 2025 12:36:10 -0700 Subject: [PATCH 2/7] ci: fix CODECOV configuration --- .github/workflows/python-package.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4c647db..7eaed3b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -37,10 +37,11 @@ jobs: - name: Test with pytest and coverage run: | pytest --cov=scraper --cov-report=xml --cov-report=term-missing --cov-fail-under=70 - - name: Upload coverage report to Codecov + - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v5 with: + token: ${{ secrets.CODECOV_TOKEN }} + slug: spiralhouse/scraper file: ./coverage.xml fail_ci_if_error: true - token: ${{ secrets.CODECOV_TOKEN }} verbose: true From 48ed4cc1f3f318570f7c629239f0763f36145737 Mon Sep 17 00:00:00 2001 From: John Burbridge Date: Wed, 19 Mar 2025 12:40:59 -0700 Subject: [PATCH 3/7] ci: added python 3.9 and 3.10 to tests --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4c647db..539f382 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 From 25d27d33a42ae260b9406d09036225c4876c789c Mon Sep 17 00:00:00 2001 From: John Burbridge Date: Wed, 19 Mar 2025 12:48:59 -0700 Subject: [PATCH 4/7] fix(compat): fix Python compatibility issues for 3.9 and 3.10 - Update ipython dependency to use 
compatible versions based on Python version - Update GitHub Actions workflow to properly handle conditional requirements - Add compatibility checking script --- .github/workflows/python-package.yml | 4 +- requirements.txt | 3 +- scripts/check_py_compat.py | 68 ++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 3 deletions(-) create mode 100755 scripts/check_py_compat.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 539f382..1108f21 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -27,7 +27,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install -r requirements-dev.txt - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + python -m pip install -r requirements.txt - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -38,7 +38,7 @@ jobs: run: | pytest --cov=scraper --cov-report=xml --cov-report=term-missing --cov-fail-under=70 - name: Upload coverage report to Codecov - uses: codecov/codecov-action@v5 + uses: codecov/codecov-action@v3 with: file: ./coverage.xml fail_ci_if_error: true diff --git a/requirements.txt b/requirements.txt index 370412f..639a1fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,8 @@ httpcore==1.0.7 httpx==0.28.1 idna==3.10 ipykernel==6.29.5 -ipython==9.0.2 +ipython>=7.34.0; python_version < '3.11' +ipython==9.0.2; python_version >= '3.11' ipython_pygments_lexers==1.1.1 isoduration==20.11.0 jedi==0.19.2 diff --git a/scripts/check_py_compat.py b/scripts/check_py_compat.py new file mode 100755 index 0000000..e16f1c9 --- /dev/null +++ b/scripts/check_py_compat.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +""" +Script to verify Python version compatibility with project requirements. +This helps identify packages that might not be compatible with specific Python versions. 
+""" + +import sys +import subprocess +import tempfile +import os +import platform +from pathlib import Path + +def check_requirements(requirements_file): + """Test if all packages in the requirements file can be installed""" + print(f"Checking compatibility of {requirements_file} with Python {platform.python_version()}") + + with tempfile.TemporaryDirectory() as tmpdir: + # Create a virtual environment in the temp directory + venv_dir = os.path.join(tmpdir, "venv") + subprocess.run([sys.executable, "-m", "venv", venv_dir], check=True) + + # Determine pip path + if sys.platform.startswith('win'): + pip_path = os.path.join(venv_dir, "Scripts", "pip") + else: + pip_path = os.path.join(venv_dir, "bin", "pip") + + # Upgrade pip + subprocess.run([pip_path, "install", "--upgrade", "pip"], check=True) + + # Test installing the requirements + try: + subprocess.run( + [pip_path, "install", "-r", requirements_file], + check=True, + capture_output=True, + text=True + ) + print(f"✅ All packages in {requirements_file} are compatible with Python {platform.python_version()}") + return True + except subprocess.CalledProcessError as e: + print(f"❌ Some packages in {requirements_file} are NOT compatible with Python {platform.python_version()}") + print("Error details:") + print(e.stdout) + print(e.stderr) + return False + +def main(): + """Main function""" + proj_root = Path(__file__).parent.parent + + # Check both requirements files + req_files = [ + proj_root / "requirements.txt", + proj_root / "requirements-dev.txt" + ] + + success = True + for req_file in req_files: + if req_file.exists(): + if not check_requirements(req_file): + success = False + + return 0 if success else 1 + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file From 0bd9d4c92316cd810cb017a4272791ebc952fdc8 Mon Sep 17 00:00:00 2001 From: John Burbridge Date: Wed, 19 Mar 2025 12:58:56 -0700 Subject: [PATCH 5/7] fix(compat): resolve asyncio event loop issues in Python 3.9 tests Fixes asyncio 
event loop handling in test_crawler.py to ensure compatibility with Python 3.9 --- tests/test_crawler.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/tests/test_crawler.py b/tests/test_crawler.py index 6f0eb70..4f12de3 100644 --- a/tests/test_crawler.py +++ b/tests/test_crawler.py @@ -11,13 +11,22 @@ def async_run(coro): """Helper function to run coroutines in tests with a fresh event loop.""" - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) + try: + # Try to get an existing event loop + loop = asyncio.get_event_loop() + if loop.is_closed(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + except RuntimeError: + # Create a new event loop if there isn't one + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: return loop.run_until_complete(coro) finally: - loop.close() - asyncio.set_event_loop(None) + # Clean up but don't close the loop as it might be reused + pass class TestCrawler(unittest.TestCase): @@ -25,6 +34,12 @@ class TestCrawler(unittest.TestCase): def setUp(self): """Set up test fixtures.""" + # Create and set an event loop for Python 3.9 compatibility + try: + asyncio.get_event_loop() + except RuntimeError: + asyncio.set_event_loop(asyncio.new_event_loop()) + self.crawler = Crawler( max_depth=2, concurrency_limit=5, @@ -35,6 +50,20 @@ def setUp(self): def tearDown(self): """Clean up after tests.""" self.crawler.close() + # Reset the event loop for next test + try: + # Get the current event loop + loop = asyncio.get_event_loop() + # If the loop is running, stop it + if loop.is_running(): + loop.stop() + # Close it + loop.close() + except RuntimeError: + pass # No event loop exists + finally: + # Reset to None to clean up + asyncio.set_event_loop(None) def test_is_allowed_domain_same_domain(self): """Test that same domain is always allowed.""" From 84e7ed6791f9087e3eb6401fab5045083a92e86a Mon Sep 17 00:00:00 2001 From: John Burbridge Date: Wed, 
19 Mar 2025 13:03:17 -0700 Subject: [PATCH 6/7] docs: add Python compatibility information to README Add requirements section with supported Python versions (3.9-3.12) --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 924d964..d304ea5 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,12 @@ source venv/bin/activate # On Windows: venv\Scripts\activate pip install -r requirements.txt ``` +## Requirements + +- **Python**: Compatible with Python 3.9, 3.10, 3.11, and 3.12 +- All dependencies are listed in the `requirements.txt` file and are automatically installed during the installation process. +- Some optional dependencies are available for development in `requirements-dev.txt`. + ## Basic Usage To start crawling a website: From c0dfbdd725c6c2e3cd01d0b35297dada9dd06177 Mon Sep 17 00:00:00 2001 From: John Burbridge Date: Wed, 19 Mar 2025 13:10:53 -0700 Subject: [PATCH 7/7] ci: added codecov/test-results-action@v1 --- .github/workflows/python-package.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a615099..02b0da5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -36,12 +36,16 @@ jobs: flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest and coverage run: | - pytest --cov=scraper --cov-report=xml --cov-report=term-missing --cov-fail-under=70 - - name: Upload coverage reports to Codecov + pytest --cov --junitxml=junit.xml --cov-report=term-missing --cov-fail-under=70 + - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - slug: spiralhouse/scraper - file: ./coverage.xml fail_ci_if_error: true verbose: true + - name: Upload test results to Codecov + if: ${{ !cancelled() }} + uses: codecov/test-results-action@v1 + with: + token: ${{ secrets.CODECOV_TOKEN }} +