diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 4c647db..02b0da5 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
 
     steps:
     - uses: actions/checkout@v4
@@ -27,7 +27,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         python -m pip install -r requirements-dev.txt
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        python -m pip install -r requirements.txt
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
@@ -36,11 +36,16 @@
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest and coverage
       run: |
-        pytest --cov=scraper --cov-report=xml --cov-report=term-missing --cov-fail-under=70
-    - name: Upload coverage report to Codecov
+        pytest --cov --junitxml=junit.xml --cov-report=term-missing --cov-fail-under=70
+    - name: Upload coverage to Codecov
       uses: codecov/codecov-action@v5
       with:
-        file: ./coverage.xml
-        fail_ci_if_error: true
         token: ${{ secrets.CODECOV_TOKEN }}
+        fail_ci_if_error: true
         verbose: true
+    - name: Upload test results to Codecov
+      if: ${{ !cancelled() }}
+      uses: codecov/test-results-action@v1
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+
diff --git a/README.md b/README.md
index f36315b..d304ea5 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Scraper
 
-[![Python Tests](https://github.com/johnburbridge/scraper/actions/workflows/python-package.yml/badge.svg)](https://github.com/johnburbridge/scraper/actions/workflows/python-package.yml)
-[![Coverage](https://codecov.io/gh/johnburbridge/scraper/branch/main/graph/badge.svg)](https://codecov.io/gh/johnburbridge/scraper)
+[![Python Tests](https://github.com/spiralhouse/scraper/actions/workflows/python-package.yml/badge.svg)](https://github.com/spiralhouse/scraper/actions/workflows/python-package.yml)
+[![Coverage](https://codecov.io/gh/spiralhouse/scraper/branch/main/graph/badge.svg)](https://codecov.io/gh/spiralhouse/scraper)
 
 A flexible web crawler that recursively crawls websites, respects robots.txt, and provides various output options.
 
@@ -15,7 +15,7 @@ A flexible web crawler that recursively crawls websites, respects robots.txt, an
 1. Clone the repository:
 
 ```bash
-git clone https://github.com/johnburbridge/scraper.git
+git clone https://github.com/spiralhouse/scraper.git
 cd scraper
 ```
 
@@ -30,6 +30,12 @@ source venv/bin/activate  # On Windows: venv\Scripts\activate
 pip install -r requirements.txt
 ```
 
+## Requirements
+
+- **Python**: Compatible with Python 3.9, 3.10, 3.11, and 3.12
+- All runtime dependencies are listed in `requirements.txt` and are installed by the `pip install` step above.
+- Development dependencies are listed separately in `requirements-dev.txt`.
+
 ## Basic Usage
 
 To start crawling a website:
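
Reviewer note: the range documented in the new README `Requirements` section (3.9-3.12) now matches the CI matrix above. The snippet below is a minimal sketch, not part of this patch, of how a script could guard against untested interpreters; the `MIN_SUPPORTED`/`MAX_SUPPORTED` names are hypothetical.

```python
# Hypothetical version guard mirroring the README's supported range (3.9-3.12).
# Not part of this patch; tuple comparison keeps the check simple and correct.
import sys

MIN_SUPPORTED = (3, 9)
MAX_SUPPORTED = (3, 12)

if not (MIN_SUPPORTED <= sys.version_info[:2] <= MAX_SUPPORTED):
    raise SystemExit(
        f"Python {sys.version_info[0]}.{sys.version_info[1]} is outside the "
        "tested 3.9-3.12 range; the crawler may work, but it is unverified."
    )
```
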
diff --git a/docs/develop.md b/docs/develop.md
index 842eed5..6ef135b 100644
--- a/docs/develop.md
+++ b/docs/develop.md
@@ -16,7 +16,7 @@ This guide provides instructions for setting up a development environment, runni
 1. Clone the repository:
 
 ```bash
-git clone https://github.com/johnburbridge/scraper.git
+git clone https://github.com/spiralhouse/scraper.git
 cd scraper
 ```
 
diff --git a/requirements.txt b/requirements.txt
index 370412f..639a1fd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,7 +24,8 @@ httpcore==1.0.7
 httpx==0.28.1
 idna==3.10
 ipykernel==6.29.5
-ipython==9.0.2
+ipython>=7.34.0; python_version < '3.11'
+ipython==9.0.2; python_version >= '3.11'
 ipython_pygments_lexers==1.1.1
 isoduration==20.11.0
 jedi==0.19.2
diff --git a/scripts/check_py_compat.py b/scripts/check_py_compat.py
new file mode 100755
index 0000000..e16f1c9
--- /dev/null
+++ b/scripts/check_py_compat.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""
+Script to verify Python version compatibility with project requirements.
+This helps identify packages that might not be compatible with specific Python versions.
+"""
+
+import sys
+import subprocess
+import tempfile
+import os
+import platform
+from pathlib import Path
+
+def check_requirements(requirements_file):
+    """Test if all packages in the requirements file can be installed"""
+    print(f"Checking compatibility of {requirements_file} with Python {platform.python_version()}")
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        # Create a virtual environment in the temp directory
+        venv_dir = os.path.join(tmpdir, "venv")
+        subprocess.run([sys.executable, "-m", "venv", venv_dir], check=True)
+
+        # Determine pip path
+        if sys.platform.startswith('win'):
+            pip_path = os.path.join(venv_dir, "Scripts", "pip")
+        else:
+            pip_path = os.path.join(venv_dir, "bin", "pip")
+
+        # Upgrade pip
+        subprocess.run([pip_path, "install", "--upgrade", "pip"], check=True)
+
+        # Test installing the requirements
+        try:
+            subprocess.run(
+                [pip_path, "install", "-r", requirements_file],
+                check=True,
+                capture_output=True,
+                text=True
+            )
+            print(f"✅ All packages in {requirements_file} are compatible with Python {platform.python_version()}")
+            return True
+        except subprocess.CalledProcessError as e:
+            print(f"❌ Some packages in {requirements_file} are NOT compatible with Python {platform.python_version()}")
+            print("Error details:")
+            print(e.stdout)
+            print(e.stderr)
+            return False
+
+def main():
+    """Main function"""
+    proj_root = Path(__file__).parent.parent
+
+    # Check both requirements files
+    req_files = [
+        proj_root / "requirements.txt",
+        proj_root / "requirements-dev.txt"
+    ]
+
+    success = True
+    for req_file in req_files:
+        if req_file.exists():
+            if not check_requirements(req_file):
+                success = False
+
+    return 0 if success else 1
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
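
To exercise the new `scripts/check_py_compat.py` against the whole CI matrix locally, something like the sketch below works, assuming `python3.9` through `python3.12` binaries are on `PATH` (the driver loop is illustrative, not part of this patch):

```python
# Illustrative driver for scripts/check_py_compat.py; assumes the four matrix
# interpreters are installed locally. Each run builds a throwaway venv and
# attempts a full install of both requirements files.
import shutil
import subprocess

for py in ("python3.9", "python3.10", "python3.11", "python3.12"):
    if shutil.which(py) is None:
        print(f"skipping {py}: not found on PATH")
        continue
    result = subprocess.run([py, "scripts/check_py_compat.py"])
    print(f"{py}: exit code {result.returncode}")
```
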
diff --git a/tests/test_crawler.py b/tests/test_crawler.py
index 6f0eb70..4f12de3 100644
--- a/tests/test_crawler.py
+++ b/tests/test_crawler.py
@@ -11,13 +11,22 @@
 
 def async_run(coro):
     """Helper function to run coroutines in tests with a fresh event loop."""
-    loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(loop)
+    try:
+        # Try to get an existing event loop
+        loop = asyncio.get_event_loop()
+        if loop.is_closed():
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+    except RuntimeError:
+        # Create a new event loop if there isn't one
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
     try:
         return loop.run_until_complete(coro)
     finally:
-        loop.close()
-        asyncio.set_event_loop(None)
+        # Clean up but don't close the loop as it might be reused
+        pass
 
 
 class TestCrawler(unittest.TestCase):
@@ -25,6 +34,12 @@ class TestCrawler(unittest.TestCase):
 
     def setUp(self):
         """Set up test fixtures."""
+        # Create and set an event loop for Python 3.9 compatibility
+        try:
+            asyncio.get_event_loop()
+        except RuntimeError:
+            asyncio.set_event_loop(asyncio.new_event_loop())
+
         self.crawler = Crawler(
             max_depth=2,
             concurrency_limit=5,
@@ -35,6 +50,20 @@ def setUp(self):
     def tearDown(self):
         """Clean up after tests."""
         self.crawler.close()
+        # Reset the event loop for next test
+        try:
+            # Get the current event loop
+            loop = asyncio.get_event_loop()
+            # If the loop is running, stop it
+            if loop.is_running():
+                loop.stop()
+            # Close it
+            loop.close()
+        except RuntimeError:
+            pass  # No event loop exists
+        finally:
+            # Reset to None to clean up
+            asyncio.set_event_loop(None)
 
     def test_is_allowed_domain_same_domain(self):
         """Test that same domain is always allowed."""
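
For context on the event-loop bookkeeping above: on every version in the new matrix, `asyncio.run()` already provides the create/run/close cycle in one call, as in this minimal sketch. It is not a proposed change; the patch deliberately keeps one shared loop alive across `setUp`, the test body, and `tearDown`, which `asyncio.run()` would not allow since it closes its loop on every call.

```python
# Minimal sketch: asyncio.run() creates a fresh event loop, runs the coroutine,
# and closes the loop afterwards; available on all of Python 3.9-3.12.
import asyncio

async def probe():
    await asyncio.sleep(0)
    return "ok"

print(asyncio.run(probe()))  # prints "ok"; no manual set_event_loop() needed
```
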