Skip to content
Merged
17 changes: 11 additions & 6 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.11", "3.12"]
python-version: ["3.9", "3.10", "3.11", "3.12"]

steps:
- uses: actions/checkout@v4
Expand All @@ -27,7 +27,7 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install -r requirements-dev.txt
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python -m pip install -r requirements.txt
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
Expand All @@ -36,11 +36,16 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest and coverage
run: |
pytest --cov=scraper --cov-report=xml --cov-report=term-missing --cov-fail-under=70
- name: Upload coverage report to Codecov
pytest --cov --junitxml=junit.xml --cov-report=term-missing --cov-fail-under=70
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
file: ./coverage.xml
fail_ci_if_error: true
token: ${{ secrets.CODECOV_TOKEN }}
fail_ci_if_error: true
verbose: true
- name: Upload test results to Codecov
if: ${{ !cancelled() }}
uses: codecov/test-results-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}

12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Scraper

[![Python Tests](https://github.com/johnburbridge/scraper/actions/workflows/python-package.yml/badge.svg)](https://github.com/johnburbridge/scraper/actions/workflows/python-package.yml)
[![Coverage](https://codecov.io/gh/johnburbridge/scraper/branch/main/graph/badge.svg)](https://codecov.io/gh/johnburbridge/scraper)
[![Python Tests](https://github.com/spiralhouse/scraper/actions/workflows/python-package.yml/badge.svg)](https://github.com/spiralhouse/scraper/actions/workflows/python-package.yml)
[![Coverage](https://codecov.io/gh/spiralhouse/scraper/branch/main/graph/badge.svg)](https://codecov.io/gh/spiralhouse/scraper)

A flexible web crawler that recursively crawls websites, respects robots.txt, and provides various output options.

Expand All @@ -15,7 +15,7 @@ A flexible web crawler that recursively crawls websites, respects robots.txt, an

1. Clone the repository:
```bash
git clone https://github.com/johnburbridge/scraper.git
git clone https://github.com/spiralhouse/scraper.git
cd scraper
```

Expand All @@ -30,6 +30,12 @@ source venv/bin/activate # On Windows: venv\Scripts\activate
pip install -r requirements.txt
```

## Requirements

- **Python**: Compatible with Python 3.9, 3.10, 3.11, and 3.12
- All runtime dependencies are listed in `requirements.txt` and are installed as part of the installation steps above.
- Development-only dependencies (linting, testing) are listed in `requirements-dev.txt`.

## Basic Usage

To start crawling a website:
Expand Down
2 changes: 1 addition & 1 deletion docs/develop.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ This guide provides instructions for setting up a development environment, runni

1. Clone the repository:
```bash
git clone https://github.com/johnburbridge/scraper.git
git clone https://github.com/spiralhouse/scraper.git
cd scraper
```

Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ httpcore==1.0.7
httpx==0.28.1
idna==3.10
ipykernel==6.29.5
ipython==9.0.2
ipython>=7.34.0; python_version < '3.11'
ipython==9.0.2; python_version >= '3.11'
ipython_pygments_lexers==1.1.1
isoduration==20.11.0
jedi==0.19.2
Expand Down
68 changes: 68 additions & 0 deletions scripts/check_py_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env python3
"""
Script to verify Python version compatibility with project requirements.
This helps identify packages that might not be compatible with specific Python versions.
"""

import sys
import subprocess
import tempfile
import os
import platform
from pathlib import Path

def check_requirements(requirements_file):
    """Test whether every package in ``requirements_file`` installs cleanly.

    Builds a throwaway virtual environment in a temporary directory,
    upgrades pip inside it, then attempts ``pip install -r`` with the
    given file. The environment is deleted automatically afterwards.

    Args:
        requirements_file: Path (str or pathlib.Path) to a pip
            requirements file.

    Returns:
        True if every package installed successfully, False otherwise.

    Raises:
        subprocess.CalledProcessError: if the venv cannot be created or
            pip cannot be upgraded (environment failure, as opposed to a
            package incompatibility, which is reported and returns False).
    """
    print(f"Checking compatibility of {requirements_file} with Python {platform.python_version()}")

    with tempfile.TemporaryDirectory() as tmpdir:
        # Create a virtual environment in the temp directory
        venv_dir = os.path.join(tmpdir, "venv")
        subprocess.run([sys.executable, "-m", "venv", venv_dir], check=True)

        # Locate the venv's interpreter and always drive pip via
        # ``python -m pip``. Running Scripts\pip.exe directly breaks on
        # Windows when pip upgrades itself (the running .exe cannot be
        # replaced); ``-m pip`` avoids that and behaves the same everywhere.
        if sys.platform.startswith('win'):
            venv_python = os.path.join(venv_dir, "Scripts", "python.exe")
        else:
            venv_python = os.path.join(venv_dir, "bin", "python")

        # Upgrade pip so resolver behavior matches a current toolchain
        subprocess.run([venv_python, "-m", "pip", "install", "--upgrade", "pip"], check=True)

        # Test installing the requirements
        try:
            subprocess.run(
                [venv_python, "-m", "pip", "install", "-r", str(requirements_file)],
                check=True,
                capture_output=True,
                text=True
            )
            print(f"✅ All packages in {requirements_file} are compatible with Python {platform.python_version()}")
            return True
        except subprocess.CalledProcessError as e:
            print(f"❌ Some packages in {requirements_file} are NOT compatible with Python {platform.python_version()}")
            print("Error details:")
            # capture_output=True guarantees stdout/stderr are populated strings
            print(e.stdout)
            print(e.stderr)
            return False

def main():
    """Check each known requirements file; return 0 on success, 1 on failure.

    A file that does not exist is simply skipped — only files that are
    present and fail to install count as a failure.
    """
    proj_root = Path(__file__).parent.parent

    # Check both requirements files
    candidates = [
        proj_root / "requirements.txt",
        proj_root / "requirements-dev.txt",
    ]

    # all([]) is True, so a repo with neither file still exits successfully,
    # matching the original accumulator behavior.
    results = [check_requirements(path) for path in candidates if path.exists()]
    return 0 if all(results) else 1

# Script entry point: process exit status mirrors main()'s return value
# (0 = all checked files installed cleanly, 1 = at least one failed).
if __name__ == "__main__":
    sys.exit(main())
37 changes: 33 additions & 4 deletions tests/test_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,35 @@

def async_run(coro):
"""Helper function to run coroutines in tests with a fresh event loop."""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
# Try to get an existing event loop
loop = asyncio.get_event_loop()
if loop.is_closed():
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
except RuntimeError:
# Create a new event loop if there isn't one
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

try:
return loop.run_until_complete(coro)
finally:
loop.close()
asyncio.set_event_loop(None)
# Clean up but don't close the loop as it might be reused
pass


class TestCrawler(unittest.TestCase):
"""Tests for the Crawler class."""

def setUp(self):
"""Set up test fixtures."""
# Create and set an event loop for Python 3.9 compatibility
try:
asyncio.get_event_loop()
except RuntimeError:
asyncio.set_event_loop(asyncio.new_event_loop())

self.crawler = Crawler(
max_depth=2,
concurrency_limit=5,
Expand All @@ -35,6 +50,20 @@ def setUp(self):
def tearDown(self):
"""Clean up after tests."""
self.crawler.close()
# Reset the event loop for next test
try:
# Get the current event loop
loop = asyncio.get_event_loop()
# If the loop is running, stop it
if loop.is_running():
loop.stop()
# Close it
loop.close()
except RuntimeError:
pass # No event loop exists
finally:
# Reset to None to clean up
asyncio.set_event_loop(None)

def test_is_allowed_domain_same_domain(self):
"""Test that same domain is always allowed."""
Expand Down