# Pre-commit hooks for tablediff-arrow.
# These mirror the checks run in CI: black (format), ruff (lint), mypy (types).
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
      - id: check-json
      - id: check-toml
      - id: check-merge-conflict
      - id: debug-statements

  - repo: https://github.com/psf/black
    rev: 23.12.1
    hooks:
      - id: black
        language_version: python3.10

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.11
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.8.0
    hooks:
      - id: mypy
        # NOTE: `additional_dependencies: [types-all]` was removed on purpose.
        # `types-all` pins stub packages that are no longer published on PyPI,
        # so pip cannot build the hook environment and the hook never runs.
        # With --ignore-missing-imports no extra stubs are required; add
        # specific stub packages here if stricter checking is wanted later.
        args: [--ignore-missing-imports]
Generate HTML and CSV reports with detailed differences +- **CLI and Library**: Use as a command-line tool or Python library + +[0.1.0]: https://github.com/psmman/tablediff-arrow/releases/tag/v0.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..e7d6e5c --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,203 @@ +# Contributing to tablediff-arrow + +Thank you for your interest in contributing to tablediff-arrow! This document provides guidelines and instructions for contributing to the project. + +## Getting Started + +### Prerequisites + +- Python 3.10 or higher +- Git +- pip + +### Setting Up Development Environment + +1. Fork and clone the repository: + ```bash + git clone https://github.com/YOUR_USERNAME/tablediff-arrow.git + cd tablediff-arrow + ``` + +2. Install the package in development mode with all dependencies: + ```bash + pip install -e ".[dev]" + ``` + +3. Install pre-commit hooks: + ```bash + pre-commit install + ``` + +## Development Workflow + +### Running Tests + +Run all tests: +```bash +pytest +``` + +Run tests with coverage: +```bash +pytest --cov=tablediff_arrow --cov-report=html +``` + +Run specific test file: +```bash +pytest tests/test_compare.py +``` + +Run specific test: +```bash +pytest tests/test_compare.py::test_identical_tables +``` + +### Code Quality + +#### Formatting + +Format code with Black: +```bash +black src tests +``` + +#### Linting + +Lint code with Ruff: +```bash +ruff check src tests +``` + +Fix auto-fixable issues: +```bash +ruff check --fix src tests +``` + +#### Type Checking + +Run type checking with mypy: +```bash +mypy src --ignore-missing-imports +``` + +### Pre-commit Hooks + +Pre-commit hooks run automatically when you commit. 
To run manually: +```bash +pre-commit run --all-files +``` + +## Making Changes + +### Branch Naming + +Use descriptive branch names: +- `feature/add-new-format-support` +- `fix/handle-nan-values` +- `docs/update-examples` + +### Commit Messages + +Follow conventional commit format: +- `feat: add support for JSON format` +- `fix: handle NaN values in comparisons` +- `docs: update README with new examples` +- `test: add tests for S3 functionality` +- `refactor: simplify comparison logic` + +### Pull Request Process + +1. Create a new branch for your changes +2. Make your changes and add tests +3. Ensure all tests pass and code is properly formatted +4. Update documentation if needed +5. Push your branch and create a pull request +6. Wait for review and address any feedback + +## Code Style Guidelines + +### Python Style + +- Follow PEP 8 +- Use type hints where appropriate +- Maximum line length: 100 characters +- Use meaningful variable and function names + +### Documentation + +- Add docstrings to all public functions and classes +- Use Google-style docstrings +- Update README for user-facing changes +- Add examples for new features + +### Testing + +- Write tests for all new features +- Maintain or improve test coverage +- Use pytest fixtures for test data +- Test edge cases and error conditions + +## Project Structure + +``` +tablediff-arrow/ +├── src/ +│ └── tablediff_arrow/ +│ ├── __init__.py # Package initialization +│ ├── cli.py # Command-line interface +│ ├── compare.py # Core comparison logic +│ ├── loader.py # Data loading utilities +│ └── reports.py # Report generation +├── tests/ +│ ├── test_cli.py # CLI tests +│ ├── test_compare.py # Comparison tests +│ ├── test_loader.py # Loader tests +│ └── test_reports.py # Report tests +├── .github/ +│ └── workflows/ +│ └── ci.yml # CI/CD configuration +├── pyproject.toml # Project configuration +└── README.md # Project documentation +``` + +## Adding New Features + +### Adding a New File Format + +1. 
Update `loader.py` to handle the new format +2. Add tests in `tests/test_loader.py` +3. Update CLI to support format selection +4. Update documentation + +### Adding a New Report Format + +1. Create a new function in `reports.py` +2. Add tests in `tests/test_reports.py` +3. Update CLI to support new format +4. Update documentation + +## Debugging + +### Running with Debug Output + +```python +import logging +logging.basicConfig(level=logging.DEBUG) +``` + +### Interactive Testing + +```bash +python -i examples.py +``` + +## Getting Help + +- Create an issue for bugs or feature requests +- Check existing issues before creating new ones +- Provide detailed information in issues +- Be respectful and constructive + +## License + +By contributing to tablediff-arrow, you agree that your contributions will be licensed under the MIT License. diff --git a/QUICKSTART.md b/QUICKSTART.md new file mode 100644 index 0000000..d925bfe --- /dev/null +++ b/QUICKSTART.md @@ -0,0 +1,161 @@ +# Quick Start Guide + +Get started with tablediff-arrow in minutes! + +## Installation + +```bash +pip install tablediff-arrow +``` + +## 5-Minute Tutorial + +### 1. Compare Two Parquet Files + +```bash +tablediff data1.parquet data2.parquet -k id +``` + +### 2. Generate an HTML Report + +```bash +tablediff data1.parquet data2.parquet -k id -o report.html +``` + +Open `report.html` in your browser to see a beautiful, styled comparison report. + +### 3. Use in Python + +```python +from tablediff_arrow import TableDiff +import pyarrow.parquet as pq + +# Load your data +left = pq.read_table("data1.parquet") +right = pq.read_table("data2.parquet") + +# Compare +differ = TableDiff(key_columns=["id"]) +result = differ.compare_tables(left, right) + +# Print summary +print(result.summary()) +``` + +### 4. Handle Numeric Differences + +```bash +# Allow 1% tolerance for price differences +tablediff sales_old.csv sales_new.csv -k id -r price:0.01 +``` + +### 5. 
Compare CSV Files + +```bash +tablediff left.csv right.csv -k customer_id --csv-output reports/ +``` + +This generates multiple CSV files: +- `reports/diff_summary.csv` - Summary statistics +- `reports/diff_changes.csv` - Detailed changes +- `reports/diff_left_only.csv` - Rows only in left +- `reports/diff_right_only.csv` - Rows only in right + +## Common Use Cases + +### Multiple Key Columns + +```bash +tablediff data1.parquet data2.parquet -k year -k month -k product +``` + +### Absolute Tolerance + +```bash +# Allow differences up to 0.01 in the amount column +tablediff data1.csv data2.csv -k id -t amount:0.01 +``` + +### S3 Files + +```bash +pip install tablediff-arrow[s3] +tablediff s3://bucket/data1.parquet s3://bucket/data2.parquet -k id --s3 +``` + +## Python API Examples + +### Basic Comparison + +```python +from tablediff_arrow import TableDiff +import pyarrow as pa + +left = pa.table({'id': [1, 2, 3], 'value': [10, 20, 30]}) +right = pa.table({'id': [1, 2, 3], 'value': [10, 21, 30]}) + +differ = TableDiff(key_columns=['id']) +result = differ.compare_tables(left, right) + +print(f"Changed rows: {result.changed_rows}") +print(f"Total changes: {result.total_changes}") +``` + +### With Tolerances + +```python +differ = TableDiff( + key_columns=['id'], + tolerance={'amount': 0.01}, # Absolute tolerance + relative_tolerance={'rate': 0.001} # Relative tolerance (0.1%) +) +result = differ.compare_tables(left, right) +``` + +### Generate Reports + +```python +from tablediff_arrow.reports import generate_html_report, generate_csv_report + +# HTML report +generate_html_report(result, 'comparison.html') + +# CSV reports +generate_csv_report(result, 'output/', prefix='diff') +``` + +## Next Steps + +- Read the full [README](README.md) for more details +- Check out [examples.py](examples.py) for comprehensive examples +- Read [CONTRIBUTING.md](CONTRIBUTING.md) to contribute +- See [CHANGELOG.md](CHANGELOG.md) for version history + +## Getting Help + +- Create an issue 
on GitHub: https://github.com/psmman/tablediff-arrow/issues +- Read the documentation in the README +- Check the examples file for common patterns + +## Quick Reference + +### CLI Options + +``` +-k, --key Key column(s) for comparison (required, repeatable) +-t, --tolerance Absolute tolerance (format: column:value) +-r, --relative-tolerance Relative tolerance (format: column:value) +-o, --output HTML report output path +--csv-output Directory for CSV reports +--left-format Force format for left file (parquet/csv/arrow) +--right-format Force format for right file (parquet/csv/arrow) +--s3 Enable S3 filesystem support +``` + +### Exit Codes + +- `0` - Tables match (no differences) +- `1` - Differences found +- `2` - Error occurred + +Happy diffing! 🚀 diff --git a/README.md b/README.md index 7ff5d7e..78e275b 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,285 @@ # tablediff-arrow + Fast, file-based diffs for Parquet/CSV/Arrow (local or S3) with keyed comparisons, per-column tolerances, and HTML/CSV reports—built on Apache Arrow. 
+ +[![CI](https://github.com/psmman/tablediff-arrow/actions/workflows/ci.yml/badge.svg)](https://github.com/psmman/tablediff-arrow/actions) +[![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) + +## Features + +- 🚀 **Fast**: Built on Apache Arrow for high-performance data processing +- 📁 **Multiple Formats**: Support for Parquet, CSV, and Arrow IPC files +- ☁️ **S3 Support**: Read files directly from S3 (optional) +- 🔑 **Keyed Comparisons**: Compare tables using one or more key columns +- 🎯 **Numeric Tolerances**: Configure absolute and relative tolerances for numeric columns +- 📊 **Rich Reports**: Generate HTML and CSV reports with detailed differences +- 🐍 **Python 3.10+**: Modern Python with type hints and clean APIs +- 🧪 **Well Tested**: Comprehensive test suite with high coverage + +## Installation + +```bash +pip install tablediff-arrow +``` + +For S3 support: + +```bash +pip install tablediff-arrow[s3] +``` + +For development: + +```bash +pip install -e ".[dev]" +``` + +## Quick Start + +### Command Line Interface + +Compare two Parquet files using `id` as the key column: + +```bash +tablediff left.parquet right.parquet -k id +``` + +Compare with numeric tolerance: + +```bash +tablediff left.csv right.csv -k id -t amount:0.01 +``` + +Generate an HTML report: + +```bash +tablediff left.parquet right.parquet -k id -o report.html +``` + +Compare S3 files: + +```bash +tablediff s3://bucket/left.parquet s3://bucket/right.parquet -k id --s3 +``` + +### Python API + +```python +from tablediff_arrow import TableDiff + +# Create a differ with key columns and tolerances +differ = TableDiff( + key_columns=['id'], + tolerance={'amount': 0.01}, # Absolute tolerance + relative_tolerance={'price': 0.001} # Relative tolerance (0.1%) +) + +# Compare files +result = differ.compare_files('left.parquet', 'right.parquet') + +# Print summary +print(result.summary()) + +# Check if there are differences +if result.has_differences: + print(f"Found {result.changed_rows} changed rows") + print(f"Found {result.left_only_rows} rows only in 
left") + print(f"Found {result.right_only_rows} rows only in right") + +# Generate reports +from tablediff_arrow.reports import generate_html_report, generate_csv_report + +generate_html_report(result, 'report.html') +generate_csv_report(result, 'output_dir/', prefix='diff') +``` + +## Usage Examples + +### Multiple Key Columns + +Compare tables using composite keys: + +```bash +tablediff left.parquet right.parquet -k year -k month -k product +``` + +```python +differ = TableDiff(key_columns=['year', 'month', 'product']) +result = differ.compare_files('left.parquet', 'right.parquet') +``` + +### Numeric Tolerances + +Use absolute tolerance for monetary values: + +```bash +tablediff left.csv right.csv -k id -t amount:0.01 -t balance:0.001 +``` + +Use relative tolerance for percentages: + +```bash +tablediff left.csv right.csv -k id -r rate:0.001 -r score:0.01 +``` + +```python +differ = TableDiff( + key_columns=['id'], + tolerance={'amount': 0.01, 'balance': 0.001}, + relative_tolerance={'rate': 0.001, 'score': 0.01} +) +``` + +### Working with PyArrow Tables + +```python +import pyarrow as pa +from tablediff_arrow import TableDiff + +# Create tables directly +left = pa.table({'id': [1, 2, 3], 'value': [10, 20, 30]}) +right = pa.table({'id': [1, 2, 3], 'value': [10, 21, 30]}) + +# Compare +differ = TableDiff(key_columns=['id']) +result = differ.compare_tables(left, right) + +print(result.summary()) +``` + +### S3 Files + +```python +import s3fs +from tablediff_arrow import TableDiff + +# Create S3 filesystem +fs = s3fs.S3FileSystem() + +# Compare S3 files +differ = TableDiff(key_columns=['id']) +result = differ.compare_files( + 's3://my-bucket/left.parquet', + 's3://my-bucket/right.parquet', + filesystem=fs +) +``` + +## CLI Options + +``` +Usage: tablediff [OPTIONS] LEFT RIGHT + + Compare two tables and generate diff reports. 
+ +Arguments: + LEFT Path to the left/source table file (local or s3://) + RIGHT Path to the right/target table file (local or s3://) + +Options: + -k, --key TEXT Key column(s) for comparison (required, can be + specified multiple times) + -t, --tolerance TEXT Absolute tolerance for numeric columns + (format: column:value) + -r, --relative-tolerance TEXT Relative tolerance for numeric columns + (format: column:value) + --left-format [parquet|csv|arrow] + Format of the left file + --right-format [parquet|csv|arrow] + Format of the right file + -o, --output TEXT Output file path for HTML report + --csv-output PATH Output directory for CSV reports + --s3 Enable S3 filesystem support + --help Show this message and exit. +``` + +## Output Reports + +### HTML Report + +The HTML report provides an interactive view of differences: + +- Summary statistics (matched, changed, added, removed rows) +- Color-coded differences table +- Separate sections for left-only and right-only rows +- Change counts per column + +### CSV Reports + +CSV output generates multiple files: + +- `{prefix}_summary.csv`: Summary statistics +- `{prefix}_changes.csv`: Detailed changes with old and new values +- `{prefix}_left_only.csv`: Rows only in the left table +- `{prefix}_right_only.csv`: Rows only in the right table + +## Development + +### Setup + +```bash +# Clone the repository +git clone https://github.com/psmman/tablediff-arrow.git +cd tablediff-arrow + +# Install with development dependencies +pip install -e ".[dev]" + +# Install pre-commit hooks +pre-commit install +``` + +### Running Tests + +```bash +# Run all tests +pytest + +# Run with coverage +pytest --cov=tablediff_arrow --cov-report=html + +# Run specific test file +pytest tests/test_compare.py +``` + +### Code Quality + +```bash +# Format code +black src tests + +# Lint +ruff check src tests + +# Type check +mypy src --ignore-missing-imports +``` + +### Pre-commit Hooks + +The project uses pre-commit hooks to ensure code quality: + +- trailing-whitespace: 
def example_basic_comparison():
    """Diff two small in-memory tables keyed on ``id`` and print the summary."""
    rule = "=" * 60
    print(rule)
    print("Example 1: Basic Table Comparison")
    print(rule)

    # The two sides share ids 1-3; id 4 exists only on the left and id 5 only
    # on the right, and the amount for id 2 differs.
    source = pa.table(
        {
            "id": [1, 2, 3, 4],
            "name": ["Alice", "Bob", "Charlie", "David"],
            "amount": [100.0, 200.0, 300.0, 400.0],
        }
    )
    target = pa.table(
        {
            "id": [1, 2, 3, 5],
            "name": ["Alice", "Bob", "Charlie", "Eve"],
            "amount": [100.0, 205.0, 300.0, 500.0],
        }
    )

    outcome = TableDiff(key_columns=["id"]).compare_tables(source, target)

    print("\n" + outcome.summary())
    print()
def example_file_comparison():
    """Write two Parquet files to a scratch directory and diff them by path."""
    rule = "=" * 60
    print(rule)
    print("Example 3: File Comparison")
    print(rule)

    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        left_path = workdir / "left.parquet"
        right_path = workdir / "right.parquet"

        # Only the value for id 2 differs between the two files.
        pq.write_table(pa.table({"id": [1, 2, 3], "value": [10, 20, 30]}), left_path)
        pq.write_table(pa.table({"id": [1, 2, 3], "value": [10, 21, 30]}), right_path)

        # The comparison must run while the temporary files still exist.
        outcome = TableDiff(key_columns=["id"]).compare_files(left_path, right_path)

        print("\n" + outcome.summary())
        print()
def example_relative_tolerance():
    """Contrast absolute and relative tolerances on small and large values."""
    rule = "=" * 60
    print(rule)
    print("Example 6: Relative Tolerance")
    print(rule)

    left = pa.table(
        {
            "id": [1, 2],
            "small_value": [10.0, 20.0],
            "large_value": [1000.0, 2000.0],
        }
    )
    right = pa.table(
        {
            "id": [1, 2],
            "small_value": [10.1, 20.1],  # +0.1 each (1% and 0.5%)
            "large_value": [1010.0, 2020.0],  # +1% each
        }
    )

    # Same pair of tables, two tolerance configurations, run in this order.
    scenarios = (
        (
            "\nWith absolute tolerance of 0.5:",
            {"tolerance": {"small_value": 0.5, "large_value": 0.5}},
        ),
        (
            "\nWith relative tolerance of 2% (0.02):",
            {"relative_tolerance": {"small_value": 0.02, "large_value": 0.02}},
        ),
    )
    for heading, options in scenarios:
        print(heading)
        outcome = TableDiff(key_columns=["id"], **options).compare_tables(left, right)
        print(f"Changed rows: {outcome.changed_rows}")
        print(f"Changes: {outcome.column_changes}")
    print()
example_relative_tolerance() + + print("=" * 60) + print("All examples completed successfully!") + print("=" * 60) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6ded538 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,81 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "tablediff-arrow" +version = "0.1.0" +description = "Fast, file-based diffs for Parquet/CSV/Arrow (local or S3) with keyed comparisons, per-column tolerances, and HTML/CSV reports—built on Apache Arrow." +readme = "README.md" +authors = [ + {name = "Prasenjit Singh", email = "psmman@users.noreply.github.com"} +] +license = {text = "MIT"} +requires-python = ">=3.10" +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", +] +keywords = ["diff", "parquet", "csv", "arrow", "data-comparison"] + +dependencies = [ + "pyarrow>=14.0.0", + "pandas>=2.0.0", + "click>=8.0.0", + "jinja2>=3.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", + "black>=23.0.0", + "ruff>=0.1.0", + "mypy>=1.0.0", + "pre-commit>=3.0.0", +] +s3 = [ + "s3fs>=2023.0.0", +] + +[project.urls] +Homepage = "https://github.com/psmman/tablediff-arrow" +Repository = "https://github.com/psmman/tablediff-arrow" +Issues = "https://github.com/psmman/tablediff-arrow/issues" + +[project.scripts] +tablediff = "tablediff_arrow.cli:main" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.black] +line-length = 100 +target-version = ["py310", "py311", "py312"] + +[tool.ruff] +line-length = 100 +target-version = "py310" + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W", "B", "UP"] 
def _validate_table_path(ctx, param, value):
    """Click callback: require local paths to exist, pass S3 URIs through.

    ``click.Path(exists=True)`` checks the local filesystem, so applying it
    directly to LEFT/RIGHT rejects every ``s3://`` URI before ``main`` runs.
    """
    if value.startswith("s3://"):
        return value
    # Reuse click's own check so local-path errors keep their usual wording
    # and usage-error exit status.
    return click.Path(exists=True).convert(value, param, ctx)


def _parse_tolerances(specs: tuple, label: str) -> dict[str, float]:
    """Turn ``column:value`` strings into a ``{column: float}`` mapping.

    Args:
        specs: Raw option values as collected by click.
        label: Human-readable option name used in the error message.

    Returns:
        Mapping of column name to tolerance.

    On the first malformed entry an error is printed to stderr and the
    process exits with status 2 (status 1 is reserved for "differences found").
    """
    parsed: dict[str, float] = {}
    for spec in specs:
        try:
            col, val = spec.split(":", 1)
            parsed[col] = float(val)
        except ValueError:
            click.echo(
                f"Error: Invalid {label} format '{spec}'. Expected 'column:value'",
                err=True,
            )
            sys.exit(2)
    return parsed


@click.command()
@click.argument("left", callback=_validate_table_path)
@click.argument("right", callback=_validate_table_path)
@click.option(
    "--key",
    "-k",
    multiple=True,
    required=True,
    help="Key column(s) for comparison. Can be specified multiple times.",
)
@click.option(
    "--tolerance",
    "-t",
    multiple=True,
    help='Absolute tolerance for numeric columns in format "column:value". '
    "Example: --tolerance amount:0.01",
)
@click.option(
    "--relative-tolerance",
    "-r",
    multiple=True,
    help='Relative tolerance for numeric columns in format "column:value". '
    "Example: --relative-tolerance price:0.001",
)
@click.option(
    "--left-format",
    type=click.Choice(["parquet", "csv", "arrow"], case_sensitive=False),
    help="Format of the left file. If not specified, inferred from extension.",
)
@click.option(
    "--right-format",
    type=click.Choice(["parquet", "csv", "arrow"], case_sensitive=False),
    help="Format of the right file. If not specified, inferred from extension.",
)
@click.option(
    "--output",
    "-o",
    help="Output file path for HTML report. If not specified, prints summary to console.",
)
@click.option(
    "--csv-output",
    type=click.Path(),
    help="Output directory for CSV reports. Generates separate files for changes, "
    "left-only, and right-only rows.",
)
@click.option("--s3", is_flag=True, help="Enable S3 filesystem support for reading files from S3.")
def main(
    left: str,
    right: str,
    key: tuple,
    tolerance: tuple,
    relative_tolerance: tuple,
    left_format: str | None,
    right_format: str | None,
    output: str | None,
    csv_output: str | None,
    s3: bool,
) -> None:
    """
    Compare two tables and generate diff reports.

    LEFT: Path to the left/source table file (local or s3://)

    RIGHT: Path to the right/target table file (local or s3://)

    Examples:

        # Compare two Parquet files using 'id' as key
        tablediff left.parquet right.parquet -k id

        # Compare with numeric tolerance
        tablediff left.csv right.csv -k id -t amount:0.01

        # Generate HTML report
        tablediff left.parquet right.parquet -k id -o report.html

        # Compare S3 files
        tablediff s3://bucket/left.parquet s3://bucket/right.parquet -k id --s3
    """
    # Exit status contract: 0 = tables match, 1 = differences found,
    # 2 = any error (bad option value, missing dependency, load/report failure).
    abs_tolerance = _parse_tolerances(tolerance, "tolerance")
    rel_tolerance = _parse_tolerances(relative_tolerance, "relative tolerance")

    # An S3 filesystem is needed when requested explicitly or when either
    # path is an s3:// URI.
    filesystem = None
    if s3 or left.startswith("s3://") or right.startswith("s3://"):
        try:
            import s3fs
        except ImportError:
            click.echo(
                "Error: s3fs is required for S3 access. "
                "Install with: pip install tablediff-arrow[s3]",
                err=True,
            )
            sys.exit(2)
        filesystem = s3fs.S3FileSystem()

    differ = TableDiff(
        key_columns=list(key),
        tolerance=abs_tolerance,
        relative_tolerance=rel_tolerance,
    )

    # Top-level boundary: any failure while loading, comparing or writing
    # reports is reported on stderr and mapped to exit status 2.
    try:
        click.echo("Loading tables...")
        result = differ.compare_files(
            left,
            right,
            left_format=left_format,
            right_format=right_format,
            filesystem=filesystem,
        )

        click.echo("\n" + result.summary())

        if output:
            click.echo(f"\nGenerating HTML report: {output}")
            generate_html_report(result, output)
            click.echo(f"HTML report written to {output}")

        if csv_output:
            click.echo(f"\nGenerating CSV reports in: {csv_output}")
            generate_csv_report(result, csv_output)
            click.echo(f"CSV reports written to {csv_output}")
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(2)

    # Non-zero status when differences were found, so scripts can branch on it.
    sys.exit(1 if result.has_differences else 0)
+""" + +from dataclasses import dataclass, field +from pathlib import Path + +import pandas as pd +import pyarrow as pa + +from .loader import load_table + + +@dataclass +class DiffResult: + """Result of a table comparison.""" + + matched_rows: int = 0 + left_only_rows: int = 0 + right_only_rows: int = 0 + changed_rows: int = 0 + total_changes: int = 0 + column_changes: dict[str, int] = field(default_factory=dict) + differences: pa.Table | None = None + left_only: pa.Table | None = None + right_only: pa.Table | None = None + + @property + def has_differences(self) -> bool: + """Check if there are any differences.""" + return self.left_only_rows > 0 or self.right_only_rows > 0 or self.changed_rows > 0 + + def summary(self) -> str: + """Return a summary string of the comparison.""" + lines = [ + "Table Comparison Summary:", + f" Matched rows: {self.matched_rows}", + f" Changed rows: {self.changed_rows}", + f" Left-only rows: {self.left_only_rows}", + f" Right-only rows: {self.right_only_rows}", + f" Total changes: {self.total_changes}", + ] + if self.column_changes: + lines.append(" Changes by column:") + for col, count in sorted(self.column_changes.items()): + lines.append(f" {col}: {count}") + return "\n".join(lines) + + +class TableDiff: + """ + Compare two tables with keyed comparisons and numeric tolerances. + """ + + def __init__( + self, + key_columns: list[str], + tolerance: dict[str, float] | None = None, + relative_tolerance: dict[str, float] | None = None, + ): + """ + Initialize TableDiff. 
+ + Args: + key_columns: List of column names to use as the join key + tolerance: Absolute tolerance per column for numeric comparisons + relative_tolerance: Relative tolerance per column for numeric comparisons + """ + self.key_columns = key_columns + self.tolerance = tolerance or {} + self.relative_tolerance = relative_tolerance or {} + + def compare_files( + self, + left_path: str | Path, + right_path: str | Path, + left_format: str | None = None, + right_format: str | None = None, + filesystem=None, + ) -> DiffResult: + """ + Compare two files. + + Args: + left_path: Path to the left/source file + right_path: Path to the right/target file + left_format: Format of left file (inferred if None) + right_format: Format of right file (inferred if None) + filesystem: Optional filesystem for S3 access + + Returns: + DiffResult object + """ + left_table = load_table(left_path, left_format, filesystem) + right_table = load_table(right_path, right_format, filesystem) + return self.compare_tables(left_table, right_table) + + def compare_tables( + self, + left: pa.Table, + right: pa.Table, + ) -> DiffResult: + """ + Compare two PyArrow tables. 
+ + Args: + left: Left/source table + right: Right/target table + + Returns: + DiffResult object + """ + # Convert to pandas for easier comparison + left_df = left.to_pandas() + right_df = right.to_pandas() + + # Validate key columns exist + for col in self.key_columns: + if col not in left_df.columns: + raise ValueError(f"Key column '{col}' not found in left table") + if col not in right_df.columns: + raise ValueError(f"Key column '{col}' not found in right table") + + # Set index on key columns for comparison + left_df = left_df.set_index(self.key_columns) + right_df = right_df.set_index(self.key_columns) + + # Find rows only in left, only in right, and in both + left_only_idx = left_df.index.difference(right_df.index) + right_only_idx = right_df.index.difference(left_df.index) + common_idx = left_df.index.intersection(right_df.index) + + result = DiffResult() + result.left_only_rows = len(left_only_idx) + result.right_only_rows = len(right_only_idx) + + # Store left-only and right-only rows + if len(left_only_idx) > 0: + result.left_only = pa.Table.from_pandas(left_df.loc[left_only_idx].reset_index()) + if len(right_only_idx) > 0: + result.right_only = pa.Table.from_pandas(right_df.loc[right_only_idx].reset_index()) + + # Compare common rows + if len(common_idx) > 0: + left_common = left_df.loc[common_idx] + right_common = right_df.loc[common_idx] + + # Compare each column + diff_rows = [] + column_changes = {} + + for col in left_common.columns: + if col not in right_common.columns: + continue + + left_col = left_common[col] + right_col = right_common[col] + + # Apply tolerance for numeric columns + if pd.api.types.is_numeric_dtype(left_col): + abs_tol = self.tolerance.get(col, 0.0) + rel_tol = self.relative_tolerance.get(col, 0.0) + + if rel_tol > 0: + # Relative tolerance + diff_mask = ~pd.isna(left_col) & ~pd.isna(right_col) + if diff_mask.any(): + rel_diff = (left_col - right_col).abs() / right_col.abs() + diff_mask = diff_mask & (rel_diff > rel_tol) + elif 
abs_tol > 0: + # Absolute tolerance + diff_mask = (left_col - right_col).abs() > abs_tol + else: + # No tolerance + diff_mask = left_col != right_col + # Handle NaN comparisons + diff_mask = diff_mask & ~(pd.isna(left_col) & pd.isna(right_col)) + else: + # Non-numeric comparison + diff_mask = left_col != right_col + # Handle NaN/None comparisons + diff_mask = diff_mask & ~(pd.isna(left_col) & pd.isna(right_col)) + + num_changes = diff_mask.sum() + if num_changes > 0: + column_changes[col] = int(num_changes) + + # Record differences + for idx in diff_mask[diff_mask].index: + diff_rows.append( + { + **{ + k: idx[i] if isinstance(idx, tuple) else idx + for i, k in enumerate(self.key_columns) + }, + "column": col, + "left_value": left_col.loc[idx], + "right_value": right_col.loc[idx], + } + ) + + result.column_changes = column_changes + result.total_changes = sum(column_changes.values()) + result.changed_rows = len(set(tuple(d[k] for k in self.key_columns) for d in diff_rows)) + result.matched_rows = len(common_idx) - result.changed_rows + + # Create differences table + if diff_rows: + result.differences = pa.Table.from_pandas(pd.DataFrame(diff_rows)) + else: + result.matched_rows = 0 + + return result diff --git a/src/tablediff_arrow/loader.py b/src/tablediff_arrow/loader.py new file mode 100644 index 0000000..be5e861 --- /dev/null +++ b/src/tablediff_arrow/loader.py @@ -0,0 +1,67 @@ +""" +Data loading module for reading Parquet, CSV, and Arrow files from local or S3. +""" + +from pathlib import Path + +import pyarrow as pa +import pyarrow.csv as csv +import pyarrow.parquet as pq + + +def load_table(path: str | Path, file_format: str | None = None, filesystem=None) -> pa.Table: + """ + Load a table from a file path (local or S3). + + Args: + path: Path to the file (local path or S3 URI like s3://bucket/key) + file_format: File format ('parquet', 'csv', 'arrow'). If None, inferred from extension. 
+ filesystem: Optional filesystem object (e.g., s3fs for S3 access) + + Returns: + PyArrow Table + """ + path_str = str(path) + + # Infer format from extension if not provided + if file_format is None: + if path_str.endswith(".parquet") or path_str.endswith(".pq"): + file_format = "parquet" + elif path_str.endswith(".csv"): + file_format = "csv" + elif path_str.endswith(".arrow") or path_str.endswith(".feather"): + file_format = "arrow" + else: + raise ValueError(f"Cannot infer file format from path: {path_str}") + + # Handle S3 paths + if path_str.startswith("s3://"): + if filesystem is None: + try: + import s3fs + + filesystem = s3fs.S3FileSystem() + except ImportError as err: + raise ImportError( + "s3fs is required for S3 access. Install with: pip install tablediff-arrow[s3]" + ) from err + + # Load based on format + if file_format == "parquet": + return pq.read_table(path_str, filesystem=filesystem) + elif file_format == "csv": + if filesystem: + with filesystem.open(path_str, "rb") as f: + return csv.read_csv(f) + else: + return csv.read_csv(path_str) + elif file_format == "arrow": + if filesystem: + with filesystem.open(path_str, "rb") as f: + with pa.ipc.open_file(f) as reader: + return reader.read_all() + else: + with pa.ipc.open_file(path_str) as reader: + return reader.read_all() + else: + raise ValueError(f"Unsupported file format: {file_format}") diff --git a/src/tablediff_arrow/reports.py b/src/tablediff_arrow/reports.py new file mode 100644 index 0000000..73eaf06 --- /dev/null +++ b/src/tablediff_arrow/reports.py @@ -0,0 +1,275 @@ +""" +Report generation for table diffs (HTML and CSV formats). +""" + +from pathlib import Path + +import pandas as pd +from jinja2 import Template + +from .compare import DiffResult + +HTML_TEMPLATE = """ + + +
+ +