From c62c9b65fd262d025179d03137c4017b2ebc0966 Mon Sep 17 00:00:00 2001 From: Berend Klein Haneveld Date: Wed, 19 Nov 2025 11:21:20 +0100 Subject: [PATCH 1/4] Add benchmark and workflow (based on observ's) --- .github/workflows/benchmark.yml | 67 +++++++++++++ .gitignore | 1 + benchmarks/benchmark.py | 166 ++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 4 files changed, 235 insertions(+) create mode 100644 .github/workflows/benchmark.yml create mode 100644 benchmarks/benchmark.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..aadb053 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,67 @@ +name: Benchmarks + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + benchmark: + name: Benchmarks + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + + - name: Install uv + uses: astral-sh/setup-uv@v6 + + - name: Set up Python 3.14 + uses: actions/setup-python@v5 + with: + python-version: '3.14' + + - name: Install dependencies + run: uv sync + + # Restore benchmark baseline (read-only for PRs) + - name: Restore benchmark baseline + uses: actions/cache/restore@v4 + with: + path: .benchmarks + key: benchmark-baseline-3.14-${{ runner.os }} + + # On master: save baseline results + - name: Run benchmarks and save baseline + if: github.ref == 'refs/heads/master' + run: | + uv run --no-sync pytest benchmarks/benchmark.py \ + --benchmark-only \ + --benchmark-autosave \ + --benchmark-sort=name + + # On master: cache the new baseline results + - name: Save benchmark baseline + if: github.ref == 'refs/heads/master' + uses: actions/cache/save@v4 + with: + path: .benchmarks + key: benchmark-baseline-3.14-${{ runner.os }} + + # On PRs: compare against baseline and fail if degraded + - name: Run benchmarks and compare + if: github.event_name == 'pull_request' + run: | + if [ -z "$(uv run --no-sync pytest-benchmark list)" ]; then + echo "No baseline found, skip the benchmark" + exit + fi + + uv run --no-sync pytest benchmarks/benchmark.py \ + --benchmark-only \ + --benchmark-compare \ + --benchmark-compare-fail=mean:5% \ + --benchmark-sort=name + diff --git a/.gitignore b/.gitignore index e674c48..c9dfbba 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ __pycache__ .coverage dist uv.lock +.benchmarks/ diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py new file mode 100644 index 0000000..2cca2e3 --- /dev/null +++ b/benchmarks/benchmark.py @@ -0,0 +1,166 @@ +""" +Benchmark suite for patchdiff performance testing using pytest-benchmark. + +Run benchmarks: + uv run pytest benchmarks/benchmark.py --benchmark-only + +Save baseline: + uv run pytest benchmarks/benchmark.py --benchmark-only --benchmark-autosave + +Compare against baseline: + uv run pytest benchmarks/benchmark.py --benchmark-only --benchmark-compare=0001 + +Fail if performance degrades >5%: + uv run pytest benchmarks/benchmark.py --benchmark-only --benchmark-compare=0001 --benchmark-compare-fail=mean:5% +""" + +import random + +import pytest + +from patchdiff import apply, diff + +# Set seed for reproducibility +random.seed(42) + + +def generate_random_list(size: int, value_range: int = 1000) -> list[int]: + """Generate a random list of integers.""" + return [random.randint(0, value_range) for _ in range(size)] + + +def generate_similar_lists( + size: int, change_ratio: float = 0.1 +) -> tuple[list[int], list[int]]: + """ + Generate two similar lists with specified change ratio. + + Args: + size: Size of the lists + change_ratio: Ratio of elements that differ (0.0 to 1.0) + """ + list_a = generate_random_list(size) + list_b = list_a.copy() + + num_changes = int(size * change_ratio) + + # Make some replacements + for _ in range(num_changes // 3): + idx = random.randint(0, size - 1) + list_b[idx] = random.randint(0, 1000) + + # Make some insertions + for _ in range(num_changes // 3): + idx = random.randint(0, len(list_b)) + list_b.insert(idx, random.randint(0, 1000)) + + # Make some deletions + for _ in range(num_changes // 3): + if list_b: + idx = random.randint(0, len(list_b) - 1) + del list_b[idx] + + return list_a, list_b + + +def generate_nested_dict(depth: int, breadth: int) -> dict | int: + """Generate a nested dictionary structure.""" + if depth == 0: + return random.randint(0, 1000) + + result = {} + for i in range(breadth): + key = f"key_{i}" + if random.random() > 0.3: + result[key] = generate_nested_dict(depth - 1, breadth) + else: + result[key] = random.randint(0, 1000) + return result + + +# ======================================== +# List Diff Benchmarks +# ======================================== + + +@pytest.mark.benchmark(group="list-diff") +def test_list_diff_small_10pct(benchmark): + """Benchmark: 50 element list with 10% changes.""" + a, b = generate_similar_lists(50, 0.1) + benchmark(diff, a, b) + + +@pytest.mark.benchmark(group="list-diff") +@pytest.mark.parametrize("change_ratio", [0.05, 0.1, 0.5]) +def test_list_diff_medium(benchmark, change_ratio): + """Benchmark: 1000 element list with varying change ratios.""" + a, b = generate_similar_lists(1000, change_ratio) + benchmark(diff, a, b) + + +@pytest.mark.benchmark(group="list-diff-edge") +def test_list_diff_completely_different(benchmark): + """Benchmark: Two completely different 1000 element lists.""" + a = generate_random_list(1000) + b = generate_random_list(1000) + benchmark(diff, a, b) + + +@pytest.mark.benchmark(group="list-diff-edge") +def test_list_diff_identical(benchmark): + """Benchmark: Two identical 10000 element lists.""" + a = generate_random_list(10000) + b = a.copy() + benchmark(diff, a, b) + + +# ======================================== +# Dict Diff Benchmarks +# ======================================== + + +@pytest.mark.benchmark(group="dict-diff") +def test_dict_diff_flat_500_keys(benchmark): + """Benchmark: Flat dict with 500 keys, 10% changed.""" + a = {f"key_{i}": i for i in range(500)} + b = a.copy() + # Change 10% + for i in range(50): + b[f"key_{i}"] = i + 500 + + benchmark(diff, a, b) + + +@pytest.mark.benchmark(group="dict-diff") +def test_dict_diff_nested(benchmark): + """Benchmark: Nested dict with depth=3, breadth=5.""" + a = generate_nested_dict(3, 5) + b = generate_nested_dict(3, 5) + benchmark(diff, a, b) + + +# ======================================== +# Mixed Structure Benchmarks +# ======================================== + + +@pytest.mark.benchmark(group="mixed") +def test_mixed_dict_with_list_values(benchmark): + """Benchmark: Dict with 50 keys, each containing a 100-element list.""" + a = {f"key_{i}": generate_random_list(100) for i in range(50)} + b = {f"key_{i}": generate_random_list(100) for i in range(50)} + benchmark(diff, a, b) + + +# ======================================== +# Apply Benchmarks +# ======================================== + + +@pytest.mark.benchmark(group="apply") +def test_apply_list_1000_elements(benchmark): + """Benchmark: Apply patch to 1000 element list with 10% changes.""" + a, b = generate_similar_lists(1000, 0.1) + ops, _ = diff(a, b) + + benchmark(apply, a, ops) diff --git a/pyproject.toml b/pyproject.toml index 8dacd31..cd35aba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dev = [ "pytest", "pytest-cov", "pytest-watch", + "pytest-benchmark", ] [tool.ruff.lint] From 171af177131df0ed1f86eac1c8884f6a9d7ac2ea Mon Sep 17 00:00:00 2001 From: Berend Klein Haneveld Date: Wed, 19 Nov 2025 11:23:56 +0100 Subject: [PATCH 2/4] Implement iterative levenshtein distance --- patchdiff/diff.py | 192 ++++++++++++++++++++++++++++------------------ 1 file changed, 118 insertions(+), 74 deletions(-) diff --git a/patchdiff/diff.py b/patchdiff/diff.py index d56fa94..98ec31f 100644 --- a/patchdiff/diff.py +++ b/patchdiff/diff.py @@ -1,4 +1,3 @@ -from functools import partial, reduce from typing import Dict, List, Set, Tuple from .pointer import Pointer @@ -6,86 +5,129 @@ def diff_lists(input: List, output: List, ptr: Pointer) -> Tuple[List, List]: - memory = {(0, 0): {"ops": [], "rops": [], "cost": 0}} + m, n = len(input), len(output) - def dist(i, j): - if (i, j) not in memory: - if i > 0 and j > 0 and input[i - 1] == output[j - 1]: - step = dist(i - 1, j - 1) + # Build DP table bottom-up (iterative approach) + # dp[i][j] = cost of transforming input[0:i] to output[0:j] + dp = [[0] * (n + 1) for _ in range(m + 1)] + + # Initialize base cases + for i in range(1, m + 1): + dp[i][0] = i # Cost of deleting all elements + for j in range(1, n + 1): + dp[0][j] = j # Cost of adding all elements + + # Fill DP table + for i in range(1, m + 1): + for j in range(1, n + 1): + if input[i - 1] == output[j - 1]: + # Elements match, no operation needed + dp[i][j] = dp[i - 1][j - 1] else: - paths = [] - if i > 0: - base = dist(i - 1, j) - op = {"op": "remove", "idx": i - 1} - rop = {"op": "add", "idx": j - 1, "value": input[i - 1]} - paths.append( - { - "ops": base["ops"] + [op], - "rops": base["rops"] + [rop], - "cost": base["cost"] + 1, - } - ) - if j > 0: - base = dist(i, j - 1) - op = {"op": "add", "idx": i - 1, "value": output[j - 1]} - rop = {"op": "remove", "idx": j - 1} - paths.append( - { - "ops": base["ops"] + [op], - "rops": base["rops"] + [rop], - "cost": base["cost"] + 1, - } - ) - if i > 0 and j > 0: - base = dist(i - 1, j - 1) - op = { - "op": "replace", - "idx": i - 1, - "original": input[i - 1], - "value": output[j - 1], - } - rop = { - "op": "replace", - "idx": j - 1, - "original": output[j - 1], - "value": input[i - 1], - } - paths.append( - { - "ops": base["ops"] + [op], - "rops": base["rops"] + [rop], - "cost": base["cost"] + 1, - } - ) - step = min(paths, key=lambda a: a["cost"]) - memory[(i, j)] = step - return memory[(i, j)] - - def pad(state, op, target=None): - ops, padding = state + # Take minimum of three operations + dp[i][j] = min( + dp[i - 1][j] + 1, # Remove from input + dp[i][j - 1] + 1, # Add from output + dp[i - 1][j - 1] + 1, # Replace + ) + + # Traceback to extract operations + ops = [] + rops = [] + i, j = m, n + + while i > 0 or j > 0: + if i > 0 and j > 0 and input[i - 1] == output[j - 1]: + # Elements match, no operation + i -= 1 + j -= 1 + elif i > 0 and (j == 0 or dp[i][j] == dp[i - 1][j] + 1): + # Remove from input + ops.append({"op": "remove", "idx": i - 1}) + rops.append({"op": "add", "idx": j - 1, "value": input[i - 1]}) + i -= 1 + elif j > 0 and (i == 0 or dp[i][j] == dp[i][j - 1] + 1): + # Add from output + ops.append({"op": "add", "idx": i - 1, "value": output[j - 1]}) + rops.append({"op": "remove", "idx": j - 1}) + j -= 1 + else: + # Replace + ops.append( + { + "op": "replace", + "idx": i - 1, + "original": input[i - 1], + "value": output[j - 1], + } + ) + rops.append( + { + "op": "replace", + "idx": j - 1, + "original": output[j - 1], + "value": input[i - 1], + } + ) + i -= 1 + j -= 1 + + # Apply padding to operations (using explicit loops instead of reduce) + padded_ops = [] + padding = 0 + # Iterate in reverse to get correct order (traceback extracts operations backwards) + for op in reversed(ops): if op["op"] == "add": padded_idx = op["idx"] + 1 + padding - idx_token = padded_idx if padded_idx < len(target) + padding else "-" - full_op = { - "op": "add", - "path": ptr.append(idx_token), - "value": op["value"], - } - return [[*ops, full_op], padding + 1] + idx_token = padded_idx if padded_idx < len(input) + padding else "-" + padded_ops.append( + { + "op": "add", + "path": ptr.append(idx_token), + "value": op["value"], + } + ) + padding += 1 elif op["op"] == "remove": - full_op = { - "op": "remove", - "path": ptr.append(op["idx"] + padding), - } - return [[*ops, full_op], padding - 1] - else: + padded_ops.append( + { + "op": "remove", + "path": ptr.append(op["idx"] + padding), + } + ) + padding -= 1 + else: # replace replace_ptr = ptr.append(op["idx"] + padding) replace_ops, _ = diff(op["original"], op["value"], replace_ptr) - return [ops + replace_ops, padding] + padded_ops.extend(replace_ops) - solution = dist(len(input), len(output)) - padded_ops, _ = reduce(partial(pad, target=input), solution["ops"], [[], 0]) - padded_rops, _ = reduce(partial(pad, target=output), solution["rops"], [[], 0]) + padded_rops = [] + padding = 0 + # Iterate in reverse to get correct order (traceback extracts operations backwards) + for op in reversed(rops): + if op["op"] == "add": + padded_idx = op["idx"] + 1 + padding + idx_token = padded_idx if padded_idx < len(output) + padding else "-" + padded_rops.append( + { + "op": "add", + "path": ptr.append(idx_token), + "value": op["value"], + } + ) + padding += 1 + elif op["op"] == "remove": + padded_rops.append( + { + "op": "remove", + "path": ptr.append(op["idx"] + padding), + } + ) + padding -= 1 + else: # replace + replace_ptr = ptr.append(op["idx"] + padding) + replace_ops, _ = diff(op["original"], op["value"], replace_ptr) + padded_rops.extend(replace_ops) return padded_ops, padded_rops @@ -125,7 +167,9 @@ def diff_sets(input: Set, output: Set, ptr: Pointer) -> Tuple[List, List]: return ops, rops -def diff(input: Diffable, output: Diffable, ptr: Pointer = None) -> Tuple[List, List]: +def diff( + input: Diffable, output: Diffable, ptr: Pointer | None = None +) -> Tuple[List, List]: if input == output: return [], [] if ptr is None: From af1f2be8000e2bcc211dcbb2b0546c5bc051eb64 Mon Sep 17 00:00:00 2001 From: Berend Klein Haneveld Date: Wed, 19 Nov 2025 11:24:32 +0100 Subject: [PATCH 3/4] Fix a small typing issue --- patchdiff/pointer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/patchdiff/pointer.py b/patchdiff/pointer.py index 279f740..b1a1b31 100644 --- a/patchdiff/pointer.py +++ b/patchdiff/pointer.py @@ -1,7 +1,7 @@ from __future__ import annotations import re -from typing import Any, Hashable, List, Tuple +from typing import Any, Hashable, Iterable, Tuple from .types import Diffable @@ -20,7 +20,7 @@ def escape(token: str) -> str: class Pointer: - def __init__(self, tokens: List[Hashable] | None = None) -> None: + def __init__(self, tokens: Iterable[Hashable] | None = None) -> None: if tokens is None: tokens = [] self.tokens = tuple(tokens) From da252843b5dc7cffebfe179bdd7c8095d9d9d85a Mon Sep 17 00:00:00 2001 From: Berend Klein Haneveld Date: Wed, 19 Nov 2025 11:34:53 +0100 Subject: [PATCH 4/4] Make sure it works on older Pythons --- patchdiff/diff.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/patchdiff/diff.py b/patchdiff/diff.py index 98ec31f..10185db 100644 --- a/patchdiff/diff.py +++ b/patchdiff/diff.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Dict, List, Set, Tuple from .pointer import Pointer