diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..97623ac
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,68 @@
+name: Benchmarks
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  benchmark:
+    name: Benchmarks
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+
+      - name: Set up Python 3.14
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.14'
+
+      - name: Install dependencies
+        run: uv sync
+
+      # Restore benchmark baseline (read-only for PRs)
+      - name: Restore benchmark baseline
+        uses: actions/cache/restore@v4
+        with:
+          path: .benchmarks
+          key: benchmark-baseline-3.14-${{ runner.os }}
+
+      # On master: save baseline results
+      - name: Run benchmarks and save baseline
+        if: github.ref == 'refs/heads/master'
+        run: |
+          uv run --no-sync pytest benchmarks/benchmark.py \
+            --benchmark-only \
+            --benchmark-autosave \
+            --benchmark-sort=name
+
+      # On master: cache the new baseline results
+      - name: Save benchmark baseline
+        if: github.ref == 'refs/heads/master'
+        uses: actions/cache/save@v4
+        with:
+          path: .benchmarks
+          key: benchmark-baseline-3.14-${{ runner.os }}
+
+      # On PRs: compare against baseline and fail if degraded
+      - name: Run benchmarks and compare
+        if: github.event_name == 'pull_request'
+        run: |
+          if [ -z "$(uv run --no-sync pytest-benchmark list)" ]; then
+            echo "No baseline found, not comparing"
+            uv run --no-sync pytest -v benchmarks/benchmark.py
+            exit
+          fi
+
+          uv run --no-sync pytest benchmarks/benchmark.py \
+            --benchmark-only \
+            --benchmark-compare \
+            --benchmark-compare-fail=mean:5% \
+            --benchmark-sort=name
+
diff --git a/.gitignore b/.gitignore
index e674c48..c9dfbba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ __pycache__
 .coverage
 dist
 uv.lock
+.benchmarks/
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
new file mode 100644
index 0000000..2cca2e3
--- /dev/null
+++ b/benchmarks/benchmark.py
@@ -0,0 +1,166 @@
+"""
+Benchmark suite for patchdiff performance testing using pytest-benchmark.
+
+Run benchmarks:
+    uv run pytest benchmarks/benchmark.py --benchmark-only
+
+Save baseline:
+    uv run pytest benchmarks/benchmark.py --benchmark-only --benchmark-autosave
+
+Compare against baseline:
+    uv run pytest benchmarks/benchmark.py --benchmark-only --benchmark-compare=0001
+
+Fail if performance degrades >5%:
+    uv run pytest benchmarks/benchmark.py --benchmark-only --benchmark-compare=0001 --benchmark-compare-fail=mean:5%
+"""
+
+import random
+
+import pytest
+
+from patchdiff import apply, diff
+
+# Set seed for reproducibility
+random.seed(42)
+
+
+def generate_random_list(size: int, value_range: int = 1000) -> list[int]:
+    """Generate a random list of integers."""
+    return [random.randint(0, value_range) for _ in range(size)]
+
+
+def generate_similar_lists(
+    size: int, change_ratio: float = 0.1
+) -> tuple[list[int], list[int]]:
+    """
+    Generate two similar lists with specified change ratio.
+
+    Args:
+        size: Size of the lists
+        change_ratio: Ratio of elements that differ (0.0 to 1.0)
+    """
+    list_a = generate_random_list(size)
+    list_b = list_a.copy()
+
+    num_changes = int(size * change_ratio)
+
+    # Make some replacements
+    for _ in range(num_changes // 3):
+        idx = random.randint(0, size - 1)
+        list_b[idx] = random.randint(0, 1000)
+
+    # Make some insertions
+    for _ in range(num_changes // 3):
+        idx = random.randint(0, len(list_b))
+        list_b.insert(idx, random.randint(0, 1000))
+
+    # Make some deletions
+    for _ in range(num_changes // 3):
+        if list_b:
+            idx = random.randint(0, len(list_b) - 1)
+            del list_b[idx]
+
+    return list_a, list_b
+
+
+def generate_nested_dict(depth: int, breadth: int) -> dict | int:
+    """Generate a nested dictionary structure."""
+    if depth == 0:
+        return random.randint(0, 1000)
+
+    result = {}
+    for i in range(breadth):
+        key = f"key_{i}"
+        if random.random() > 0.3:
+            result[key] = generate_nested_dict(depth - 1, breadth)
+        else:
+            result[key] = random.randint(0, 1000)
+    return result
+
+
+# ========================================
+# List Diff Benchmarks
+# ========================================
+
+
+@pytest.mark.benchmark(group="list-diff")
+def test_list_diff_small_10pct(benchmark):
+    """Benchmark: 50 element list with 10% changes."""
+    a, b = generate_similar_lists(50, 0.1)
+    benchmark(diff, a, b)
+
+
+@pytest.mark.benchmark(group="list-diff")
+@pytest.mark.parametrize("change_ratio", [0.05, 0.1, 0.5])
+def test_list_diff_medium(benchmark, change_ratio):
+    """Benchmark: 1000 element list with varying change ratios."""
+    a, b = generate_similar_lists(1000, change_ratio)
+    benchmark(diff, a, b)
+
+
+@pytest.mark.benchmark(group="list-diff-edge")
+def test_list_diff_completely_different(benchmark):
+    """Benchmark: Two completely different 1000 element lists."""
+    a = generate_random_list(1000)
+    b = generate_random_list(1000)
+    benchmark(diff, a, b)
+
+
+@pytest.mark.benchmark(group="list-diff-edge")
+def test_list_diff_identical(benchmark):
+    """Benchmark: Two identical 10000 element lists."""
+    a = generate_random_list(10000)
+    b = a.copy()
+    benchmark(diff, a, b)
+
+
+# ========================================
+# Dict Diff Benchmarks
+# ========================================
+
+
+@pytest.mark.benchmark(group="dict-diff")
+def test_dict_diff_flat_500_keys(benchmark):
+    """Benchmark: Flat dict with 500 keys, 10% changed."""
+    a = {f"key_{i}": i for i in range(500)}
+    b = a.copy()
+    # Change 10%
+    for i in range(50):
+        b[f"key_{i}"] = i + 500
+
+    benchmark(diff, a, b)
+
+
+@pytest.mark.benchmark(group="dict-diff")
+def test_dict_diff_nested(benchmark):
+    """Benchmark: Nested dict with depth=3, breadth=5."""
+    a = generate_nested_dict(3, 5)
+    b = generate_nested_dict(3, 5)
+    benchmark(diff, a, b)
+
+
+# ========================================
+# Mixed Structure Benchmarks
+# ========================================
+
+
+@pytest.mark.benchmark(group="mixed")
+def test_mixed_dict_with_list_values(benchmark):
+    """Benchmark: Dict with 50 keys, each containing a 100-element list."""
+    a = {f"key_{i}": generate_random_list(100) for i in range(50)}
+    b = {f"key_{i}": generate_random_list(100) for i in range(50)}
+    benchmark(diff, a, b)
+
+
+# ========================================
+# Apply Benchmarks
+# ========================================
+
+
+@pytest.mark.benchmark(group="apply")
+def test_apply_list_1000_elements(benchmark):
+    """Benchmark: Apply patch to 1000 element list with 10% changes."""
+    a, b = generate_similar_lists(1000, 0.1)
+    ops, _ = diff(a, b)
+
+    benchmark(apply, a, ops)
diff --git a/pyproject.toml b/pyproject.toml
index 8dacd31..cd35aba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ dev = [
     "pytest",
     "pytest-cov",
     "pytest-watch",
+    "pytest-benchmark",
 ]
 
 [tool.ruff.lint]