Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 85 additions & 7 deletions src/kit/code_searcher.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

import logging
import os
import re
from dataclasses import dataclass
from pathlib import Path
Expand Down Expand Up @@ -34,17 +36,93 @@ def __init__(self, repo_path: str) -> None:
self.repo_path: Path = Path(repo_path)
self._gitignore_spec = self._load_gitignore() # Load gitignore spec

def _adjust_gitignore_pattern(self, pattern: str, rel_base: Path) -> str:
"""Adjust a gitignore pattern to be relative to the repository root.

Args:
pattern: The pattern from a .gitignore file (already stripped, negation removed)
rel_base: Relative path from repo root to the .gitignore directory

Returns:
The adjusted pattern prefixed with the correct path
"""
if str(rel_base) == ".":
# Pattern is in root .gitignore - use as-is
return pattern

# Pattern is in subdirectory
if pattern.startswith("/"):
# Absolute pattern (relative to gitignore dir) - make relative to repo root
return f"{rel_base}/{pattern[1:]}"
else:
# Relative pattern - applies to directory and all subdirectories
# Use /** to match files at any depth under the directory
return f"{rel_base}/**/{pattern}"
Comment on lines +54 to +60
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same pattern adjustment logic error as in repo_mapper.py - patterns with internal forward slashes are incorrectly treated as matching at any depth.

View Details
📝 Patch Details
diff --git a/src/kit/code_searcher.py b/src/kit/code_searcher.py
index 0c7e5f2..eb80bef 100644
--- a/src/kit/code_searcher.py
+++ b/src/kit/code_searcher.py
@@ -54,9 +54,11 @@ class CodeSearcher:
         if pattern.startswith("/"):
             # Absolute pattern (relative to gitignore dir) - make relative to repo root
             return f"{rel_base}/{pattern[1:]}"
+        elif "/" in pattern:
+            # Pattern with internal slash - keep relative to gitignore dir
+            return f"{rel_base}/{pattern}"
         else:
-            # Relative pattern - applies to directory and all subdirectories
-            # Use /** to match files at any depth under the directory
+            # Pattern without slash - applies to any depth
             return f"{rel_base}/**/{pattern}"
 
     def _load_gitignore(self):
diff --git a/src/kit/repo_mapper.py b/src/kit/repo_mapper.py
index 62ae31d..c3c981a 100644
--- a/src/kit/repo_mapper.py
+++ b/src/kit/repo_mapper.py
@@ -41,9 +41,11 @@ class RepoMapper:
         if pattern.startswith("/"):
             # Absolute pattern (relative to gitignore dir) - make relative to repo root
             return f"{rel_base}/{pattern[1:]}"
+        elif "/" in pattern:
+            # Pattern with internal slash - keep relative to gitignore dir
+            return f"{rel_base}/{pattern}"
         else:
-            # Relative pattern - applies to directory and all subdirectories
-            # Use /** to match files at any depth under the directory
+            # Pattern without slash - applies to any depth
             return f"{rel_base}/**/{pattern}"
 
     def _load_gitignore(self):

Analysis

Gitignore pattern adjustment logic error - patterns with internal slashes incorrectly treated as recursive

What fails: The _adjust_gitignore_pattern() method in both CodeSearcher (line 54-60) and RepoMapper (line 41-48) incorrectly adds recursive **/ matching to every pattern that does not start with a slash, including patterns that contain internal slashes. This causes patterns like src/config.js from a subdirectory .gitignore to match files at ANY depth under that directory, rather than only at the exact relative path.

How to reproduce:

  1. Create a repository structure:

    subdir/
    ├── .gitignore (contains: src/config.js)
    ├── src/
    │   └── config.js
    └── anything/
        └── src/
            └── config.js
    
  2. Run CodeSearcher or RepoMapper on this repository

  3. Check which files are ignored via _should_ignore() method

Result: Both subdir/src/config.js and subdir/anything/src/config.js are marked as ignored

Expected: Only subdir/src/config.js should be ignored. The pattern src/config.js in subdir/.gitignore means "match the file at the path src/config.js relative to the subdirectory", not "match src/config.js at any depth". This matches Git's actual behavior per gitignore documentation - a pattern containing a slash is treated as a glob pattern relative to the directory containing the .gitignore file.

Fix: The _adjust_gitignore_pattern() method now checks for internal slashes before adding **/:

  • Patterns starting with / are anchored to the gitignore directory
  • Patterns containing internal / are matched exactly relative to the gitignore directory
  • Only patterns without any slashes use **/ to match at any depth


def _load_gitignore(self):
"""Loads .gitignore rules from the repository root."""
gitignore_path = self.repo_path / ".gitignore"
if gitignore_path.exists():
"""Load all .gitignore files in repository tree and merge them.

Returns a PathSpec that respects all .gitignore files, with proper
precedence (patterns from deeper directories can override root patterns).
"""
gitignore_files = []

# Collect all .gitignore files
for dirpath, dirnames, filenames in os.walk(self.repo_path):
if ".git" in Path(dirpath).parts:
continue
if ".gitignore" in filenames:
gitignore_files.append(Path(dirpath) / ".gitignore")

if not gitignore_files:
return None

# Sort by depth (shallowest first) for correct precedence
# Git processes .gitignore files from root to leaf, so later patterns can override earlier ones
gitignore_files.sort(key=lambda p: len(p.parts))

# Collect all patterns with proper path prefixes
all_patterns = []
for gitignore_path in gitignore_files:
try:
with open(gitignore_path, "r", encoding="utf-8") as f:
return pathspec.PathSpec.from_lines("gitwildmatch", f)
patterns = f.readlines()

# Calculate relative base path from repo root
try:
rel_base = gitignore_path.parent.relative_to(self.repo_path)
except ValueError:
continue # gitignore outside repo (shouldn't happen)

# Process each pattern
for pattern in patterns:
pattern = pattern.strip()
if not pattern or pattern.startswith("#"):
continue

# Handle negation patterns
is_negation = pattern.startswith("!")
if is_negation:
pattern = pattern[1:]

# Adjust pattern to be relative to repo root
adjusted = self._adjust_gitignore_pattern(pattern, rel_base)

# Re-add negation prefix if needed
if is_negation:
adjusted = f"!{adjusted}"

all_patterns.append(adjusted)

except Exception as e:
# Log this error if logging is set up, or print
print(f"Warning: Could not load .gitignore: {e}")
return None
logging.warning(f"Could not load {gitignore_path}: {e}")
continue

if not all_patterns:
return None

# Create single merged pathspec
return pathspec.PathSpec.from_lines("gitwildmatch", all_patterns)

def _should_ignore(self, file: Path) -> bool:
"""Checks if a file should be ignored based on .gitignore rules."""
Expand Down
91 changes: 86 additions & 5 deletions src/kit/repo_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,93 @@ def __init__(self, repo_path: str) -> None:
self._file_tree: Optional[List[Dict[str, Any]]] = None
self._gitignore_spec = self._load_gitignore()

def _adjust_gitignore_pattern(self, pattern: str, rel_base: Path) -> str:
"""Adjust a gitignore pattern to be relative to the repository root.

Args:
pattern: The pattern from a .gitignore file (already stripped, negation removed)
rel_base: Relative path from repo root to the .gitignore directory

Returns:
The adjusted pattern prefixed with the correct path
"""
if str(rel_base) == ".":
# Pattern is in root .gitignore - use as-is
return pattern

# Pattern is in subdirectory
if pattern.startswith("/"):
# Absolute pattern (relative to gitignore dir) - make relative to repo root
return f"{rel_base}/{pattern[1:]}"
else:
# Relative pattern - applies to directory and all subdirectories
# Use /** to match files at any depth under the directory
return f"{rel_base}/**/{pattern}"
Comment on lines +44 to +47
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pattern adjustment logic incorrectly applies recursive matching (**/) to all patterns in subdirectories. Patterns containing internal forward slashes (like src/config.js) should match only relative to the .gitignore directory, not at any depth.

View Details
📝 Patch Details
diff --git a/src/kit/code_searcher.py b/src/kit/code_searcher.py
index 0c7e5f2..806c62d 100644
--- a/src/kit/code_searcher.py
+++ b/src/kit/code_searcher.py
@@ -54,8 +54,14 @@ class CodeSearcher:
         if pattern.startswith("/"):
             # Absolute pattern (relative to gitignore dir) - make relative to repo root
             return f"{rel_base}/{pattern[1:]}"
+        elif "/" in pattern:
+            # Pattern with internal slash - keep relative to gitignore dir
+            # According to gitignore spec: "If there is a separator at the beginning
+            # or middle of the pattern, then the pattern is relative to the directory
+            # level of the .gitignore file itself"
+            return f"{rel_base}/{pattern}"
         else:
-            # Relative pattern - applies to directory and all subdirectories
+            # Pattern without slash - applies to directory and all subdirectories
             # Use /** to match files at any depth under the directory
             return f"{rel_base}/**/{pattern}"
 
diff --git a/src/kit/repo_mapper.py b/src/kit/repo_mapper.py
index 62ae31d..a011eb3 100644
--- a/src/kit/repo_mapper.py
+++ b/src/kit/repo_mapper.py
@@ -41,8 +41,14 @@ class RepoMapper:
         if pattern.startswith("/"):
             # Absolute pattern (relative to gitignore dir) - make relative to repo root
             return f"{rel_base}/{pattern[1:]}"
+        elif "/" in pattern:
+            # Pattern with internal slash - keep relative to gitignore dir
+            # According to gitignore spec: "If there is a separator at the beginning
+            # or middle of the pattern, then the pattern is relative to the directory
+            # level of the .gitignore file itself"
+            return f"{rel_base}/{pattern}"
         else:
-            # Relative pattern - applies to directory and all subdirectories
+            # Pattern without slash - applies to directory and all subdirectories
             # Use /** to match files at any depth under the directory
             return f"{rel_base}/**/{pattern}"
 

Analysis

Incorrect recursive matching for patterns with internal slashes in subdirectory .gitignore files

What fails: The _adjust_gitignore_pattern() method in both RepoMapper and CodeSearcher incorrectly applies recursive matching (**/) to patterns containing forward slashes when they appear in subdirectory .gitignore files. This causes patterns like src/config.js to match at arbitrary depths below the .gitignore directory instead of only relative to that directory.

How to reproduce:

  1. Create a subdirectory with .gitignore containing a pattern with an internal slash: frontend/.gitignore with pattern src/config.js
  2. Create files at frontend/src/config.js and frontend/something/src/config.js
  3. Call RepoMapper.get_file_tree() or CodeSearcher to check which files are ignored
  4. The pattern matches both files when it should only match frontend/src/config.js

Expected behavior: According to git documentation on gitignore patterns: "If there is a separator at the beginning or middle of the pattern, then the pattern is relative to the directory level of the .gitignore file itself." Patterns with internal slashes should NOT be treated as wildcard patterns that match at any depth.

Root cause: There should be three cases, but the code handled only two and was missing the distinction for patterns with internal slashes:

  • Patterns starting with / (absolute): handled correctly, removes leading / and prepends relative path
  • Patterns containing / (internal slash): MISSING - was being treated same as patterns without slashes
  • Patterns without /: correctly prefixed with **/ to match at any depth

The fix adds explicit handling for patterns containing internal slashes by prepending the relative base path without **/, keeping them relative to the .gitignore directory location.


def _load_gitignore(self):
gitignore_path = self.repo_path / ".gitignore"
if gitignore_path.exists():
with open(gitignore_path) as f:
return pathspec.PathSpec.from_lines("gitwildmatch", f)
return None
"""Load all .gitignore files in repository tree and merge them.

Returns a PathSpec that respects all .gitignore files, with proper
precedence (patterns from deeper directories can override root patterns).
"""
gitignore_files = []

# Collect all .gitignore files
for dirpath, dirnames, filenames in os.walk(self.repo_path):
if ".git" in Path(dirpath).parts:
continue
if ".gitignore" in filenames:
gitignore_files.append(Path(dirpath) / ".gitignore")

if not gitignore_files:
return None

# Sort by depth (shallowest first) for correct precedence
# Git processes .gitignore files from root to leaf, so later patterns can override earlier ones
gitignore_files.sort(key=lambda p: len(p.parts))

# Collect all patterns with proper path prefixes
all_patterns = []
for gitignore_path in gitignore_files:
try:
with open(gitignore_path, "r", encoding="utf-8") as f:
patterns = f.readlines()

# Calculate relative base path from repo root
try:
rel_base = gitignore_path.parent.relative_to(self.repo_path)
except ValueError:
continue # gitignore outside repo (shouldn't happen)

# Process each pattern
for pattern in patterns:
pattern = pattern.strip()
if not pattern or pattern.startswith("#"):
continue

# Handle negation patterns
is_negation = pattern.startswith("!")
if is_negation:
pattern = pattern[1:]

# Adjust pattern to be relative to repo root
adjusted = self._adjust_gitignore_pattern(pattern, rel_base)

# Re-add negation prefix if needed
if is_negation:
adjusted = f"!{adjusted}"

all_patterns.append(adjusted)

except Exception as e:
logging.warning(f"Could not load {gitignore_path}: {e}")
continue

if not all_patterns:
return None

# Create single merged pathspec
return pathspec.PathSpec.from_lines("gitwildmatch", all_patterns)

def _should_ignore(self, file: Path) -> bool:
# Handle potential symlink resolution mismatches
Expand Down
44 changes: 44 additions & 0 deletions tests/integration/test_humanlayer_repo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pytest
from pathlib import Path
from kit.repo_mapper import RepoMapper
import subprocess


@pytest.mark.integration
@pytest.mark.skipif(
    not Path("/home/selman/dev/humanlayer").exists(),
    reason="Requires humanlayer repository",
)
def test_humanlayer_repo_gitignore():
    """Integration test: Verify fix works on actual humanlayer repo.

    Compares the number of entries RepoMapper returns with the number of
    files git tracks, and checks that no node_modules paths are included.
    Skipped unless the humanlayer checkout exists at the expected location.
    """
    repo_path = "/home/selman/dev/humanlayer"

    # Get git's file count; check=True surfaces a failing git command
    # instead of silently comparing against empty output
    result = subprocess.run(
        ["git", "ls-files"],
        cwd=repo_path,
        capture_output=True,
        text=True,
        check=True,
    )
    # Drop empty lines so empty output is not counted as one file
    git_files = {line for line in result.stdout.splitlines() if line}
    git_count = len(git_files)
    assert git_count > 0, "git ls-files returned no tracked files"

    # Get kit's file count
    mapper = RepoMapper(repo_path)
    tree = mapper.get_file_tree()
    kit_count = len(tree)
    kit_paths = {item["path"] for item in tree}

    # Should be approximately equal (within 10% tolerance for build artifacts)
    tolerance = 0.1
    assert abs(kit_count - git_count) / git_count < tolerance, \
        f"Kit returned {kit_count} files, Git tracks {git_count} files"

    # Should be well under token limit (assuming ~100 chars per file path)
    estimated_tokens = kit_count * 100
    assert estimated_tokens < 25000, \
        f"Estimated {estimated_tokens} tokens (exceeds 25k limit)"

    # Verify no node_modules files included
    node_modules_files = [p for p in kit_paths if "node_modules" in p]
    assert len(node_modules_files) == 0, \
        f"Found {len(node_modules_files)} node_modules files (should be 0)"
Loading