# src/kit/code_searcher.py -- gitignore helpers of CodeSearcher (methods shown dedented).
# NOTE(review): an identical pair of helpers exists in src/kit/repo_mapper.py;
# keep both copies in sync, or move them into one shared helper.


def _adjust_gitignore_pattern(self, pattern: str, rel_base: Path) -> str:
    """Rewrite a .gitignore pattern so it is relative to the repository root.

    Args:
        pattern: One pattern from a .gitignore file, already stripped and with
            any leading ``!`` removed by the caller.
        rel_base: Path from the repository root to the directory that holds
            the .gitignore file (``Path(".")`` for the root file).

    Returns:
        The pattern unchanged for the root .gitignore; otherwise the pattern
        prefixed with the .gitignore directory.
    """
    # as_posix() keeps "/" separators on Windows, where str(Path) uses "\\".
    base = rel_base.as_posix()
    if base == ".":
        # Root .gitignore: patterns are already relative to the repo root.
        return pattern

    # gitignore rule: a separator at the start or in the middle of a pattern
    # anchors it to the .gitignore's own directory. A trailing "/" only means
    # "directories only" and does not anchor.
    anchored = pattern.startswith("/") or "/" in pattern.rstrip("/")
    if anchored:
        relative = pattern[1:] if pattern.startswith("/") else pattern
        return f"{base}/{relative}"

    # Slash-free pattern: may match at any depth below the .gitignore directory.
    return f"{base}/**/{pattern}"


def _load_gitignore(self):
    """Load every .gitignore file under the repository and merge them.

    Files are processed shallowest-first so patterns from deeper directories
    are appended later and can override patterns from the root.

    Returns:
        A ``pathspec.PathSpec`` built from all collected patterns, or ``None``
        when no .gitignore file exists or none of them contributes a pattern.
    """
    # Function-scope imports keep this method self-contained.
    import logging
    import os

    gitignore_files = []
    for dirpath, dirnames, filenames in os.walk(self.repo_path):
        # Prune in place so os.walk never descends into git's metadata
        # directory. Testing the absolute path for a ".git" component would
        # also skip a repository that merely lives below such a directory.
        dirnames[:] = [name for name in dirnames if name != ".git"]
        if ".gitignore" in filenames:
            gitignore_files.append(Path(dirpath) / ".gitignore")

    if not gitignore_files:
        return None

    # Shallowest first; the path is a tie-breaker so the merged pattern order
    # does not depend on the order in which os.walk lists directories.
    gitignore_files.sort(key=lambda p: (len(p.parts), p.as_posix()))

    all_patterns = []
    for gitignore_path in gitignore_files:
        try:
            rel_base = gitignore_path.parent.relative_to(self.repo_path)
        except ValueError:
            continue  # .gitignore outside the repo (shouldn't happen)

        # Only the read can fail; an unreadable file is skipped (best effort).
        try:
            with open(gitignore_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
        except (OSError, UnicodeDecodeError) as exc:
            logging.getLogger(__name__).warning("Could not load %s: %s", gitignore_path, exc)
            continue

        for line in lines:
            pattern = line.strip()
            if not pattern or pattern.startswith("#"):
                continue  # blank line or comment

            # Strip the negation marker, adjust the bare pattern, then restore it.
            is_negation = pattern.startswith("!")
            if is_negation:
                pattern = pattern[1:]
                if not pattern:
                    continue  # a bare "!" carries no pattern

            adjusted = self._adjust_gitignore_pattern(pattern, rel_base)
            all_patterns.append(f"!{adjusted}" if is_negation else adjusted)

    if not all_patterns:
        return None

    # One merged spec; pattern order encodes the precedence described above.
    return pathspec.PathSpec.from_lines("gitwildmatch", all_patterns)
# src/kit/repo_mapper.py -- gitignore helpers of RepoMapper (methods shown dedented).
# NOTE(review): an identical pair of helpers exists in src/kit/code_searcher.py;
# keep both copies in sync, or move them into one shared helper.


def _adjust_gitignore_pattern(self, pattern: str, rel_base: Path) -> str:
    """Rewrite a .gitignore pattern so it is relative to the repository root.

    Args:
        pattern: One pattern from a .gitignore file, already stripped and with
            any leading ``!`` removed by the caller.
        rel_base: Path from the repository root to the directory that holds
            the .gitignore file (``Path(".")`` for the root file).

    Returns:
        The pattern unchanged for the root .gitignore; otherwise the pattern
        prefixed with the .gitignore directory.
    """
    # as_posix() keeps "/" separators on Windows, where str(Path) uses "\\".
    base = rel_base.as_posix()
    if base == ".":
        # Root .gitignore: patterns are already relative to the repo root.
        return pattern

    # gitignore rule: a separator at the start or in the middle of a pattern
    # anchors it to the .gitignore's own directory. A trailing "/" only means
    # "directories only" and does not anchor.
    anchored = pattern.startswith("/") or "/" in pattern.rstrip("/")
    if anchored:
        relative = pattern[1:] if pattern.startswith("/") else pattern
        return f"{base}/{relative}"

    # Slash-free pattern: may match at any depth below the .gitignore directory.
    return f"{base}/**/{pattern}"


def _load_gitignore(self):
    """Load every .gitignore file under the repository and merge them.

    Files are processed shallowest-first so patterns from deeper directories
    are appended later and can override patterns from the root.

    Returns:
        A ``pathspec.PathSpec`` built from all collected patterns, or ``None``
        when no .gitignore file exists or none of them contributes a pattern.
    """
    # NOTE(review): the visible repo_mapper.py hunk adds no `import os` or
    # `import logging`; importing here avoids a NameError if the module does
    # not already import them -- confirm against the full file.
    import logging
    import os

    gitignore_files = []
    for dirpath, dirnames, filenames in os.walk(self.repo_path):
        # Prune in place so os.walk never descends into git's metadata
        # directory. Testing the absolute path for a ".git" component would
        # also skip a repository that merely lives below such a directory.
        dirnames[:] = [name for name in dirnames if name != ".git"]
        if ".gitignore" in filenames:
            gitignore_files.append(Path(dirpath) / ".gitignore")

    if not gitignore_files:
        return None

    # Shallowest first; the path is a tie-breaker so the merged pattern order
    # does not depend on the order in which os.walk lists directories.
    gitignore_files.sort(key=lambda p: (len(p.parts), p.as_posix()))

    all_patterns = []
    for gitignore_path in gitignore_files:
        try:
            rel_base = gitignore_path.parent.relative_to(self.repo_path)
        except ValueError:
            continue  # .gitignore outside the repo (shouldn't happen)

        # Only the read can fail; an unreadable file is skipped (best effort).
        try:
            with open(gitignore_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
        except (OSError, UnicodeDecodeError) as exc:
            logging.getLogger(__name__).warning("Could not load %s: %s", gitignore_path, exc)
            continue

        for line in lines:
            pattern = line.strip()
            if not pattern or pattern.startswith("#"):
                continue  # blank line or comment

            # Strip the negation marker, adjust the bare pattern, then restore it.
            is_negation = pattern.startswith("!")
            if is_negation:
                pattern = pattern[1:]
                if not pattern:
                    continue  # a bare "!" carries no pattern

            adjusted = self._adjust_gitignore_pattern(pattern, rel_base)
            all_patterns.append(f"!{adjusted}" if is_negation else adjusted)

    if not all_patterns:
        return None

    # One merged spec; pattern order encodes the precedence described above.
    return pathspec.PathSpec.from_lines("gitwildmatch", all_patterns)
# ---------------------------------------------------------------------------
# tests/integration/test_humanlayer_repo.py
# ---------------------------------------------------------------------------
import os
import subprocess
from pathlib import Path

import pytest

from kit.repo_mapper import RepoMapper

# Checkout used by the integration test. The default is the original
# developer path; set HUMANLAYER_REPO to run the test on another machine.
HUMANLAYER_REPO = Path(os.environ.get("HUMANLAYER_REPO", "/home/selman/dev/humanlayer"))


@pytest.mark.integration
@pytest.mark.skipif(
    not HUMANLAYER_REPO.exists(),
    reason="Requires humanlayer repository",
)
def test_humanlayer_repo_gitignore():
    """Integration test: Verify fix works on actual humanlayer repo."""
    # Git's view of the repository. check=True surfaces a failing git call as
    # an error instead of letting an empty stdout look like a file list.
    result = subprocess.run(
        ["git", "ls-files"],
        cwd=HUMANLAYER_REPO,
        capture_output=True,
        text=True,
        check=True,
    )
    # splitlines() yields [] for empty output, where split("\n") yields [""].
    git_files = set(result.stdout.splitlines())
    git_count = len(git_files)
    assert git_count > 0, "git ls-files returned no tracked files"

    # Kit's view of the repository.
    tree = RepoMapper(str(HUMANLAYER_REPO)).get_file_tree()
    kit_count = len(tree)
    kit_paths = {item["path"] for item in tree}

    # Should be approximately equal (within 10% tolerance for build artifacts)
    tolerance = 0.1
    assert abs(kit_count - git_count) / git_count < tolerance, (
        f"Kit returned {kit_count} files, Git tracks {git_count} files"
    )

    # Should be well under token limit (assuming ~100 chars per file path)
    estimated_tokens = kit_count * 100
    assert estimated_tokens < 25000, (
        f"Estimated {estimated_tokens} tokens (exceeds 25k limit)"
    )

    # Verify no node_modules files included
    node_modules_files = [p for p in kit_paths if "node_modules" in p]
    assert len(node_modules_files) == 0, (
        f"Found {len(node_modules_files)} node_modules files (should be 0)"
    )


# ---------------------------------------------------------------------------
# tests/test_gitignore.py
# ---------------------------------------------------------------------------
import tempfile
from pathlib import Path

import pytest

from kit.repo_mapper import RepoMapper


@pytest.fixture
def repo():
    """Yield an empty temporary directory that stands in for a repository root."""
    with tempfile.TemporaryDirectory() as tmpdir:
        yield Path(tmpdir)


def _touch(path: Path) -> None:
    """Create an empty file at *path*, creating missing parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.touch()


def _tree_paths(repo_root: Path) -> list:
    """Return the "path" value of every entry in RepoMapper's file tree."""
    return [item["path"] for item in RepoMapper(str(repo_root)).get_file_tree()]


def test_root_gitignore_only(repo):
    """Test basic root .gitignore works as before."""
    (repo / ".gitignore").write_text("*.pyc\n__pycache__/\n")

    _touch(repo / "test.py")
    _touch(repo / "test.pyc")
    _touch(repo / "__pycache__" / "test.pyc")

    # Should only include test.py, not .pyc or __pycache__
    paths = _tree_paths(repo)
    assert "test.py" in paths
    assert "test.pyc" not in paths
    assert "__pycache__/test.pyc" not in paths


def test_subdirectory_gitignore(repo):
    """Test subdirectory .gitignore files are respected."""
    frontend = repo / "frontend"
    frontend.mkdir()
    (frontend / ".gitignore").write_text("node_modules/\n*.log\n")

    _touch(frontend / "app.js")
    _touch(frontend / "debug.log")
    _touch(frontend / "node_modules" / "package.json")

    # Should include app.js but not debug.log or node_modules
    paths = _tree_paths(repo)
    assert "frontend/app.js" in paths
    assert "frontend/debug.log" not in paths
    assert "frontend/node_modules/package.json" not in paths


def test_nested_gitignore_precedence(repo):
    """Test deeper .gitignore files override shallower ones."""
    # Root ignores *.tmp; the subdirectory re-allows it with a negation.
    (repo / ".gitignore").write_text("*.tmp\n")
    special = repo / "special"
    special.mkdir()
    (special / ".gitignore").write_text("!*.tmp\n")

    _touch(repo / "root.tmp")
    _touch(special / "special.tmp")

    # Root .tmp should be ignored, but special/ .tmp should be included
    paths = _tree_paths(repo)
    assert "root.tmp" not in paths
    assert "special/special.tmp" in paths  # Negation pattern


def test_multiple_subdirectory_gitignores(repo):
    """Test multiple subdirectories each with .gitignore files."""
    # Frontend with node_modules
    frontend = repo / "frontend"
    frontend.mkdir()
    (frontend / ".gitignore").write_text("node_modules/\n")
    _touch(frontend / "app.js")
    _touch(frontend / "node_modules" / "react.js")

    # Backend with venv
    backend = repo / "backend"
    backend.mkdir()
    (backend / ".gitignore").write_text("venv/\n__pycache__/\n")
    _touch(backend / "main.py")
    _touch(backend / "venv" / "python")

    paths = _tree_paths(repo)

    # Should include source files
    assert "frontend/app.js" in paths
    assert "backend/main.py" in paths

    # Should exclude ignored directories
    assert "frontend/node_modules/react.js" not in paths
    assert "backend/venv/python" not in paths


def test_no_gitignore_files(repo):
    """Test repository with no .gitignore files."""
    _touch(repo / "test.py")
    _touch(repo / "src" / "main.py")

    # All files should be included
    paths = _tree_paths(repo)
    assert "test.py" in paths
    assert "src/main.py" in paths


def test_code_searcher_respects_subdirectory_gitignore(repo):
    """Test CodeSearcher also respects subdirectory .gitignore files."""
    from kit.code_searcher import CodeSearcher

    src = repo / "src"
    src.mkdir()
    (src / ".gitignore").write_text("*.log\n")

    # Every file carries the same searchable content.
    for target in (repo / "root.py", src / "code.py", src / "debug.log"):
        target.write_text("search_pattern")

    results = CodeSearcher(str(repo)).search_text("search_pattern")

    # Should find matches in .py but not .log
    files = [r["file"] for r in results]
    assert "root.py" in files
    assert "src/code.py" in files
    assert "src/debug.log" not in files


def test_absolute_patterns_in_subdirectory(repo):
    """Test absolute patterns (starting with /) in subdirectory .gitignore."""
    frontend = repo / "frontend"
    frontend.mkdir()
    (frontend / ".gitignore").write_text("/build/\n")

    _touch(frontend / "src" / "app.js")
    _touch(frontend / "build" / "bundle.js")
    _touch(frontend / "src" / "build" / "config.js")

    # /build/ should only ignore frontend/build/, not frontend/src/build/
    paths = _tree_paths(repo)
    assert "frontend/src/app.js" in paths
    assert "frontend/build/bundle.js" not in paths
    assert "frontend/src/build/config.js" in paths


def test_complex_negation_patterns(repo):
    """Test complex negation scenarios."""
    # Root ignores all .env files; config/ re-allows .env.example files.
    (repo / ".gitignore").write_text("*.env\n")
    config = repo / "config"
    config.mkdir()
    (config / ".gitignore").write_text("!*.env.example\n")

    _touch(repo / "root.env")
    _touch(repo / "README.md")
    _touch(config / "app.env")
    _touch(config / "template.env.example")

    paths = _tree_paths(repo)
    assert "README.md" in paths
    assert "root.env" not in paths
    assert "config/app.env" not in paths
    assert "config/template.env.example" in paths  # Negation allows it


def test_patterns_without_wildcards_match_at_any_depth(repo):
    """Test that patterns without wildcards (like 'node_modules') match at any depth."""
    # Plain 'node_modules' (no wildcards, no slash) in frontend/.gitignore
    frontend = repo / "frontend"
    frontend.mkdir()
    (frontend / ".gitignore").write_text("node_modules\n")

    _touch(frontend / "package.json")
    # node_modules as a direct child, nested in src, and deeply nested
    _touch(frontend / "node_modules" / "pkg1.json")
    _touch(frontend / "src" / "node_modules" / "pkg2.json")
    _touch(frontend / "src" / "components" / "ui" / "node_modules" / "pkg3.json")

    paths = _tree_paths(repo)

    # package.json should be included
    assert "frontend/package.json" in paths

    # All node_modules at any depth should be ignored
    assert "frontend/node_modules/pkg1.json" not in paths
    assert "frontend/src/node_modules/pkg2.json" not in paths
    assert "frontend/src/components/ui/node_modules/pkg3.json" not in paths


def test_deeply_nested_gitignores(repo):
    """Test .gitignore files at multiple depth levels."""
    # Root .gitignore
    (repo / ".gitignore").write_text("*.tmp\n")

    # Level 1
    level1 = repo / "level1"
    level1.mkdir()
    (level1 / ".gitignore").write_text("*.cache\n")
    for name in ("file.txt", "file.tmp", "file.cache"):
        _touch(level1 / name)

    # Level 2
    level2 = level1 / "level2"
    level2.mkdir()
    (level2 / ".gitignore").write_text("!*.tmp\n")  # Re-allow .tmp here
    for name in ("deep.txt", "deep.tmp", "deep.cache"):
        _touch(level2 / name)

    paths = _tree_paths(repo)
    assert "level1/file.txt" in paths
    assert "level1/file.tmp" not in paths  # Ignored by root
    assert "level1/file.cache" not in paths  # Ignored by level1
    assert "level1/level2/deep.txt" in paths
    assert "level1/level2/deep.tmp" in paths  # Negation allows it
    assert "level1/level2/deep.cache" not in paths  # Still ignored by level1