diff --git a/review_roadmap/agent/nodes.py b/review_roadmap/agent/nodes.py
index ae81c41..71b56c9 100644
--- a/review_roadmap/agent/nodes.py
+++ b/review_roadmap/agent/nodes.py
@@ -312,6 +312,53 @@ def _build_fetched_content_str(fetched_content: Dict[str, str]) -> str:
     return "".join(parts)
 
 
+# Maximum characters per diff before truncation
+MAX_DIFF_CHARS = 1500
+# Maximum total characters for all diffs combined
+MAX_TOTAL_DIFF_CHARS = 80000
+
+
+def _build_diffs_context(state: ReviewState) -> str:
+    """Build formatted diff content for all changed files.
+
+    Includes the actual unified diff for each file so the LLM can see
+    the code changes, not just file names. Large diffs are truncated
+    to manage token limits.
+
+    Args:
+        state: Current workflow state containing PR context.
+
+    Returns:
+        Formatted string with all diffs, suitable for LLM prompt.
+    """
+    if not state.pr_context.files:
+        return "No files changed."
+
+    parts = []
+    total_chars = 0
+
+    for f in state.pr_context.files:
+        if not f.diff_content:
+            # Binary files or very large files may not have diff content
+            file_section = f"### {f.path} ({f.status}, +{f.additions}/-{f.deletions})\n[No diff available - binary or large file]\n"
+        else:
+            diff = f.diff_content
+            if len(diff) > MAX_DIFF_CHARS:
+                diff = diff[:MAX_DIFF_CHARS] + f"\n... (truncated, {len(f.diff_content)} chars total)"
+            file_section = f"### {f.path} ({f.status}, +{f.additions}/-{f.deletions})\n```diff\n{diff}\n```\n"
+
+        # Check if adding this would exceed our total budget
+        if total_chars + len(file_section) > MAX_TOTAL_DIFF_CHARS:
+            remaining = len(state.pr_context.files) - len(parts)
+            parts.append(f"\n... ({remaining} more files not shown due to size limits)\n")
+            break
+
+        parts.append(file_section)
+        total_chars += len(file_section)
+
+    return "\n".join(parts)
+
+
 def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
     """Generate the final Markdown review roadmap.
 
@@ -332,6 +379,7 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
                  iteration=state.reflection_iterations)
 
     files_context = _build_files_context(state)
+    diffs_context = _build_diffs_context(state)
     comments_context = _build_comments_context(state)
     fetched_context_str = _build_fetched_content_str(state.fetched_content)
 
@@ -357,9 +405,12 @@ def draft_roadmap(state: ReviewState) -> Dict[str, Any]:
     Topology Analysis: {state.topology.get('analysis', 'No analysis')}
 
-    Files (with base links):
+    Files (with deep links for review):
    {chr(10).join(files_context)}
 
+    ## File Diffs (actual code changes)
+    {diffs_context}
+
     Existing Comments:
     {chr(10).join(comments_context) if comments_context else "No comments found."}
     {fetched_context_str}
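To make the truncation and budget behavior of `_build_diffs_context` concrete, here is a minimal, runnable sketch of the same logic. `FakeFile` and the tiny character limits are hypothetical stand-ins chosen so both truncation paths show up in the output; the real code operates on the project's `FileDiff` model with the 1500/80000 limits and additionally wraps each diff in a fenced diff block.

```python
from dataclasses import dataclass

# Tiny limits so both truncation paths are visible
# (the real values are MAX_DIFF_CHARS = 1500, MAX_TOTAL_DIFF_CHARS = 80000).
MAX_DIFF_CHARS = 20
MAX_TOTAL_DIFF_CHARS = 100


@dataclass
class FakeFile:
    """Hypothetical stand-in for the project's FileDiff model."""
    path: str
    status: str
    additions: int
    deletions: int
    diff_content: str


def build_diffs_context(files: list[FakeFile]) -> str:
    parts, total_chars = [], 0
    for f in files:
        # Per-file truncation keeps any single huge diff from dominating the prompt.
        diff = f.diff_content
        if len(diff) > MAX_DIFF_CHARS:
            diff = diff[:MAX_DIFF_CHARS] + f"\n... (truncated, {len(f.diff_content)} chars total)"
        section = f"### {f.path} ({f.status}, +{f.additions}/-{f.deletions})\n{diff}\n"
        # Global budget: stop adding files once the combined context gets too large.
        if total_chars + len(section) > MAX_TOTAL_DIFF_CHARS:
            parts.append(f"\n... ({len(files) - len(parts)} more files not shown due to size limits)\n")
            break
        parts.append(section)
        total_chars += len(section)
    return "\n".join(parts)


files = [
    FakeFile("a.py", "modified", 3, 1, "+x = 1\n-x = 0\n" * 5),  # long diff, gets truncated
    FakeFile("b.py", "added", 10, 0, "+print('hi')\n"),          # pushed out by the total budget
]
print(build_diffs_context(files))
```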
diff --git a/review_roadmap/agent/prompts.py b/review_roadmap/agent/prompts.py
index 8a709fc..4ccea59 100644
--- a/review_roadmap/agent/prompts.py
+++ b/review_roadmap/agent/prompts.py
@@ -15,12 +15,27 @@
 You have the PR metadata, file list, and topology.
 Review the "High Risk" files or ambiguous changes.
 
-If you need to see the FULL content of a file (not just the diff) to understand it, use the `read_file` tool.
-For example:
-- If a class hierarchy changed, read the parent class file.
-- If a complex logic change involves helper functions, read the file definition.
+## The `read_file` Tool
 
-Do not fetch files unless necessary. If the diff is sufficient, just return "DONE".
+You can use `read_file` to fetch the FULL content of ANY file in the repository - not just files in the PR diff. This is useful for:
+
+1. **Verifying imports exist**: If the PR imports from modules not in the diff (e.g., `from myproject.utils import helper`), fetch the file to confirm it exists and understand its interface.
+
+2. **Understanding parent classes**: If a class inherits from a base class not in the diff, fetch it to understand the contract.
+
+3. **Checking helper functions**: If the PR calls functions defined elsewhere, fetch them to understand the context.
+
+4. **Validating configuration references**: If code references config files or constants, fetch them.
+
+## Examples
+- PR imports `from temperature_agent.tools.memory import store_house_knowledge` → fetch `temperature_agent/tools/memory.py`
+- PR extends `BaseProcessor` from `core/processors.py` → fetch `core/processors.py`
+- PR uses `settings.API_KEY` → fetch `config/settings.py`
+
+## Guidelines
+- Do not fetch files unless necessary. If the diff is self-explanatory, just return "DONE".
+- Prioritize fetching files that help verify the PR is complete (e.g., are all required interfaces implemented?).
+- If a file doesn't exist, that's valuable information - it suggests missing files in the PR.
 """
 
 DRAFT_ROADMAP_SYSTEM_PROMPT = """You are a benevolent Senior Staff Engineer guiding a junior reviewer.
diff --git a/review_roadmap/agent/tools.py b/review_roadmap/agent/tools.py
index 0075701..eaf074c 100644
--- a/review_roadmap/agent/tools.py
+++ b/review_roadmap/agent/tools.py
@@ -6,9 +6,20 @@
 @tool
 def read_file(path: str) -> Optional[str]:
     """
-    Reads the full content of a specific file from the PR.
-    Use this when you need to see the code context surrounding a change,
-    or to understand a whole class/module definition.
+    Reads the full content of ANY file from the repository at the PR's head commit.
+
+    This fetches files from the entire codebase, not just files in the PR diff.
+    Use this to:
+    - Verify that imported modules exist (e.g., fetch 'myproject/utils/helpers.py')
+    - Understand parent classes or interfaces referenced by the PR
+    - Check helper functions or utilities called by changed code
+    - Confirm configuration or constant values referenced in the PR
+
+    Args:
+        path: Full path to the file from the repository root (e.g., 'src/utils/auth.py')
+
+    Returns:
+        The file content, or an error message if the file doesn't exist.
     """
     # The actual implementation happens in the node, this is just for schema binding
     return None
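`read_file` itself is only a schema stub; the node that handles the tool call performs the real fetch. As a rough sketch of what that node-side lookup could do, the following fetches a file at a given commit through the GitHub contents API. The helper name, signature, and error-string format are illustrative assumptions rather than the project's actual implementation; only the `/repos/{owner}/{repo}/contents/{path}` endpoint and its base64-encoded `content` field are standard GitHub API behavior.

```python
import base64

import httpx


def fetch_file_at_ref(client: httpx.Client, owner: str, repo: str,
                      path: str, ref: str) -> str:
    """Fetch one file's text at a specific commit via the GitHub contents API.

    `client` is assumed to be pre-configured with base_url="https://api.github.com"
    and an Authorization header, like the project's GitHub client.
    """
    resp = client.get(f"/repos/{owner}/{repo}/contents/{path}", params={"ref": ref})
    if resp.status_code == 404:
        # Matches the tool contract: a missing file returns an error message,
        # which is itself a useful review signal.
        return f"ERROR: '{path}' does not exist at {ref}"
    resp.raise_for_status()
    data = resp.json()
    # Regular files come back base64-encoded in the "content" field.
    return base64.b64decode(data["content"]).decode("utf-8")
```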
""" - files_resp = self.client.get(f"/repos/{owner}/{repo}/pulls/{pr_number}/files") - files_resp.raise_for_status() + all_files: List[FileDiff] = [] + page = 1 + per_page = 100 # Max allowed by GitHub API - return [ - FileDiff( - path=f["filename"], - status=f["status"], - additions=f["additions"], - deletions=f["deletions"], - diff_content=f.get("patch", "") # Patch might be missing for binary/large files + while True: + files_resp = self.client.get( + f"/repos/{owner}/{repo}/pulls/{pr_number}/files", + params={"page": page, "per_page": per_page} ) - for f in files_resp.json() - ] + files_resp.raise_for_status() + files_data = files_resp.json() + + if not files_data: + break # No more files + + all_files.extend([ + FileDiff( + path=f["filename"], + status=f["status"], + additions=f["additions"], + deletions=f["deletions"], + diff_content=f.get("patch", "") # Patch might be missing for binary/large files + ) + for f in files_data + ]) + + # If we got fewer than per_page, we've reached the last page + if len(files_data) < per_page: + break + + page += 1 + + return all_files def _fetch_issue_comments(self, owner: str, repo: str, pr_number: int) -> List[PRComment]: """Fetch general conversation comments from the issues endpoint.