diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index c7949bd3..1e90df52 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -599,9 +599,9 @@ def main() -> None:
     parser.set_defaults(dataset="wentingzhao/commit0_combined")
     args = parser.parse_args()

-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -633,7 +633,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=None,
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=create_critic(args),
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/gaia/README.md b/benchmarks/gaia/README.md
index 4b1da6a0..ba46e934 100644
--- a/benchmarks/gaia/README.md
+++ b/benchmarks/gaia/README.md
@@ -62,7 +62,7 @@ uv run python -m benchmarks.gaia.get_score --file outputs/gaia/output.jsonl
 - `--output-dir`: Base directory for outputs (default: `outputs`)
 - `--n-limit`: Limit number of instances to evaluate (default: 0 = all)
 - `--num-workers`: Number of parallel workers (default: 1)
-- `--max-attempts`: Maximum attempts for iterative mode (default: 1)
+- `--n-critic-runs`: Number of critic evaluation runs for iterative mode (default: 1)
 - `--note`: Optional note to add to output directory name
diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index 9a0a700d..55593fc2 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -558,8 +558,8 @@ def main() -> None:
     logger.info(f"Using critic: {type(critic).__name__}")

     # Validate arguments
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

     # Load LLM config
     llm_config_path = args.llm_config_path
@@ -591,7 +591,7 @@ def main() -> None:
         eval_output_dir=structured_output_dir,
         details={"level": args.level},
         eval_limit=args.n_limit,
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         workspace_type=args.workspace,
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index 710dd359..fcc184d0 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -436,9 +436,9 @@ def main() -> None:
     )
     args = parser.parse_args()

-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -475,7 +475,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index 04890359..9238b02e 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -537,8 +537,8 @@ def main() -> None:
     args = parser.parse_args()

     # Validate args
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

     # Load LLM config
     llm_config_path = args.llm_config_path
@@ -577,7 +577,7 @@ def main() -> None:
             "platform": "linux/amd64",
         },
         eval_limit=args.n_limit,
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
     )
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 960a5a42..fed3dc18 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -327,9 +327,9 @@ def main() -> None:
     )
     args = parser.parse_args()

-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -365,7 +365,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 7e09f54b..5fd773e1 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -392,9 +392,9 @@ def main() -> None:
     parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev")
     args = parser.parse_args()

-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -430,7 +430,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/swtbench/README.md b/benchmarks/swtbench/README.md
index c50d1e03..7c7e2d7d 100644
--- a/benchmarks/swtbench/README.md
+++ b/benchmarks/swtbench/README.md
@@ -31,7 +31,7 @@ python -m benchmarks.swtbench.run_infer \
   --max-iterations 30 \
   --select instances.txt \
   --eval-output-dir ./evaluation_results \
-  --max-attempts 3 \
+  --n-critic-runs 3 \
   --critic finish_with_patch
 ```
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index c8cb3815..7ca06be9 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -356,9 +356,9 @@ def main() -> None:
     )
     args = parser.parse_args()

-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -392,7 +392,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
index 60f08d73..c4619021 100644
--- a/benchmarks/utils/args_parser.py
+++ b/benchmarks/utils/args_parser.py
@@ -54,10 +54,10 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
         help="Limit number of instances to evaluate",
     )
     parser.add_argument(
-        "--max-attempts",
+        "--n-critic-runs",
         type=int,
         default=3,
-        help="Maximum number of attempts for iterative mode (default: 3, min: 1)",
+        help="Number of critic evaluation runs for iterative mode (default: 3, min: 1)",
     )

     # Add critic arguments
diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py
index cf4ba2cc..c818406c 100644
--- a/benchmarks/utils/evaluation.py
+++ b/benchmarks/utils/evaluation.py
@@ -185,13 +185,13 @@ def run(
         """
         Run evaluation with iterative mode support.

-        If max_attempts > 1, will retry failed instances multiple times.
-        If max_attempts == 1, will run once without retries.
+        If n_critic_runs > 1, will retry failed instances multiple times.
+        If n_critic_runs == 1, will run once without retries.
         """
         logger.info("Starting evaluation (process pool)")
         logger.info("metadata=%s", self.metadata)
         logger.info("workers=%d", self.num_workers)
-        logger.info("max_attempts=%d", self.metadata.max_attempts)
+        logger.info("n_critic_runs=%d", self.metadata.n_critic_runs)

         # Use iterative mode for all cases
         return self._run_iterative_mode(on_result=on_result)
@@ -276,9 +276,9 @@ def _run_iterative_mode(
         critic = self.metadata.critic
         all_outputs: List[EvalOutput] = []

-        for attempt in range(1, self.metadata.max_attempts + 1):
+        for attempt in range(1, self.metadata.n_critic_runs + 1):
             self.current_attempt = attempt
-            logger.info(f"Starting attempt {attempt}/{self.metadata.max_attempts}")
+            logger.info(f"Starting attempt {attempt}/{self.metadata.n_critic_runs}")
             instances_to_process = self._get_instances_for_attempt(
                 attempt, all_instances, critic
             )
@@ -393,14 +393,14 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
         logger.info("Aggregating results from all attempts")
         aggregate_results(
             output_dir=self.metadata.eval_output_dir,
-            max_attempts=self.metadata.max_attempts,
+            n_critic_runs=self.metadata.n_critic_runs,
             critic=self.metadata.critic,
             final_output_file="output.jsonl",
         )

         logger.info(
             f"Evaluation complete: {total_instances} total instances, "
-            f"{self.metadata.max_attempts} max attempts"
+            f"{self.metadata.n_critic_runs} max attempts"
         )

         return all_outputs
diff --git a/benchmarks/utils/iterative.py b/benchmarks/utils/iterative.py
index 02560e76..c72810ff 100644
--- a/benchmarks/utils/iterative.py
+++ b/benchmarks/utils/iterative.py
@@ -81,7 +81,7 @@ def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstan

 def aggregate_results(
     output_dir: str,
-    max_attempts: int,
+    n_critic_runs: int,
     critic: "CriticBase",
     final_output_file: str = "output.jsonl",
 ) -> None:
@@ -93,17 +93,17 @@ def aggregate_results(

     Args:
         output_dir: Directory containing attempt files
-        max_attempts: Maximum number of attempts
+        n_critic_runs: Maximum number of attempts
         critic: Critic instance to use for evaluation
         final_output_file: Name of the final output file
     """
-    logger.info(f"Aggregating results from {max_attempts} attempts")
+    logger.info(f"Aggregating results from {n_critic_runs} attempts")

     # Dictionary to store the best result for each instance
     best_results: dict[EvalInstanceID, EvalOutput] = {}

     # Work backwards from the last attempt to the first
-    for attempt in range(max_attempts, 0, -1):
+    for attempt in range(n_critic_runs, 0, -1):
         attempt_file = os.path.join(
             output_dir, f"output.critic_attempt_{attempt}.jsonl"
         )
diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py
index f04b405d..b5ed5d68 100644
--- a/benchmarks/utils/models.py
+++ b/benchmarks/utils/models.py
@@ -27,8 +27,8 @@ class EvalMetadata(BaseModel):
     eval_limit: int = Field(
         default=0, description="Number of instances to evaluate, 0 means all"
     )
-    max_attempts: int = Field(
-        default=1, ge=1, description="Maximum number of attempts for iterative mode"
+    n_critic_runs: int = Field(
+        default=1, ge=1, description="Number of critic evaluation runs for iterative mode"
     )
     critic: CriticBase = Field(
         description=(
diff --git a/tests/test_aggregate_results.py b/tests/test_aggregate_results.py
index 4ace3462..8a5d096b 100644
--- a/tests/test_aggregate_results.py
+++ b/tests/test_aggregate_results.py
@@ -98,7 +98,7 @@ def test_prefers_non_error_over_error_when_last_attempt_errors(
             f.write(output_3.model_dump_json() + "\n")

         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

         # Verify output.jsonl contains the instance (not dropped)
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -135,7 +135,7 @@ def test_prefers_critic_success_over_non_error_critic_fail(self, temp_output_dir
             f.write(output.model_dump_json() + "\n")

         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

         # Verify output.jsonl contains the instance
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -185,7 +185,7 @@ def test_multiple_instances_with_mixed_results(self, temp_output_dir):
             f.write(create_output("instance_3", error=None).model_dump_json() + "\n")

         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

         # Verify all instances appear in output.jsonl
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -214,7 +214,7 @@ def test_all_attempts_error_instance_dropped(self, temp_output_dir):
             f.write(output.model_dump_json() + "\n")

         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

         # Verify output.jsonl is empty (instance dropped because all attempts errored)
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -228,7 +228,7 @@ def test_empty_attempts(self, temp_output_dir):
         critic = PassCritic()

         # Run aggregation with no attempt files
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

         # Verify output.jsonl is created but empty
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
diff --git a/tests/test_iterative_resume.py b/tests/test_iterative_resume.py
index 93b0e917..2ae43906 100644
--- a/tests/test_iterative_resume.py
+++ b/tests/test_iterative_resume.py
@@ -58,8 +58,8 @@ def test_iterative_resume_with_expanded_n_limit():
     Test that iterative evaluation correctly handles resume when n-limit is expanded.

     Scenario:
-    1. First run: Process 50 instances with max_attempts=3
-    2. Second run: Expand to 200 instances with max_attempts=3
+    1. First run: Process 50 instances with n_critic_runs=3
+    2. Second run: Expand to 200 instances with n_critic_runs=3

     Expected behavior:
     - The 150 new instances (51-200) should be processed starting from attempt 1
@@ -109,7 +109,7 @@ def test_iterative_resume_with_expanded_n_limit():
         eval_output_dir=tmpdir,
         details={},
         eval_limit=200,
-        max_attempts=3,
+        n_critic_runs=3,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -191,7 +191,7 @@ def test_iterative_resume_with_same_n_limit():
         eval_output_dir=tmpdir,
         details={},
         eval_limit=50,
-        max_attempts=3,
+        n_critic_runs=3,
         max_retries=0,
         critic=PassCritic(),
     )
diff --git a/tests/test_keyboard_interrupt.py b/tests/test_keyboard_interrupt.py
index 3c669e54..e8940b0c 100644
--- a/tests/test_keyboard_interrupt.py
+++ b/tests/test_keyboard_interrupt.py
@@ -74,7 +74,7 @@ def evaluate_instance(
         eval_output_dir="{tmpdir}",
         details={{}},
         eval_limit=0,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
diff --git a/tests/test_workspace_cleanup.py b/tests/test_workspace_cleanup.py
index 1af4e764..646765bb 100644
--- a/tests/test_workspace_cleanup.py
+++ b/tests/test_workspace_cleanup.py
@@ -36,7 +36,7 @@ def test_workspace_cleanup_called_on_success():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -87,7 +87,7 @@ def test_workspace_cleanup_called_on_failure():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -147,7 +147,7 @@ def test_workspace_cleanup_handles_cleanup_exception():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -203,7 +203,7 @@ def create_mock_workspace():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=2,
         critic=PassCritic(),
     )