8 changes: 4 additions & 4 deletions benchmarks/commit0/run_infer.py
@@ -599,9 +599,9 @@ def main() -> None:
     parser.set_defaults(dataset="wentingzhao/commit0_combined")
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -633,7 +633,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=None,
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=create_critic(args),
         selected_instances_file=args.select,
         max_retries=args.max_retries,
2 changes: 1 addition & 1 deletion benchmarks/gaia/README.md
@@ -62,7 +62,7 @@ uv run python -m benchmarks.gaia.get_score --file outputs/gaia/output.jsonl
 - `--output-dir`: Base directory for outputs (default: `outputs`)
 - `--n-limit`: Limit number of instances to evaluate (default: 0 = all)
 - `--num-workers`: Number of parallel workers (default: 1)
-- `--max-attempts`: Maximum attempts for iterative mode (default: 1)
+- `--n-critic-runs`: Number of critic evaluation runs for iterative mode (default: 1)
 - `--note`: Optional note to add to output directory name


6 changes: 3 additions & 3 deletions benchmarks/gaia/run_infer.py
@@ -558,8 +558,8 @@ def main() -> None:
     logger.info(f"Using critic: {type(critic).__name__}")
 
     # Validate arguments
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     # Load LLM config
     llm_config_path = args.llm_config_path
@@ -591,7 +591,7 @@ def main() -> None:
         eval_output_dir=structured_output_dir,
         details={"level": args.level},
         eval_limit=args.n_limit,
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         workspace_type=args.workspace,
8 changes: 4 additions & 4 deletions benchmarks/multiswebench/run_infer.py
@@ -436,9 +436,9 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -475,7 +475,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
6 changes: 3 additions & 3 deletions benchmarks/openagentsafety/run_infer.py
@@ -537,8 +537,8 @@ def main() -> None:
     args = parser.parse_args()
 
     # Validate args
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     # Load LLM config
     llm_config_path = args.llm_config_path
@@ -577,7 +577,7 @@ def main() -> None:
             "platform": "linux/amd64",
         },
         eval_limit=args.n_limit,
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
     )
8 changes: 4 additions & 4 deletions benchmarks/swebench/run_infer.py
@@ -327,9 +327,9 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -365,7 +365,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
8 changes: 4 additions & 4 deletions benchmarks/swebenchmultimodal/run_infer.py
@@ -392,9 +392,9 @@ def main() -> None:
     parser.set_defaults(dataset="princeton-nlp/SWE-bench_Multimodal", split="dev")
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -430,7 +430,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
        eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
2 changes: 1 addition & 1 deletion benchmarks/swtbench/README.md
@@ -31,7 +31,7 @@ python -m benchmarks.swtbench.run_infer \
   --max-iterations 30 \
   --select instances.txt \
   --eval-output-dir ./evaluation_results \
-  --max-attempts 3 \
+  --n-critic-runs 3 \
   --critic finish_with_patch
 ```
 
8 changes: 4 additions & 4 deletions benchmarks/swtbench/run_infer.py
@@ -356,9 +356,9 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -392,7 +392,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
4 changes: 2 additions & 2 deletions benchmarks/utils/args_parser.py
@@ -54,10 +54,10 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
         help="Limit number of instances to evaluate",
     )
     parser.add_argument(
-        "--max-attempts",
+        "--n-critic-runs",
         type=int,
         default=3,
-        help="Maximum number of attempts for iterative mode (default: 3, min: 1)",
+        help="Number of critic evaluation runs for iterative mode (default: 3, min: 1)",
     )
 
     # Add critic arguments
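
For context, a minimal standalone sketch of how the renamed flag behaves once parsed; argparse maps `--n-critic-runs` to the `args.n_critic_runs` attribute used throughout the run_infer scripts above. This is an illustration only, not the full `get_parser`, which defines many more options:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--n-critic-runs",
    type=int,
    default=3,
    help="Number of critic evaluation runs for iterative mode (default: 3, min: 1)",
)

# argparse converts the hyphens to underscores, so the value comes back
# as args.n_critic_runs, matching the attribute name used in the diff.
args = parser.parse_args(["--n-critic-runs", "2"])
assert args.n_critic_runs == 2

# Each run_infer script then applies the same lower-bound guard:
if args.n_critic_runs < 1:
    raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
```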
14 changes: 7 additions & 7 deletions benchmarks/utils/evaluation.py
@@ -185,13 +185,13 @@ def run(
         """
         Run evaluation with iterative mode support.
 
-        If max_attempts > 1, will retry failed instances multiple times.
-        If max_attempts == 1, will run once without retries.
+        If n_critic_runs > 1, will retry failed instances multiple times.
+        If n_critic_runs == 1, will run once without retries.
         """
         logger.info("Starting evaluation (process pool)")
         logger.info("metadata=%s", self.metadata)
         logger.info("workers=%d", self.num_workers)
-        logger.info("max_attempts=%d", self.metadata.max_attempts)
+        logger.info("n_critic_runs=%d", self.metadata.n_critic_runs)
 
         # Use iterative mode for all cases
         return self._run_iterative_mode(on_result=on_result)
@@ -276,9 +276,9 @@ def _run_iterative_mode(
         critic = self.metadata.critic
         all_outputs: List[EvalOutput] = []
 
-        for attempt in range(1, self.metadata.max_attempts + 1):
+        for attempt in range(1, self.metadata.n_critic_runs + 1):
             self.current_attempt = attempt
-            logger.info(f"Starting attempt {attempt}/{self.metadata.max_attempts}")
+            logger.info(f"Starting attempt {attempt}/{self.metadata.n_critic_runs}")
 
             instances_to_process = self._get_instances_for_attempt(
                 attempt, all_instances, critic
@@ -393,14 +393,14 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
         logger.info("Aggregating results from all attempts")
         aggregate_results(
             output_dir=self.metadata.eval_output_dir,
-            max_attempts=self.metadata.max_attempts,
+            n_critic_runs=self.metadata.n_critic_runs,
             critic=self.metadata.critic,
             final_output_file="output.jsonl",
         )
 
         logger.info(
             f"Evaluation complete: {total_instances} total instances, "
-            f"{self.metadata.max_attempts} max attempts"
+            f"{self.metadata.n_critic_runs} max attempts"
         )
         return all_outputs
 
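
To make the renamed control flow concrete, here is a heavily simplified, hypothetical sketch of the loop that `n_critic_runs` bounds. The real `_run_iterative_mode` also handles resume, worker pools, and per-attempt output files; `run_one` and `critic_passes` below are illustrative placeholders, not real APIs:

```python
from typing import Callable, Dict, Iterable, List


def run_iterative(
    instances: Iterable[str],
    run_one: Callable[[str], dict],
    critic_passes: Callable[[dict], bool],
    n_critic_runs: int,
) -> Dict[str, dict]:
    """Retry instances the critic rejects, up to n_critic_runs attempts."""
    remaining: List[str] = list(instances)
    results: Dict[str, dict] = {}
    for attempt in range(1, n_critic_runs + 1):
        still_failing: List[str] = []
        for instance_id in remaining:
            output = run_one(instance_id)
            results[instance_id] = output  # a later attempt overwrites an earlier one
            if not critic_passes(output):
                still_failing.append(instance_id)
        if not still_failing:
            break  # every instance satisfied the critic; no further runs needed
        remaining = still_failing
    return results
```

With `n_critic_runs=1` this collapses to a single pass with no retries, matching the docstring above.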
8 changes: 4 additions & 4 deletions benchmarks/utils/iterative.py
@@ -81,7 +81,7 @@ def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstanceID]:
 
 def aggregate_results(
     output_dir: str,
-    max_attempts: int,
+    n_critic_runs: int,
     critic: "CriticBase",
     final_output_file: str = "output.jsonl",
 ) -> None:
@@ -93,17 +93,17 @@
 
     Args:
         output_dir: Directory containing attempt files
-        max_attempts: Maximum number of attempts
+        n_critic_runs: Maximum number of attempts
        critic: Critic instance to use for evaluation
         final_output_file: Name of the final output file
     """
-    logger.info(f"Aggregating results from {max_attempts} attempts")
+    logger.info(f"Aggregating results from {n_critic_runs} attempts")
 
     # Dictionary to store the best result for each instance
     best_results: dict[EvalInstanceID, EvalOutput] = {}
 
     # Work backwards from the last attempt to the first
-    for attempt in range(max_attempts, 0, -1):
+    for attempt in range(n_critic_runs, 0, -1):
         attempt_file = os.path.join(
             output_dir, f"output.critic_attempt_{attempt}.jsonl"
         )
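
As a hedged illustration of the aggregation step, the condensed sketch below walks attempts from last to first and keeps the best result seen per instance, mirroring the `output.critic_attempt_{attempt}.jsonl` naming from the hunk above; `load_outputs` and `is_better` stand in for the real JSONL loading and critic-based comparison, which this sketch does not reproduce:

```python
import os
from typing import Callable, Dict, Iterable


def pick_best_per_instance(
    output_dir: str,
    n_critic_runs: int,
    load_outputs: Callable[[str], Iterable[dict]],
    is_better: Callable[[dict, dict], bool],
) -> Dict[str, dict]:
    """Scan output.critic_attempt_{n}.jsonl files from the last attempt to the first."""
    best: Dict[str, dict] = {}
    for attempt in range(n_critic_runs, 0, -1):
        attempt_file = os.path.join(
            output_dir, f"output.critic_attempt_{attempt}.jsonl"
        )
        if not os.path.exists(attempt_file):
            continue  # an attempt may not have produced a file (e.g., early stop)
        for output in load_outputs(attempt_file):
            current = best.get(output["instance_id"])
            if current is None or is_better(output, current):
                best[output["instance_id"]] = output
    return best
```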
4 changes: 2 additions & 2 deletions benchmarks/utils/models.py
@@ -27,8 +27,8 @@ class EvalMetadata(BaseModel):
     eval_limit: int = Field(
         default=0, description="Number of instances to evaluate, 0 means all"
     )
-    max_attempts: int = Field(
-        default=1, ge=1, description="Maximum number of attempts for iterative mode"
+    n_critic_runs: int = Field(
+        default=1, ge=1, description="Number of critic evaluation runs for iterative mode"
     )
     critic: CriticBase = Field(
         description=(
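
Because the field keeps `ge=1`, pydantic rejects invalid values before any benchmark code runs. A tiny stand-alone model (not the real `EvalMetadata`, which has several other required fields) is enough to show the behavior:

```python
from pydantic import BaseModel, Field, ValidationError


class IterativeSettings(BaseModel):
    """Toy stand-in for EvalMetadata, keeping only the renamed field."""

    n_critic_runs: int = Field(
        default=1,
        ge=1,
        description="Number of critic evaluation runs for iterative mode",
    )


print(IterativeSettings().n_critic_runs)                 # 1 (the default)
print(IterativeSettings(n_critic_runs=3).n_critic_runs)  # 3

try:
    IterativeSettings(n_critic_runs=0)  # violates ge=1
except ValidationError as exc:
    # Exact error wording depends on the installed pydantic version.
    print(exc.errors()[0]["loc"])  # ('n_critic_runs',)
```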
10 changes: 5 additions & 5 deletions tests/test_aggregate_results.py
@@ -98,7 +98,7 @@ def test_prefers_non_error_over_error_when_last_attempt_errors(
             f.write(output_3.model_dump_json() + "\n")
 
         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify output.jsonl contains the instance (not dropped)
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -135,7 +135,7 @@ def test_prefers_critic_success_over_non_error_critic_fail(self, temp_output_dir):
             f.write(output.model_dump_json() + "\n")
 
         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify output.jsonl contains the instance
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -185,7 +185,7 @@ def test_multiple_instances_with_mixed_results(self, temp_output_dir):
             f.write(create_output("instance_3", error=None).model_dump_json() + "\n")
 
         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify all instances appear in output.jsonl
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -214,7 +214,7 @@ def test_all_attempts_error_instance_dropped(self, temp_output_dir):
             f.write(output.model_dump_json() + "\n")
 
         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify output.jsonl is empty (instance dropped because all attempts errored)
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -228,7 +228,7 @@ def test_empty_attempts(self, temp_output_dir):
         critic = PassCritic()
 
         # Run aggregation with no attempt files
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify output.jsonl is created but empty
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
8 changes: 4 additions & 4 deletions tests/test_iterative_resume.py
@@ -58,8 +58,8 @@ def test_iterative_resume_with_expanded_n_limit():
     Test that iterative evaluation correctly handles resume when n-limit is expanded.
 
     Scenario:
-    1. First run: Process 50 instances with max_attempts=3
-    2. Second run: Expand to 200 instances with max_attempts=3
+    1. First run: Process 50 instances with n_critic_runs=3
+    2. Second run: Expand to 200 instances with n_critic_runs=3
 
     Expected behavior:
     - The 150 new instances (51-200) should be processed starting from attempt 1
@@ -109,7 +109,7 @@ def test_iterative_resume_with_expanded_n_limit():
             eval_output_dir=tmpdir,
             details={},
             eval_limit=200,
-            max_attempts=3,
+            n_critic_runs=3,
             max_retries=0,
             critic=PassCritic(),
         )
@@ -191,7 +191,7 @@ def test_iterative_resume_with_same_n_limit():
            eval_output_dir=tmpdir,
             details={},
             eval_limit=50,
-            max_attempts=3,
+            n_critic_runs=3,
             max_retries=0,
             critic=PassCritic(),
         )
2 changes: 1 addition & 1 deletion tests/test_keyboard_interrupt.py
@@ -74,7 +74,7 @@ def evaluate_instance(
     eval_output_dir="{tmpdir}",
     details={{}},
     eval_limit=0,
-    max_attempts=1,
+    n_critic_runs=1,
     max_retries=0,
     critic=PassCritic(),
 )
8 changes: 4 additions & 4 deletions tests/test_workspace_cleanup.py
@@ -36,7 +36,7 @@ def test_workspace_cleanup_called_on_success():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -87,7 +87,7 @@ def test_workspace_cleanup_called_on_failure():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -147,7 +147,7 @@ def test_workspace_cleanup_handles_cleanup_exception():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -203,7 +203,7 @@ def create_mock_workspace():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=2,
         critic=PassCritic(),
     )