diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml
index 1ca70c1b..f48587ae 100644
--- a/.github/workflows/build-swtbench-images.yml
+++ b/.github/workflows/build-swtbench-images.yml
@@ -34,6 +34,26 @@ on:
         description: 'Software Agent SDK commit/ref to use'
         required: true
         type: string
+      build-eval-env:
+        description: 'Also build prebaked SWT-bench eval env images (default: false)'
+        required: false
+        default: 'false'
+        type: string
+      eval-image-prefix:
+        description: 'Registry prefix for prebaked eval images'
+        required: false
+        default: 'ghcr.io/openhands/swtbench-eval'
+        type: string
+      max-retries:
+        description: 'Retries per batch for eval env builds'
+        required: false
+        default: '2'
+        type: string
+      build-batch-size:
+        description: 'Env images per batch for eval env builds'
+        required: false
+        default: '10'
+        type: string

 concurrency:
   group: build-swt-bench-${{ github.ref }}
@@ -158,6 +178,64 @@ jobs:
           DOCKER_BUILDKIT: 1
           BUILDKIT_PROGRESS: plain

+      - name: Build prebaked eval env images
+        if: ${{ inputs.build-eval-env == 'true' }}
+        run: |
+          set -euo pipefail
+
+          echo "Starting prebaked eval env image build at $(date -u)"
+          echo "Runner: $(uname -a)"
+          df -h
+          docker system df || true
+          docker info || true
+
+          DATASET="${{ inputs.dataset || 'princeton-nlp/SWE-bench_Verified' }}"
+          SPLIT="${{ inputs.split || 'test' }}"
+          N_LIMIT="${{ inputs.n-limit || '0' }}"
+          INSTANCE_IDS="${{ inputs.instance-ids }}"
+          IMAGE_PREFIX="${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}"
+          MAX_WORKERS="${{ inputs.max-workers || '4' }}"
+          BUILD_MODE="${{ inputs.build-mode || 'cli' }}"
+          MAX_RETRIES="${{ inputs.max-retries || '2' }}"
+          BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '10' }}"
+
+          echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV"
+          echo "MAX_RETRIES=${MAX_RETRIES}" >> "$GITHUB_ENV"
+          echo "BUILD_BATCH_SIZE=${BUILD_BATCH_SIZE}" >> "$GITHUB_ENV"
+
+          # Basic BuildKit disk guard similar to SWE-bench
+          if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then
+            LINE=$(tail -n1 /tmp/buildkit_df)
+            TOTAL=$(echo "$LINE" | awk '{print $2}')
+            USED=$(echo "$LINE" | awk '{print $3}')
+            FREE=$(echo "$LINE" | awk '{print $4}')
+            if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then
+              PCT=$(( 100 * USED / TOTAL ))
+              echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes"
+            else
+              echo "Warning: unable to parse df output for /var/lib/buildkit"
+            fi
+          else
+            echo "Warning: /var/lib/buildkit not found; skipping disk check"
+          fi
+
+          ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}" --max-retries "${MAX_RETRIES}" --build-batch-size "${BUILD_BATCH_SIZE}")
+          if [ -n "${INSTANCE_IDS}" ]; then
+            ARGS+=(--instance-ids "${INSTANCE_IDS}")
+          else
+            ARGS+=(--eval-limit "${N_LIMIT}")
+          fi
+
+          echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}"
+          DOCKER_BUILDKIT=1 \
+            BUILDKIT_PROGRESS=plain \
+            BUILDKIT_RESET_ON_FAILURE=1 \
+            PYTHONUNBUFFERED=1 uv run swtbench-build-eval-images "${ARGS[@]}" | tee build_eval_env.log
+
+          echo "Completed prebaked eval env image build at $(date -u)"
+          docker ps -a || true
+          docker system df || true
+
       - name: Archive build logs
         if: always()
         run: |
@@ -199,6 +277,23 @@
           echo "**Successful:** $SUCCESS_COUNT ✅" >> "$GITHUB_STEP_SUMMARY"
           echo "**Failed:** $FAIL_COUNT ❌" >> "$GITHUB_STEP_SUMMARY"

+      - name: Make prebaked eval image package public (best-effort)
+        if: ${{ inputs.build-eval-env == 'true' }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          IMAGE_PREFIX: ${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}
+        run: |
+          set -euo pipefail
+          NAME=$(echo "${IMAGE_PREFIX}" | awk -F/ '{print $NF}')
+          if [ -z "$NAME" ]; then
+            echo "No package name derived from IMAGE_PREFIX=${IMAGE_PREFIX}, skipping visibility update"
+            exit 0
+          fi
+          gh api -X PATCH \
+            -H "Accept: application/vnd.github+json" \
+            /orgs/OpenHands/packages/container/${NAME}/visibility \
+            -f visibility=public || echo "Warning: failed to set package visibility"
+
       - name: Comment on tracker issue
         if: success()
         run: |
diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py
new file mode 100644
index 00000000..079ad66c
--- /dev/null
+++ b/benchmarks/swtbench/build_eval_env_images.py
@@ -0,0 +1,360 @@
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Iterable, Iterator, List, Sequence
+
+import docker
+
+from benchmarks.swtbench.image_utils import ensure_swt_bench_repo
+from benchmarks.utils.dataset import get_dataset
+from benchmarks.utils.image_utils import image_exists as remote_image_exists
+from openhands.sdk import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def select_instance_ids(
+    dataset: str,
+    split: str,
+    eval_limit: int | None,
+    selected_instances_file: str | None,
+    instance_ids: list[str] | None,
+) -> list[str]:
+    """
+    Select the instance IDs that match the inference sampling logic.
+    """
+    if instance_ids:
+        return instance_ids
+
+    df = get_dataset(
+        dataset_name=dataset,
+        split=split,
+        eval_limit=eval_limit,
+        selected_instances_file=selected_instances_file,
+    )
+    ids = df["instance_id"].tolist()
+    if not ids:
+        raise RuntimeError("No instances selected for image build.")
+    logger.info("Selected %s instances for image build", len(ids))
+    return ids
+
+
+def load_exec_specs(
+    swt_bench_dir: Path,
+    dataset: str,
+    split: str,
+    instance_ids: Iterable[str],
+    filter_swt: bool = True,
+) -> list:
+    """
+    Load ExecSpec objects for the provided instance IDs.
+    """
+    sys.path.insert(0, str(swt_bench_dir / "src"))
+    sys.path.insert(0, str(swt_bench_dir))
+    from src.dataset import load_swebench_dataset  # type: ignore[import-not-found]
+    from src.exec_spec import make_exec_spec  # type: ignore[import-not-found]
+
+    cwd = os.getcwd()
+    try:
+        os.chdir(swt_bench_dir)
+        dataset_entries = load_swebench_dataset(
+            name=dataset, split=split, is_swt=False, filter_swt=filter_swt
+        )
+    finally:
+        os.chdir(cwd)
+    by_id = {entry["instance_id"]: entry for entry in dataset_entries}
+
+    specs = []
+    missing = []
+    for iid in instance_ids:
+        if iid not in by_id:
+            missing.append(iid)
+            continue
+        specs.append(make_exec_spec(by_id[iid]))
+
+    if missing:
+        logger.warning(
+            "Skipped %s missing instance_ids not found in dataset: %s",
+            len(missing),
+            ", ".join(missing[:5]),
+        )
+    if not specs:
+        raise RuntimeError("No ExecSpecs available after filtering instance IDs.")
+    return specs
+
+
+def build_env_images(
+    exec_specs: list,
+    max_workers: int,
+    build_mode: str,
+    max_retries: int,
+    batch_size: int,
+    image_prefix: str | None,
+) -> None:
+    """
+    Build base + environment images required by the provided ExecSpecs.
+
+    Images are pushed immediately after each successful build when image_prefix is set,
+    so partial progress is kept if the workflow fails mid-run.
+ """ + from src.docker_build import ( # type: ignore[import-not-found] + BuildImageError, + build_base_images, + build_env_images as build_envs, + ) + + client = docker.from_env() + total_base = len({spec.base_image_key for spec in exec_specs}) + total_env = len({spec.env_image_key for spec in exec_specs}) + remote_prefix = image_prefix.rstrip("/") if image_prefix else None + + base_to_build_keys: set[str] = set() + + def prefixed(tag: str) -> str | None: + return f"{remote_prefix}/{tag}" if remote_prefix else None + + base_spec_by_key = {} + for spec in exec_specs: + key = spec.base_image_key + base_spec_by_key.setdefault(key, spec) + remote_tag = prefixed(key) + + if remote_tag and remote_image_exists(remote_tag): + logger.info("Base image %s already in registry; reusing", remote_tag) + try: + img = client.images.pull(remote_tag) + if remote_tag != key: + img.tag(key) + except Exception as exc: # pragma: no cover - best effort + logger.warning( + "Failed to pull %s (%s); will rebuild locally", remote_tag, exc + ) + base_to_build_keys.add(key) + continue + continue + + base_to_build_keys.add(key) + + missing_base_specs = [base_spec_by_key[k] for k in base_to_build_keys] + skipped_base = total_base - len(base_to_build_keys) + + if missing_base_specs: + logger.info( + "Building %s/%s base images (skipping %s already present)", + len({spec.base_image_key for spec in missing_base_specs}), + total_base, + skipped_base, + ) + build_base_images( + client, missing_base_specs, force_rebuild=False, build_mode=build_mode + ) + base_built = {spec.base_image_key for spec in missing_base_specs} + if image_prefix: + tag_and_push(base_built, image_prefix) + else: + logger.info( + "All %s base images already exist; skipping base builds", total_base + ) + + missing_env_specs: list = [] + + for spec in exec_specs: + key = spec.env_image_key + remote_tag = prefixed(key) + + if remote_tag and remote_image_exists(remote_tag): + logger.info("Env image %s already in registry; skipping build", remote_tag) + continue + + missing_env_specs.append(spec) + + if not missing_env_specs: + logger.info("All %s env images already exist; skipping env builds", total_env) + return + + batches = list(chunked(missing_env_specs, max(1, batch_size))) + logger.info( + "Building %s/%s env images in %s batches (batch_size=%s)", + len({spec.env_image_key for spec in missing_env_specs}), + total_env, + len(batches), + batch_size, + ) + for idx, batch in enumerate(batches, start=1): + attempt = 0 + while True: + try: + logger.info( + "Batch %s/%s: building %s env images", idx, len(batches), len(batch) + ) + build_envs( + client, + batch, + force_rebuild=False, + max_workers=max_workers, + build_mode=build_mode, + ) + if image_prefix: + tag_and_push({spec.env_image_key for spec in batch}, image_prefix) + break + except BuildImageError as exc: + attempt += 1 + if attempt > max_retries: + logger.error( + "Batch %s/%s failed after %s attempts: %s", + idx, + len(batches), + max_retries, + exc, + ) + raise + logger.warning( + "Batch %s/%s failed (attempt %s/%s): %s; retrying", + idx, + len(batches), + attempt, + max_retries, + exc, + ) + return + + +def chunked(seq: Sequence, size: int) -> Iterator[List]: + for i in range(0, len(seq), size): + yield list(seq[i : i + size]) + + +def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: + """ + Tag the provided images with the registry prefix and push them. 
+ """ + pushed: list[str] = [] + prefix = prefix.rstrip("/") + for image in images: + target = f"{prefix}/{image}" + logger.info("Pushing %s -> %s", image, target) + subprocess_run(["docker", "tag", image, target]) + subprocess_run(["docker", "push", target]) + pushed.append(target) + return pushed + + +def subprocess_run(cmd: list[str]) -> None: + import subprocess + + result = subprocess.run(cmd, text=True, capture_output=True) + if result.returncode != 0: + logger.error("Command failed (%s): %s", " ".join(cmd), result.stderr) + raise RuntimeError(f"Command failed: {' '.join(cmd)}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Build and push prebaked SWT-bench eval env images." + ) + parser.add_argument("--dataset", required=True, help="Dataset name") + parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument( + "--eval-limit", + type=int, + default=1, + help="Match inference sampling by limiting instances (0 to disable)", + ) + parser.add_argument( + "--instance-ids", + default="", + help="Comma-separated instance IDs to force (overrides eval-limit)", + ) + parser.add_argument( + "--selected-instances-file", + default="", + help="Optional selected instances file used during inference", + ) + parser.add_argument( + "--image-prefix", + default="ghcr.io/openhands/swtbench-eval", + help="Registry prefix for pushed images", + ) + parser.add_argument( + "--max-workers", + type=int, + default=4, + help="Parallel builds for env images", + ) + parser.add_argument( + "--max-retries", + type=int, + default=2, + help="Retries per batch for env image builds", + ) + parser.add_argument( + "--build-batch-size", + type=int, + default=10, + help="Number of env images to build per batch", + ) + parser.add_argument( + "--build-mode", + choices=["api", "cli"], + default="cli", + help="swt-bench build mode", + ) + parser.add_argument( + "--no-push", + action="store_true", + help="Build images locally without pushing to the registry", + ) + args = parser.parse_args() + + instance_ids = ( + [iid for iid in args.instance_ids.split(",") if iid] + if args.instance_ids + else None + ) + eval_limit = None if instance_ids else args.eval_limit + selected_file = args.selected_instances_file or None + + swt_bench_dir = ensure_swt_bench_repo() + + target_ids = select_instance_ids( + dataset=args.dataset, + split=args.split, + eval_limit=eval_limit, + selected_instances_file=selected_file, + instance_ids=instance_ids, + ) + exec_specs = load_exec_specs( + swt_bench_dir, args.dataset, args.split, target_ids, filter_swt=True + ) + build_env_images( + exec_specs, + max_workers=args.max_workers, + build_mode=args.build_mode, + max_retries=args.max_retries, + batch_size=args.build_batch_size, + image_prefix=None if args.no_push else args.image_prefix, + ) + + base_images = {spec.base_image_key for spec in exec_specs} + env_images = {spec.env_image_key for spec in exec_specs} + logger.info("Built images: %s base, %s env", len(base_images), len(env_images)) + + manifest = { + "dataset": args.dataset, + "split": args.split, + "instances": target_ids, + "base_images": sorted(base_images), + "env_images": sorted(env_images), + "image_prefix": args.image_prefix, + "arch": "host", + } + print(json.dumps(manifest, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 7501bf23..9dc7062b 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,6 
 from pathlib import Path
 from time import monotonic

+from benchmarks.swtbench.image_utils import (
+    compute_required_images,
+    ensure_swt_bench_repo,
+)
 from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.patch_utils import remove_files_from_patch
 from benchmarks.utils.report_costs import generate_cost_report
@@ -26,56 +30,7 @@

 logger = get_logger(__name__)

-
-def patch_swt_bench_for_micromamba(swt_bench_dir: Path) -> None:
-    """
-    Ensure the cached swt-bench checkout uses micromamba for env creation.
-    Applies small, idempotent text replacements to the upstream sources.
-    """
-    solver_timeout_s = 600
-    dockerfiles_path = swt_bench_dir / "src" / "dockerfiles.py"
-    exec_spec_path = swt_bench_dir / "src" / "exec_spec.py"
-
-    if not dockerfiles_path.exists() or not exec_spec_path.exists():
-        logger.warning(
-            "swt-bench sources missing expected files; skipping micromamba patch "
-            f"(dockerfiles: {dockerfiles_path.exists()}, exec_spec: {exec_spec_path.exists()})"
-        )
-        return
-
-    dockerfiles_text = dockerfiles_path.read_text()
-    dockerfiles_updated = dockerfiles_text.replace(
-        "RUN conda config --append channels conda-forge\n\nRUN adduser",
-        "RUN conda config --append channels conda-forge\n"
-        "# Use micromamba for faster solver performance during env builds\n"
-        "RUN conda install -n base -c conda-forge -y micromamba \\\n"
-        " && ln -s /opt/miniconda3/bin/micromamba /usr/local/bin/micromamba\n"
-        "ENV MAMBA_ROOT_PREFIX=/opt/miniconda3\n"
-        "ENV MAMBA_EXE=/opt/miniconda3/bin/micromamba\n\n"
-        "RUN adduser",
-    )
-
-    exec_spec_text = exec_spec_path.read_text()
-    replacements = {
-        "conda create -n ": f"timeout {solver_timeout_s}s micromamba create -n ",
-        "conda create -c conda-forge -n ": f"timeout {solver_timeout_s}s micromamba create -c conda-forge -n ",
-        "conda env create --file": f"timeout {solver_timeout_s}s micromamba env create --file",
-        "conda env update -f": f"timeout {solver_timeout_s}s micromamba env update -f",
-        "conda install python=": f"timeout {solver_timeout_s}s micromamba install python=",
-    }
-    for old, new in replacements.items():
-        exec_spec_text = exec_spec_text.replace(old, new)
-
-    if dockerfiles_text != dockerfiles_updated:
-        dockerfiles_path.write_text(dockerfiles_updated)
-        logger.info("Patched swt-bench Dockerfile template to install micromamba.")
-    if exec_spec_path.read_text() != exec_spec_text:
-        exec_spec_path.write_text(exec_spec_text)
-        logger.info(
-            "Patched swt-bench exec_spec to create/update envs with micromamba "
-            "and a %ss timeout on solver calls.",
-            solver_timeout_s,
-        )
+PREBAKED_REGISTRY = "ghcr.io/openhands/swtbench-eval"


 def _load_prediction_instance_ids(predictions_file: Path) -> list[str]:
@@ -109,6 +64,60 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]:
     return instance_ids


+def try_pull_prebaked_images(
+    predictions_file: Path,
+    dataset: str,
+    split: str = "test",
+    registry: str = PREBAKED_REGISTRY,
+) -> None:
+    """
+    Best-effort pull of prebaked base/env images; no-op on failure.
+ """ + try: + base_images, env_images = compute_required_images( + predictions_file, + dataset, + split, + ) + except Exception as exc: # pragma: no cover - defensive + logger.warning("Skipping prebaked image pull (compute failed): %s", exc) + return + + tags = sorted(base_images | env_images) + if not tags: + logger.info("No prebaked images to pull (empty tag set)") + return + + registry = registry.rstrip("/") + for tag in tags: + remote = f"{registry}/{tag}" + logger.info("Attempting to pull prebaked image %s", remote) + try: + pull = subprocess.run( + ["docker", "pull", remote], + capture_output=True, + text=True, + ) + except FileNotFoundError: + logger.warning("Docker not available; skipping prebaked image pull") + return + + if pull.returncode != 0: + logger.warning("Failed to pull %s: %s", remote, pull.stderr.strip()) + continue + + # Tag the remote image with the local name expected by the harness. + tag_res = subprocess.run( + ["docker", "tag", remote, tag], + capture_output=True, + text=True, + ) + if tag_res.returncode != 0: + logger.warning("Failed to tag %s as %s: %s", remote, tag, tag_res.stderr) + else: + logger.info("Pulled and tagged %s -> %s", remote, tag) + + def update_report_with_submitted_instances( report_path: Path, predictions_path: Path ) -> None: @@ -242,32 +251,12 @@ def run_swtbench_evaluation( dataset: SWT-Bench dataset to evaluate against workers: Number of workers to use for evaluation """ - logger.info(f"Running SWT-Bench evaluation on {predictions_file}") + use_legacy = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ("1", "true", "yes") + mode = "legacy-conda" if use_legacy else "prebaked-images" + logger.info("Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode) try: - # Use a global cache directory for SWT-Bench source - cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench" - swt_bench_dir = cache_dir / "swt-bench" - - # Clone SWT-Bench repository if it doesn't exist - if not swt_bench_dir.exists(): - logger.info("Setting up SWT-Bench source in global cache...") - cache_dir.mkdir(parents=True, exist_ok=True) - - logger.info("Cloning SWT-Bench repository...") - clone_cmd = [ - "git", - "clone", - "https://github.com/logic-star-ai/swt-bench.git", - str(swt_bench_dir), - ] - result = subprocess.run(clone_cmd, text=True) - if result.returncode != 0: - raise subprocess.CalledProcessError(result.returncode, clone_cmd) - - logger.info(f"SWT-Bench source installed at {swt_bench_dir}") - - patch_swt_bench_for_micromamba(swt_bench_dir) + swt_bench_dir = ensure_swt_bench_repo() # Get the directory and filename of the predictions file predictions_path = Path(predictions_file).resolve() @@ -424,6 +413,23 @@ def main() -> None: # Convert format convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) + # Default: use prebaked images; SWTbenCH_FORCE_CONDA opts into legacy flow. 
+    use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in (
+        "1",
+        "true",
+        "yes",
+    )
+    if use_prebaked:
+        try_pull_prebaked_images(
+            output_file,
+            args.dataset,
+        )
+    else:
+        logger.info(
+            "SWTBENCH_FORCE_CONDA set; skipping prebaked image pull "
+            "and using legacy (pre-mamba) evaluation flow"
+        )
+
     if not args.skip_evaluation:
         eval_phase_start = monotonic()
         # Run evaluation
diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py
new file mode 100644
index 00000000..e7aae1f4
--- /dev/null
+++ b/benchmarks/swtbench/image_utils.py
@@ -0,0 +1,162 @@
+from __future__ import annotations
+
+import json
+import logging
+import subprocess
+import sys
+from pathlib import Path
+from typing import Iterable
+
+from openhands.sdk import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path:
+    """
+    Ensure the SWT-bench sources are available locally.
+
+    Returns the repository path under the cache directory.
+    """
+    cache_dir = cache_dir or Path.home() / ".cache" / "openhands" / "swt-bench"
+    swt_bench_dir = cache_dir / "swt-bench"
+
+    if swt_bench_dir.exists():
+        return swt_bench_dir
+
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    logger.info("Cloning SWT-Bench repository into %s", swt_bench_dir)
+    result = subprocess.run(
+        [
+            "git",
+            "clone",
+            "https://github.com/logic-star-ai/swt-bench.git",
+            str(swt_bench_dir),
+        ],
+        text=True,
+        capture_output=True,
+    )
+    if result.returncode != 0:
+        logger.error("Failed to clone swt-bench: %s", result.stderr)
+        raise RuntimeError("Unable to clone swt-bench repository")
+
+    return swt_bench_dir
+
+
+def _load_instance_ids(output_jsonl: Path) -> list[str]:
+    instance_ids: list[str] = []
+    seen = set()
+    with output_jsonl.open("r", encoding="utf-8") as infile:
+        for line_num, line in enumerate(infile, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+            except json.JSONDecodeError:
+                logger.debug("Skipping invalid JSON on line %s", line_num)
+                continue
+            instance_id = data.get("instance_id")
+            if not instance_id or instance_id in seen:
+                continue
+            seen.add(instance_id)
+            instance_ids.append(instance_id)
+    return instance_ids
+
+
+def compute_required_images(
+    output_jsonl: Path,
+    dataset: str,
+    split: str,
+) -> tuple[set[str], set[str]]:
+    """
+    Compute the base/env image tags required to evaluate the given predictions file.
+
+    Returns (base_image_tags, env_image_tags).
+    """
+    instance_ids = _load_instance_ids(output_jsonl)
+    if not instance_ids:
+        raise ValueError(f"No instance_ids found in {output_jsonl}")
+
+    swt_bench_dir = ensure_swt_bench_repo()
+    sys.path.insert(0, str(swt_bench_dir / "src"))
+    sys.path.insert(0, str(swt_bench_dir))
+
+    # Delay import until after sys.path manipulation so we use the cached checkout.
+    from src.dataset import load_swebench_dataset  # type: ignore[import-not-found]
+    from src.exec_spec import make_exec_spec  # type: ignore[import-not-found]
+
+    dataset_entries = load_swebench_dataset(
+        name=dataset, split=split, is_swt=True, filter_swt=True
+    )
+    entries_by_id = {entry["instance_id"]: entry for entry in dataset_entries}
+
+    missing = [iid for iid in instance_ids if iid not in entries_by_id]
+    if missing:
+        logger.warning(
+            "Predictions reference %s instance_ids not present in dataset: %s",
+            len(missing),
+            ", ".join(missing[:5]),
+        )
+
+    specs = [
+        make_exec_spec(entries_by_id[iid])
+        for iid in instance_ids
+        if iid in entries_by_id
+    ]
+    if not specs:
+        raise RuntimeError("No ExecSpecs produced; cannot compute required images.")
+
+    base_images = {spec.base_image_key for spec in specs}
+    env_images = {spec.env_image_key for spec in specs}
+    logger.info(
+        "Computed %s base images and %s env images for %s instances",
+        len(base_images),
+        len(env_images),
+        len(specs),
+    )
+    return base_images, env_images
+
+
+def format_images_plain(images: Iterable[str]) -> str:
+    return "\n".join(sorted(images))
+
+
+def main() -> None:
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="List SWT-bench base/env images required for a predictions file."
+    )
+    parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl")
+    parser.add_argument("--dataset", required=True, help="Dataset name")
+    parser.add_argument("--split", default="test", help="Dataset split")
+    parser.add_argument(
+        "--format",
+        choices=["plain", "json"],
+        default="plain",
+        help="Output format",
+    )
+    args = parser.parse_args()
+
+    base_images, env_images = compute_required_images(
+        args.output_jsonl,
+        args.dataset,
+        args.split,
+    )
+    payload = {
+        "base": sorted(base_images),
+        "env": sorted(env_images),
+    }
+
+    if args.format == "json":
+        print(json.dumps(payload))
+    else:
+        print(format_images_plain(payload["base"] + payload["env"]))
+
+
+if __name__ == "__main__":
+    # Configure root logging for ad-hoc usage
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 8c842312..f65dfaf3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,8 @@ swebench-infer = "benchmarks.swebench.run_infer:main"
 swtbench-infer = "benchmarks.swtbench.run_infer:main"
 swebench-eval = "benchmarks.swebench.eval_infer:main"
 swtbench-eval = "benchmarks.swtbench.eval_infer:main"
+swtbench-list-images = "benchmarks.swtbench.image_utils:main"
+swtbench-build-eval-images = "benchmarks.swtbench.build_eval_env_images:main"
 gaia-infer = "benchmarks.gaia.run_infer:main"
 gaia-eval = "benchmarks.gaia.eval_infer:main"
 commit0-infer = "benchmarks.commit0.run_infer:main"
diff --git a/uv.lock b/uv.lock
index e7351742..7b04cc65 100644
--- a/uv.lock
+++ b/uv.lock
@@ -947,11 +947,11 @@ wheels = [

 [[package]]
 name = "filelock"
-version = "3.19.1"
+version = "3.20.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1d/65/ce7f1b70157833bf3cb851b556a37d4547ceafc158aa9b34b36782f23696/filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1", size = 19485, upload-time = "2026-01-09T17:55:05.421Z" }
size = 19485, upload-time = "2026-01-09T17:55:05.421Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, + { url = "https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1", size = 16701, upload-time = "2026-01-09T17:55:04.334Z" }, ] [[package]] @@ -1678,11 +1678,11 @@ wheels = [ [[package]] name = "libtmux" -version = "0.46.2" +version = "0.53.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9c/aa/7e1dcaa097156d6f3a7d8669be4389dced997feeb81744e3ff4681d65ee8/libtmux-0.46.2.tar.gz", hash = "sha256:9a398fec5d714129c8344555d466e1a903dfc0f741ba07aabe75a8ceb25c5dda", size = 346887, upload-time = "2025-05-26T19:40:04.096Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/28/e2b252817cb181aec2f42fe2d1d7fac5ec9c4d15bfb2b8ea4bd1179e4244/libtmux-0.53.0.tar.gz", hash = "sha256:1d19af4cea0c19543954d7e7317c7025c0739b029cccbe3b843212fae238f1bd", size = 405001, upload-time = "2025-12-14T11:59:11.337Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d6/2f/9d207039fcfa00d3b30e4d765f062fbcc42c873c7518a8cfebb3eafd00e0/libtmux-0.46.2-py3-none-any.whl", hash = "sha256:6c32dbf22bde8e5e33b2714a4295f6e838dc640f337cd4c085a044f6828c7793", size = 60873, upload-time = "2025-05-26T19:40:02.284Z" }, + { url = "https://files.pythonhosted.org/packages/0e/d0/2e8bc5caa639ebb9f8801ba0be7070a28d48d8ed60e2a428d40f71fb88b8/libtmux-0.53.0-py3-none-any.whl", hash = "sha256:024b7ae6a12aae55358e8feb914c8632b3ab9bd61c0987c53559643c6a58ee4f", size = 77582, upload-time = "2025-12-14T11:59:09.739Z" }, ] [[package]] @@ -2269,7 +2269,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2407,11 +2407,12 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "deprecation" }, { name = "fastmcp" }, + { name = "filelock" }, { name = "httpx" }, { name = "litellm" }, { name = "lmnr" }, @@ -2432,10 +2433,11 @@ requires-dist = [ { name = "boto3", marker = "extra == 'boto3'", specifier = ">=1.35.0" }, { name = "deprecation", specifier = ">=2.1.0" }, { name = "fastmcp", specifier = ">=2.11.3" }, + { name = "filelock", specifier = ">=3.20.1" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "litellm", specifier = ">=1.80.10" }, { name = "lmnr", specifier = ">=0.7.24" }, - { name = "pydantic", specifier = ">=2.11.7" }, + { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "tenacity", specifier = ">=9.1.2" }, @@ -2445,7 +2447,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2466,7 +2468,7 @@ requires-dist = [ { name = "browser-use", specifier = ">=0.8.0" }, { name = "cachetools" 
}, { name = "func-timeout", specifier = ">=4.3.5" }, - { name = "libtmux", specifier = ">=0.46.2" }, + { name = "libtmux", specifier = ">=0.53.0" }, { name = "openhands-sdk", editable = "vendor/software-agent-sdk/openhands-sdk" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "tom-swe", specifier = ">=1.0.3" }, @@ -2474,7 +2476,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" },