From b6ed901db87356b15cb73795181b856acd9ecd75 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 14 Jan 2026 23:33:43 +0100 Subject: [PATCH 01/32] Add prebaked SWT-bench eval image build support Co-authored-by: openhands --- .../workflows/build-swtbench-eval-images.yml | 116 +++++++++ benchmarks/swtbench/build_eval_env_images.py | 230 ++++++++++++++++++ benchmarks/swtbench/eval_infer.py | 27 +- benchmarks/swtbench/image_utils.py | 224 +++++++++++++++++ pyproject.toml | 2 + uv.lock | 77 +++--- 6 files changed, 618 insertions(+), 58 deletions(-) create mode 100644 .github/workflows/build-swtbench-eval-images.yml create mode 100644 benchmarks/swtbench/build_eval_env_images.py create mode 100644 benchmarks/swtbench/image_utils.py diff --git a/.github/workflows/build-swtbench-eval-images.yml b/.github/workflows/build-swtbench-eval-images.yml new file mode 100644 index 00000000..d669777d --- /dev/null +++ b/.github/workflows/build-swtbench-eval-images.yml @@ -0,0 +1,116 @@ +name: Build SWT-Bench Eval Images + +on: + workflow_dispatch: + inputs: + dataset: + description: "Dataset name" + required: true + default: "eth-sri/SWT-bench_Verified_bm25_27k_zsp" + type: string + split: + description: "Dataset split" + required: true + default: "test" + type: string + eval-limit: + description: "Number of instances to match inference sampling (0 to disable)" + required: false + default: "1" + type: string + instance-ids: + description: "Comma-separated instance IDs to force (overrides eval-limit)" + required: false + default: "" + type: string + image-prefix: + description: "Registry prefix for pushed images" + required: false + default: "ghcr.io/openhands/swtbench-eval" + type: string + max-workers: + description: "Maximum parallel env builds" + required: false + default: "4" + type: string + build-mode: + description: "swt-bench build mode" + required: false + default: "api" + type: choice + options: + - api + - cli + +concurrency: + group: build-swtbench-eval-${{ 
github.ref }} + cancel-in-progress: false + +jobs: + build: + runs-on: + labels: blacksmith-32vcpu-ubuntu-2204 + permissions: + contents: read + packages: write + actions: read + + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Set up Docker Buildx + uses: useblacksmith/setup-docker-builder@v1 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install dependencies + run: make build + + - name: Build and push prebaked eval env images + env: + DATASET: ${{ inputs.dataset }} + SPLIT: ${{ inputs.split }} + EVAL_LIMIT: ${{ inputs.eval-limit }} + INSTANCE_IDS: ${{ inputs.instance-ids }} + IMAGE_PREFIX: ${{ inputs.image-prefix }} + MAX_WORKERS: ${{ inputs.max-workers }} + BUILD_MODE: ${{ inputs.build-mode }} + run: | + set -euo pipefail + ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") + if [ -n "${INSTANCE_IDS}" ]; then + ARGS+=(--instance-ids "${INSTANCE_IDS}") + else + ARGS+=(--eval-limit "${EVAL_LIMIT}") + fi + uv run swtbench-build-eval-images "${ARGS[@]}" + + - name: Make image package public (best-effort) + if: github.repository_owner == 'OpenHands' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_PREFIX: ${{ inputs.image-prefix }} + run: | + set -euo pipefail + NAME=$(echo "${IMAGE_PREFIX}" | awk -F/ '{print $NF}') + if [ -z "$NAME" ]; then + echo "No package name derived from IMAGE_PREFIX=${IMAGE_PREFIX}, skipping visibility update" + exit 0 + fi + gh api -X PATCH \ + -H "Accept: application/vnd.github+json" \ + /user/packages/container/${NAME}/visibility \ + -f visibility=public || echo "Warning: failed to set package visibility" diff --git 
a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py new file mode 100644 index 00000000..4652a2bb --- /dev/null +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Iterable + +import docker + +from benchmarks.swtbench.image_utils import ( + ensure_swt_bench_repo, + patch_swt_bench_for_micromamba, +) +from benchmarks.utils.dataset import get_dataset +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def select_instance_ids( + dataset: str, + split: str, + eval_limit: int | None, + selected_instances_file: str | None, + instance_ids: list[str] | None, +) -> list[str]: + """ + Select the instance IDs that match the inference sampling logic. + """ + if instance_ids: + return instance_ids + + df = get_dataset( + dataset_name=dataset, + split=split, + eval_limit=eval_limit, + selected_instances_file=selected_instances_file, + ) + ids = df["instance_id"].tolist() + if not ids: + raise RuntimeError("No instances selected for image build.") + logger.info("Selected %s instances for image build", len(ids)) + return ids + + +def load_exec_specs( + swt_bench_dir: Path, + dataset: str, + split: str, + instance_ids: Iterable[str], + filter_swt: bool = True, +) -> list: + """ + Load ExecSpec objects for the provided instance IDs. 
+ """ + sys.path.insert(0, str(swt_bench_dir / "src")) + sys.path.insert(0, str(swt_bench_dir)) + from src.dataset import load_swebench_dataset # type: ignore[import-not-found] + from src.exec_spec import make_exec_spec # type: ignore[import-not-found] + + dataset_entries = load_swebench_dataset( + name=dataset, split=split, is_swt=False, filter_swt=filter_swt + ) + by_id = {entry["instance_id"]: entry for entry in dataset_entries} + + specs = [] + missing = [] + for iid in instance_ids: + if iid not in by_id: + missing.append(iid) + continue + specs.append(make_exec_spec(by_id[iid])) + + if missing: + logger.warning( + "Skipped %s missing instance_ids not found in dataset: %s", + len(missing), + ", ".join(missing[:5]), + ) + if not specs: + raise RuntimeError("No ExecSpecs available after filtering instance IDs.") + return specs + + +def build_env_images(exec_specs: list, max_workers: int, build_mode: str) -> None: + """ + Build base + environment images required by the provided ExecSpecs. + """ + from src.docker_build import ( # type: ignore[import-not-found] + build_base_images, + build_env_images as build_envs, + ) + + client = docker.from_env() + logger.info( + "Building %s base images and %s env images (mode=%s, workers=%s)", + len({spec.base_image_key for spec in exec_specs}), + len({spec.env_image_key for spec in exec_specs}), + build_mode, + max_workers, + ) + build_base_images(client, exec_specs, force_rebuild=False, build_mode=build_mode) + build_envs( + client, + exec_specs, + force_rebuild=False, + max_workers=max_workers, + build_mode=build_mode, + ) + + +def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: + """ + Tag the provided images with the registry prefix and push them. 
+ """ + pushed: list[str] = [] + prefix = prefix.rstrip("/") + for image in images: + target = f"{prefix}/{image}" + logger.info("Pushing %s -> %s", image, target) + subprocess_run(["docker", "tag", image, target]) + subprocess_run(["docker", "push", target]) + pushed.append(target) + return pushed + + +def subprocess_run(cmd: list[str]) -> None: + import subprocess + + result = subprocess.run(cmd, text=True, capture_output=True) + if result.returncode != 0: + logger.error("Command failed (%s): %s", " ".join(cmd), result.stderr) + raise RuntimeError(f"Command failed: {' '.join(cmd)}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Build and push prebaked SWT-bench eval env images." + ) + parser.add_argument("--dataset", required=True, help="Dataset name") + parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument( + "--eval-limit", + type=int, + default=1, + help="Match inference sampling by limiting instances (0 to disable)", + ) + parser.add_argument( + "--instance-ids", + default="", + help="Comma-separated instance IDs to force (overrides eval-limit)", + ) + parser.add_argument( + "--selected-instances-file", + default="", + help="Optional selected instances file used during inference", + ) + parser.add_argument( + "--image-prefix", + default="ghcr.io/openhands/swtbench-eval", + help="Registry prefix for pushed images", + ) + parser.add_argument( + "--max-workers", + type=int, + default=4, + help="Parallel builds for env images", + ) + parser.add_argument( + "--build-mode", + choices=["api", "cli"], + default="api", + help="swt-bench build mode", + ) + parser.add_argument( + "--no-push", + action="store_true", + help="Build images locally without pushing to the registry", + ) + args = parser.parse_args() + + instance_ids = ( + [iid for iid in args.instance_ids.split(",") if iid] + if args.instance_ids + else None + ) + eval_limit = None if instance_ids else args.eval_limit + selected_file = 
args.selected_instances_file or None + + swt_bench_dir = ensure_swt_bench_repo() + patch_swt_bench_for_micromamba(swt_bench_dir) + + target_ids = select_instance_ids( + dataset=args.dataset, + split=args.split, + eval_limit=eval_limit, + selected_instances_file=selected_file, + instance_ids=instance_ids, + ) + exec_specs = load_exec_specs( + swt_bench_dir, args.dataset, args.split, target_ids, filter_swt=True + ) + + build_env_images( + exec_specs, max_workers=args.max_workers, build_mode=args.build_mode + ) + + base_images = {spec.base_image_key for spec in exec_specs} + env_images = {spec.env_image_key for spec in exec_specs} + logger.info("Built images: %s base, %s env", len(base_images), len(env_images)) + + if not args.no_push: + pushed = tag_and_push(base_images | env_images, args.image_prefix) + logger.info("Pushed %s images", len(pushed)) + + manifest = { + "dataset": args.dataset, + "split": args.split, + "instances": target_ids, + "base_images": sorted(base_images), + "env_images": sorted(env_images), + "image_prefix": args.image_prefix, + } + print(json.dumps(manifest, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 94cb120a..4be058fb 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -17,6 +17,10 @@ import sys from pathlib import Path +from benchmarks.swtbench.image_utils import ( + ensure_swt_bench_repo, + patch_swt_bench_for_micromamba, +) from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -193,27 +197,8 @@ def run_swtbench_evaluation( logger.info(f"Running SWT-Bench evaluation on {predictions_file}") try: - # Use a global cache directory for SWT-Bench source - cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench" - swt_bench_dir = cache_dir / "swt-bench" - - # Clone SWT-Bench 
repository if it doesn't exist - if not swt_bench_dir.exists(): - logger.info("Setting up SWT-Bench source in global cache...") - cache_dir.mkdir(parents=True, exist_ok=True) - - logger.info("Cloning SWT-Bench repository...") - clone_cmd = [ - "git", - "clone", - "https://github.com/logic-star-ai/swt-bench.git", - str(swt_bench_dir), - ] - result = subprocess.run(clone_cmd, text=True) - if result.returncode != 0: - raise subprocess.CalledProcessError(result.returncode, clone_cmd) - - logger.info(f"SWT-Bench source installed at {swt_bench_dir}") + swt_bench_dir = ensure_swt_bench_repo() + patch_swt_bench_for_micromamba(swt_bench_dir) # Get the directory and filename of the predictions file predictions_path = Path(predictions_file).resolve() diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py new file mode 100644 index 00000000..f855b272 --- /dev/null +++ b/benchmarks/swtbench/image_utils.py @@ -0,0 +1,224 @@ +from __future__ import annotations + +import json +import logging +import subprocess +import sys +from pathlib import Path +from typing import Iterable + +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: + """ + Ensure the SWT-bench sources are available locally. + + Returns the repository path under the cache directory. 
+ """ + cache_dir = cache_dir or Path.home() / ".cache" / "openhands" / "swt-bench" + swt_bench_dir = cache_dir / "swt-bench" + + if swt_bench_dir.exists(): + return swt_bench_dir + + cache_dir.mkdir(parents=True, exist_ok=True) + logger.info("Cloning SWT-Bench repository into %s", swt_bench_dir) + result = subprocess.run( + [ + "git", + "clone", + "https://github.com/logic-star-ai/swt-bench.git", + str(swt_bench_dir), + ], + text=True, + capture_output=True, + ) + if result.returncode != 0: + logger.error("Failed to clone swt-bench: %s", result.stderr) + raise RuntimeError("Unable to clone swt-bench repository") + + return swt_bench_dir + + +def patch_swt_bench_for_micromamba( + swt_bench_dir: Path, solver_timeout_s: int = 300 +) -> None: + """ + Patch the cached swt-bench checkout to use micromamba with timeouts when + building environments. Idempotent: safe to call multiple times. + """ + dockerfiles_path = swt_bench_dir / "src" / "dockerfiles.py" + exec_spec_path = swt_bench_dir / "src" / "exec_spec.py" + + if not dockerfiles_path.exists() or not exec_spec_path.exists(): + logger.warning( + "swt-bench sources missing expected files; skipping micromamba patch " + "(dockerfiles: %s, exec_spec: %s)", + dockerfiles_path.exists(), + exec_spec_path.exists(), + ) + return + + dockerfiles_text = dockerfiles_path.read_text() + dockerfiles_updated = dockerfiles_text.replace( + "RUN conda config --append channels conda-forge\n\nRUN adduser", + "RUN conda config --append channels conda-forge\n" + "# Use micromamba for faster solver performance during env builds\n" + "RUN conda install -n base -c conda-forge -y micromamba \\\n" + " && ln -s /opt/miniconda3/bin/micromamba /usr/local/bin/micromamba\n" + "ENV MAMBA_ROOT_PREFIX=/opt/miniconda3\n" + "ENV MAMBA_EXE=/opt/miniconda3/bin/micromamba\n\n" + "RUN adduser", + ) + + exec_spec_text = exec_spec_path.read_text() + replacements = { + "conda create -n ": f"timeout {solver_timeout_s}s micromamba create -n ", + "conda create -c 
conda-forge -n ": f"timeout {solver_timeout_s}s micromamba create -c conda-forge -n ", + "conda env create --file": f"timeout {solver_timeout_s}s micromamba env create --file", + "conda env update -f": f"timeout {solver_timeout_s}s micromamba env update -f", + "conda install python=": f"timeout {solver_timeout_s}s micromamba install python=", + } + for old, new in replacements.items(): + exec_spec_text = exec_spec_text.replace(old, new) + + if dockerfiles_text != dockerfiles_updated: + dockerfiles_path.write_text(dockerfiles_updated) + logger.info("Patched swt-bench Dockerfile template to install micromamba.") + if exec_spec_path.read_text() != exec_spec_text: + exec_spec_path.write_text(exec_spec_text) + logger.info( + "Patched swt-bench exec_spec to use micromamba with a %ss timeout.", + solver_timeout_s, + ) + + +def _load_instance_ids(output_jsonl: Path) -> list[str]: + instance_ids: list[str] = [] + seen = set() + with output_jsonl.open("r", encoding="utf-8") as infile: + for line_num, line in enumerate(infile, 1): + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError: + logger.debug("Skipping invalid JSON on line %s", line_num) + continue + instance_id = data.get("instance_id") + if not instance_id or instance_id in seen: + continue + seen.add(instance_id) + instance_ids.append(instance_id) + return instance_ids + + +def compute_required_images( + output_jsonl: Path, + dataset: str, + split: str, + *, + filter_swt: bool = True, + is_swt: bool = False, +) -> tuple[set[str], set[str]]: + """ + Compute the base/env image tags required to evaluate the given predictions file. + + Returns (base_image_tags, env_image_tags). 
+ """ + instance_ids = _load_instance_ids(output_jsonl) + if not instance_ids: + raise ValueError(f"No instance_ids found in {output_jsonl}") + + swt_bench_dir = ensure_swt_bench_repo() + sys.path.insert(0, str(swt_bench_dir / "src")) + sys.path.insert(0, str(swt_bench_dir)) + + # Delay import until after sys.path manipulation so we use the cached checkout. + from src.dataset import load_swebench_dataset # type: ignore[import-not-found] + from src.exec_spec import make_exec_spec # type: ignore[import-not-found] + + dataset_entries = load_swebench_dataset( + name=dataset, split=split, is_swt=is_swt, filter_swt=filter_swt + ) + entries_by_id = {entry["instance_id"]: entry for entry in dataset_entries} + + missing = [iid for iid in instance_ids if iid not in entries_by_id] + if missing: + logger.warning( + "Predictions reference %s instance_ids not present in dataset: %s", + len(missing), + ", ".join(missing[:5]), + ) + + specs = [ + make_exec_spec(entries_by_id[iid]) + for iid in instance_ids + if iid in entries_by_id + ] + if not specs: + raise RuntimeError("No ExecSpecs produced; cannot compute required images.") + + base_images = {spec.base_image_key for spec in specs} + env_images = {spec.env_image_key for spec in specs} + logger.info( + "Computed %s base images and %s env images for %s instances", + len(base_images), + len(env_images), + len(specs), + ) + return base_images, env_images + + +def format_images_plain(images: Iterable[str]) -> str: + return "\n".join(sorted(images)) + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + description="List SWT-bench base/env images required for a predictions file." 
+ ) + parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl") + parser.add_argument("--dataset", required=True, help="Dataset name") + parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument( + "--no-filter-swt", + action="store_true", + help="Disable SWT filtering when loading the dataset", + ) + parser.add_argument( + "--format", + choices=["plain", "json"], + default="plain", + help="Output format", + ) + args = parser.parse_args() + + base_images, env_images = compute_required_images( + args.output_jsonl, + args.dataset, + args.split, + filter_swt=not args.no_filter_swt, + ) + payload = { + "base": sorted(base_images), + "env": sorted(env_images), + } + + if args.format == "json": + print(json.dumps(payload)) + else: + print(format_images_plain(payload["base"] + payload["env"])) + + +if __name__ == "__main__": + # Configure root logging for ad-hoc usage + logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") + main() diff --git a/pyproject.toml b/pyproject.toml index 0ecd0736..a3bf7b10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,8 @@ swebench-infer = "benchmarks.swebench.run_infer:main" swtbench-infer = "benchmarks.swtbench.run_infer:main" swebench-eval = "benchmarks.swebench.eval_infer:main" swtbench-eval = "benchmarks.swtbench.eval_infer:main" +swtbench-list-images = "benchmarks.swtbench.image_utils:main" +swtbench-build-eval-images = "benchmarks.swtbench.build_eval_env_images:main" gaia-infer = "benchmarks.gaia.run_infer:main" gaia-eval = "benchmarks.gaia.eval_infer:main" commit0-infer = "benchmarks.commit0.run_infer:main" diff --git a/uv.lock b/uv.lock index 9639461b..f8c7cb1e 100644 --- a/uv.lock +++ b/uv.lock @@ -947,11 +947,11 @@ wheels = [ [[package]] name = "filelock" -version = "3.19.1" +version = "3.20.3" source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/65/ce7f1b70157833bf3cb851b556a37d4547ceafc158aa9b34b36782f23696/filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1", size = 19485, upload-time = "2026-01-09T17:55:05.421Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, + { url = "https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1", size = 16701, upload-time = "2026-01-09T17:55:04.334Z" }, ] [[package]] @@ -1678,11 +1678,11 @@ wheels = [ [[package]] name = "libtmux" -version = "0.46.2" +version = "0.53.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9c/aa/7e1dcaa097156d6f3a7d8669be4389dced997feeb81744e3ff4681d65ee8/libtmux-0.46.2.tar.gz", hash = "sha256:9a398fec5d714129c8344555d466e1a903dfc0f741ba07aabe75a8ceb25c5dda", size = 346887, upload-time = "2025-05-26T19:40:04.096Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/28/e2b252817cb181aec2f42fe2d1d7fac5ec9c4d15bfb2b8ea4bd1179e4244/libtmux-0.53.0.tar.gz", hash = "sha256:1d19af4cea0c19543954d7e7317c7025c0739b029cccbe3b843212fae238f1bd", size = 405001, upload-time = "2025-12-14T11:59:11.337Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/d6/2f/9d207039fcfa00d3b30e4d765f062fbcc42c873c7518a8cfebb3eafd00e0/libtmux-0.46.2-py3-none-any.whl", hash = "sha256:6c32dbf22bde8e5e33b2714a4295f6e838dc640f337cd4c085a044f6828c7793", size = 60873, upload-time = "2025-05-26T19:40:02.284Z" }, + { url = "https://files.pythonhosted.org/packages/0e/d0/2e8bc5caa639ebb9f8801ba0be7070a28d48d8ed60e2a428d40f71fb88b8/libtmux-0.53.0-py3-none-any.whl", hash = "sha256:024b7ae6a12aae55358e8feb914c8632b3ab9bd61c0987c53559643c6a58ee4f", size = 77582, upload-time = "2025-12-14T11:59:09.739Z" }, ] [[package]] @@ -2269,7 +2269,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2405,11 +2405,12 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "deprecation" }, { name = "fastmcp" }, + { name = "filelock" }, { name = "httpx" }, { name = "litellm" }, { name = "lmnr" }, @@ -2430,10 +2431,11 @@ requires-dist = [ { name = "boto3", marker = "extra == 'boto3'", specifier = ">=1.35.0" }, { name = "deprecation", specifier = ">=2.1.0" }, { name = "fastmcp", specifier = ">=2.11.3" }, + { name = "filelock", specifier = ">=3.20.1" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "litellm", specifier = ">=1.80.10" }, { name = "lmnr", specifier = ">=0.7.24" }, - { name = "pydantic", specifier = ">=2.11.7" }, + { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "tenacity", specifier = ">=9.1.2" }, @@ -2443,7 +2445,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ 
{ name = "bashlex" }, @@ -2464,7 +2466,7 @@ requires-dist = [ { name = "browser-use", specifier = ">=0.8.0" }, { name = "cachetools" }, { name = "func-timeout", specifier = ">=4.3.5" }, - { name = "libtmux", specifier = ">=0.46.2" }, + { name = "libtmux", specifier = ">=0.53.0" }, { name = "openhands-sdk", editable = "vendor/software-agent-sdk/openhands-sdk" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "tom-swe", specifier = ">=1.0.3" }, @@ -2472,7 +2474,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -3138,21 +3140,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 
2340133, upload-time = "2025-11-04T13:40:46.66Z" }, - { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, - { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, - { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, - { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, - { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, - { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, - { url = 
"https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, - { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, - { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, - { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, - { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, 
upload-time = "2025-11-04T13:40:42.809Z" }, - { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, + { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, @@ -3161,16 +3149,31 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, - { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:05.804Z" }, - { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, - { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = 
"2025-11-04T13:41:09.827Z" }, - { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:12.379Z" }, - { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:14.627Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:16.868Z" }, - { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, - { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, - { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, + { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, + { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, + { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = 
"2025-11-04T13:41:14.627Z" }, + { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, + { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, + { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, + { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, + { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, + { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, + { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, + { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, + { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = 
"2025-11-04T13:41:42.323Z" }, + { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, + { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, + { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, + { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, + { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, + { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, From f00bc1eb0e1975a3f1fed34687b597b290fb8d60 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 14 Jan 2026 23:37:17 +0100 Subject: [PATCH 02/32] Add arch override for SWT-bench eval image builds Co-authored-by: openhands --- benchmarks/swtbench/build_eval_env_images.py | 11 +++++++++++ 1 
file changed, 11 insertions(+) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 4652a2bb..b6c07d0e 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -162,6 +162,12 @@ def main() -> None: default="ghcr.io/openhands/swtbench-eval", help="Registry prefix for pushed images", ) + parser.add_argument( + "--arch", + choices=["x86_64", "arm64", ""], + default="", + help="Force architecture for built images (defaults to host arch)", + ) parser.add_argument( "--max-workers", type=int, @@ -202,6 +208,10 @@ def main() -> None: exec_specs = load_exec_specs( swt_bench_dir, args.dataset, args.split, target_ids, filter_swt=True ) + if args.arch: + for spec in exec_specs: + spec.arch = args.arch + logger.info("Overrode ExecSpec architecture to %s", args.arch) build_env_images( exec_specs, max_workers=args.max_workers, build_mode=args.build_mode @@ -222,6 +232,7 @@ def main() -> None: "base_images": sorted(base_images), "env_images": sorted(env_images), "image_prefix": args.image_prefix, + "arch": args.arch or "host", } print(json.dumps(manifest, indent=2)) From ee1b5a683825bc07af369cb4bd699c4b7f387710 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 14 Jan 2026 23:39:00 +0100 Subject: [PATCH 03/32] Let swtbench workflow build prebaked eval env images Co-authored-by: openhands --- .github/workflows/build-swtbench-images.yml | 45 +++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index 1ca70c1b..e819178d 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -34,6 +34,24 @@ on: description: 'Software Agent SDK commit/ref to use' required: true type: string + build-eval-env: + description: 'Also build prebaked SWT-bench eval env images (default: false)' + required: false + default: 'false' + 
type: string + eval-image-prefix: + description: 'Registry prefix for prebaked eval images' + required: false + default: 'ghcr.io/openhands/swtbench-eval' + type: string + eval-arch: + description: 'Architecture for prebaked eval images' + required: false + default: 'x86_64' + type: choice + options: + - x86_64 + - arm64 concurrency: group: build-swt-bench-${{ github.ref }} @@ -158,6 +176,33 @@ jobs: DOCKER_BUILDKIT: 1 BUILDKIT_PROGRESS: plain + - name: Build prebaked eval env images + if: ${{ inputs.build-eval-env == 'true' }} + run: | + set -euo pipefail + + DATASET="${{ inputs.dataset || 'princeton-nlp/SWE-bench_Verified' }}" + SPLIT="${{ inputs.split || 'test' }}" + N_LIMIT="${{ inputs.n-limit || '0' }}" + INSTANCE_IDS="${{ inputs.instance-ids }}" + IMAGE_PREFIX="${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}" + EVAL_ARCH="${{ inputs.eval-arch || 'x86_64' }}" + MAX_WORKERS="${{ inputs.max-workers || '4' }}" + BUILD_MODE="${{ inputs.build-mode || 'api' }}" + + ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") + if [ -n "${INSTANCE_IDS}" ]; then + ARGS+=(--instance-ids "${INSTANCE_IDS}") + else + ARGS+=(--eval-limit "${N_LIMIT}") + fi + if [ -n "${EVAL_ARCH}" ]; then + ARGS+=(--arch "${EVAL_ARCH}") + fi + + echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}" + uv run swtbench-build-eval-images "${ARGS[@]}" + - name: Archive build logs if: always() run: | From c0d1432620e2c388c0392f4a8d9e6fc8819ec2c1 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 14 Jan 2026 23:44:31 +0100 Subject: [PATCH 04/32] Expose prebaked eval images publicly in swtbench workflow Co-authored-by: openhands --- .github/workflows/build-swtbench-images.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index e819178d..7f887b2b 
100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -244,6 +244,23 @@ jobs: echo "**Successful:** $SUCCESS_COUNT ✅" >> "$GITHUB_STEP_SUMMARY" echo "**Failed:** $FAIL_COUNT ❌" >> "$GITHUB_STEP_SUMMARY" + - name: Make prebaked eval image package public (best-effort) + if: ${{ inputs.build-eval-env == 'true' }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_PREFIX: ${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }} + run: | + set -euo pipefail + NAME=$(echo "${IMAGE_PREFIX}" | awk -F/ '{print $NF}') + if [ -z "$NAME" ]; then + echo "No package name derived from IMAGE_PREFIX=${IMAGE_PREFIX}, skipping visibility update" + exit 0 + fi + gh api -X PATCH \ + -H "Accept: application/vnd.github+json" \ + /orgs/OpenHands/packages/container/${NAME}/visibility \ + -f visibility=public || echo "Warning: failed to set package visibility" + - name: Comment on tracker issue if: success() run: | From ce669ad0a5552a70a06707e644bd4864ef73dd5b Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 00:10:57 +0100 Subject: [PATCH 05/32] Make micromamba patch optional for eval image builds --- benchmarks/swtbench/build_eval_env_images.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index b6c07d0e..aa269b61 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -8,10 +8,7 @@ import docker -from benchmarks.swtbench.image_utils import ( - ensure_swt_bench_repo, - patch_swt_bench_for_micromamba, -) +from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset from openhands.sdk import get_logger @@ -185,6 +182,12 @@ def main() -> None: action="store_true", help="Build images locally without pushing to the registry", ) + parser.add_argument( + 
"--use-micromamba", + action="store_true", + help="Patch swt-bench to use micromamba when building images " + "(changes env hash; off by default)", + ) args = parser.parse_args() instance_ids = ( @@ -196,7 +199,10 @@ def main() -> None: selected_file = args.selected_instances_file or None swt_bench_dir = ensure_swt_bench_repo() - patch_swt_bench_for_micromamba(swt_bench_dir) + if args.use_micromamba: + from benchmarks.swtbench.image_utils import patch_swt_bench_for_micromamba + + patch_swt_bench_for_micromamba(swt_bench_dir) target_ids = select_instance_ids( dataset=args.dataset, From e523d95cc073a36749c1eef0fa11ca42bd68a7de Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 09:50:19 +0100 Subject: [PATCH 06/32] Drop micromamba fallback in eval image build --- benchmarks/swtbench/build_eval_env_images.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index aa269b61..58dc6e4e 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -182,12 +182,6 @@ def main() -> None: action="store_true", help="Build images locally without pushing to the registry", ) - parser.add_argument( - "--use-micromamba", - action="store_true", - help="Patch swt-bench to use micromamba when building images " - "(changes env hash; off by default)", - ) args = parser.parse_args() instance_ids = ( @@ -199,10 +193,6 @@ def main() -> None: selected_file = args.selected_instances_file or None swt_bench_dir = ensure_swt_bench_repo() - if args.use_micromamba: - from benchmarks.swtbench.image_utils import patch_swt_bench_for_micromamba - - patch_swt_bench_for_micromamba(swt_bench_dir) target_ids = select_instance_ids( dataset=args.dataset, From 655d2ce4d61fc20a82d8ec84eab087922747d3f8 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 09:53:55 +0100 Subject: [PATCH 07/32] Remove redundant eval-only 
workflow --- .../workflows/build-swtbench-eval-images.yml | 116 ------------------ 1 file changed, 116 deletions(-) delete mode 100644 .github/workflows/build-swtbench-eval-images.yml diff --git a/.github/workflows/build-swtbench-eval-images.yml b/.github/workflows/build-swtbench-eval-images.yml deleted file mode 100644 index d669777d..00000000 --- a/.github/workflows/build-swtbench-eval-images.yml +++ /dev/null @@ -1,116 +0,0 @@ -name: Build SWT-Bench Eval Images - -on: - workflow_dispatch: - inputs: - dataset: - description: "Dataset name" - required: true - default: "eth-sri/SWT-bench_Verified_bm25_27k_zsp" - type: string - split: - description: "Dataset split" - required: true - default: "test" - type: string - eval-limit: - description: "Number of instances to match inference sampling (0 to disable)" - required: false - default: "1" - type: string - instance-ids: - description: "Comma-separated instance IDs to force (overrides eval-limit)" - required: false - default: "" - type: string - image-prefix: - description: "Registry prefix for pushed images" - required: false - default: "ghcr.io/openhands/swtbench-eval" - type: string - max-workers: - description: "Maximum parallel env builds" - required: false - default: "4" - type: string - build-mode: - description: "swt-bench build mode" - required: false - default: "api" - type: choice - options: - - api - - cli - -concurrency: - group: build-swtbench-eval-${{ github.ref }} - cancel-in-progress: false - -jobs: - build: - runs-on: - labels: blacksmith-32vcpu-ubuntu-2204 - permissions: - contents: read - packages: write - actions: read - - steps: - - name: Checkout repository - uses: actions/checkout@v6 - with: - submodules: recursive - - - name: Set up Docker Buildx - uses: useblacksmith/setup-docker-builder@v1 - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Install uv - 
uses: astral-sh/setup-uv@v7 - with: - enable-cache: true - - - name: Install dependencies - run: make build - - - name: Build and push prebaked eval env images - env: - DATASET: ${{ inputs.dataset }} - SPLIT: ${{ inputs.split }} - EVAL_LIMIT: ${{ inputs.eval-limit }} - INSTANCE_IDS: ${{ inputs.instance-ids }} - IMAGE_PREFIX: ${{ inputs.image-prefix }} - MAX_WORKERS: ${{ inputs.max-workers }} - BUILD_MODE: ${{ inputs.build-mode }} - run: | - set -euo pipefail - ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") - if [ -n "${INSTANCE_IDS}" ]; then - ARGS+=(--instance-ids "${INSTANCE_IDS}") - else - ARGS+=(--eval-limit "${EVAL_LIMIT}") - fi - uv run swtbench-build-eval-images "${ARGS[@]}" - - - name: Make image package public (best-effort) - if: github.repository_owner == 'OpenHands' - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IMAGE_PREFIX: ${{ inputs.image-prefix }} - run: | - set -euo pipefail - NAME=$(echo "${IMAGE_PREFIX}" | awk -F/ '{print $NF}') - if [ -z "$NAME" ]; then - echo "No package name derived from IMAGE_PREFIX=${IMAGE_PREFIX}, skipping visibility update" - exit 0 - fi - gh api -X PATCH \ - -H "Accept: application/vnd.github+json" \ - /user/packages/container/${NAME}/visibility \ - -f visibility=public || echo "Warning: failed to set package visibility" From cfab9a960bc9675209f1557afdcf74cd8d110f21 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 09:56:54 +0100 Subject: [PATCH 08/32] Add verbose diagnostics around eval image build step --- .github/workflows/build-swtbench-images.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index 7f887b2b..ca07d32a 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -181,6 +181,12 @@ jobs: run: | set -euo pipefail + echo 
"Starting prebaked eval env image build at $(date -u)" + echo "Runner: $(uname -a)" + df -h + docker system df || true + docker info || true + DATASET="${{ inputs.dataset || 'princeton-nlp/SWE-bench_Verified' }}" SPLIT="${{ inputs.split || 'test' }}" N_LIMIT="${{ inputs.n-limit || '0' }}" @@ -201,7 +207,11 @@ jobs: fi echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}" - uv run swtbench-build-eval-images "${ARGS[@]}" + PYTHONUNBUFFERED=1 uv run swtbench-build-eval-images "${ARGS[@]}" | tee build_eval_env.log + + echo "Completed prebaked eval env image build at $(date -u)" + docker ps -a || true + docker system df || true - name: Archive build logs if: always() From 154a1592ab47e793908fb3abfed6cce21e9dadce Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 10:34:08 +0100 Subject: [PATCH 09/32] Use buildx/cli path for eval images and add runner diagnostics --- .github/workflows/build-swtbench-images.yml | 28 +++++++++++++++++++- benchmarks/swtbench/build_eval_env_images.py | 2 +- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index ca07d32a..c1a524ab 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -194,7 +194,29 @@ jobs: IMAGE_PREFIX="${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}" EVAL_ARCH="${{ inputs.eval-arch || 'x86_64' }}" MAX_WORKERS="${{ inputs.max-workers || '4' }}" - BUILD_MODE="${{ inputs.build-mode || 'api' }}" + BUILD_MODE="${{ inputs.build-mode || 'cli' }}" + # Map to docker platform string + if [ "${EVAL_ARCH}" = "x86_64" ]; then + DOCKER_PLATFORM="linux/amd64" + else + DOCKER_PLATFORM="linux/${EVAL_ARCH}" + fi + + # Basic BuildKit disk guard similar to SWE-bench + if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then + LINE=$(tail -n1 /tmp/buildkit_df) + TOTAL=$(echo "$LINE" | awk '{print $2}') + 
USED=$(echo "$LINE" | awk '{print $3}') + FREE=$(echo "$LINE" | awk '{print $4}') + if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then + PCT=$(( 100 * USED / TOTAL )) + echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes" + else + echo "Warning: unable to parse df output for /var/lib/buildkit" + fi + else + echo "Warning: /var/lib/buildkit not found; skipping disk check" + fi ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") if [ -n "${INSTANCE_IDS}" ]; then @@ -207,6 +229,10 @@ jobs: fi echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}" + DOCKER_DEFAULT_PLATFORM="${DOCKER_PLATFORM}" \ + DOCKER_BUILDKIT=1 \ + BUILDKIT_PROGRESS=plain \ + BUILDKIT_RESET_ON_FAILURE=1 \ PYTHONUNBUFFERED=1 uv run swtbench-build-eval-images "${ARGS[@]}" | tee build_eval_env.log echo "Completed prebaked eval env image build at $(date -u)" diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 58dc6e4e..c7040e7b 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -174,7 +174,7 @@ def main() -> None: parser.add_argument( "--build-mode", choices=["api", "cli"], - default="api", + default="cli", help="swt-bench build mode", ) parser.add_argument( From 6380c419cdcfcfbde5521612256f4f9493cd0deb Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 10:43:35 +0100 Subject: [PATCH 10/32] Add batching/retries and buildx settings to SWT-bench eval builds --- .github/workflows/build-swtbench-images.yml | 18 ++++- benchmarks/swtbench/build_eval_env_images.py | 77 +++++++++++++++++--- 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index c1a524ab..ec72a6bb 100644 --- a/.github/workflows/build-swtbench-images.yml +++ 
b/.github/workflows/build-swtbench-images.yml @@ -52,6 +52,16 @@ on: options: - x86_64 - arm64 + max-retries: + description: 'Retries per batch for eval env builds' + required: false + default: '2' + type: string + build-batch-size: + description: 'Env images per batch for eval env builds' + required: false + default: '10' + type: string concurrency: group: build-swt-bench-${{ github.ref }} @@ -195,6 +205,12 @@ jobs: EVAL_ARCH="${{ inputs.eval-arch || 'x86_64' }}" MAX_WORKERS="${{ inputs.max-workers || '4' }}" BUILD_MODE="${{ inputs.build-mode || 'cli' }}" + MAX_RETRIES="${{ inputs.max-retries || '2' }}" + BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '10' }}" + + echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV" + echo "MAX_RETRIES=${MAX_RETRIES}" >> "$GITHUB_ENV" + echo "BUILD_BATCH_SIZE=${BUILD_BATCH_SIZE}" >> "$GITHUB_ENV" # Map to docker platform string if [ "${EVAL_ARCH}" = "x86_64" ]; then DOCKER_PLATFORM="linux/amd64" @@ -218,7 +234,7 @@ jobs: echo "Warning: /var/lib/buildkit not found; skipping disk check" fi - ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") + ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}" --max-retries "${MAX_RETRIES}" --build-batch-size "${BUILD_BATCH_SIZE}") if [ -n "${INSTANCE_IDS}" ]; then ARGS+=(--instance-ids "${INSTANCE_IDS}") else diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index c7040e7b..02ff07fb 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -4,7 +4,7 @@ import json import sys from pathlib import Path -from typing import Iterable +from typing import Iterable, Iterator, List, Sequence import docker @@ -81,11 +81,18 @@ def load_exec_specs( return specs -def build_env_images(exec_specs: list, max_workers: int, build_mode: str) 
-> None: +def build_env_images( + exec_specs: list, + max_workers: int, + build_mode: str, + max_retries: int, + batch_size: int, +) -> None: """ Build base + environment images required by the provided ExecSpecs. """ from src.docker_build import ( # type: ignore[import-not-found] + BuildImageError, build_base_images, build_env_images as build_envs, ) @@ -99,13 +106,49 @@ def build_env_images(exec_specs: list, max_workers: int, build_mode: str) -> Non max_workers, ) build_base_images(client, exec_specs, force_rebuild=False, build_mode=build_mode) - build_envs( - client, - exec_specs, - force_rebuild=False, - max_workers=max_workers, - build_mode=build_mode, + batches = list(chunked(exec_specs, max(1, batch_size))) + logger.info( + "Building env images in %s batches (batch_size=%s)", len(batches), batch_size ) + for idx, batch in enumerate(batches, start=1): + attempt = 0 + while True: + try: + logger.info( + "Batch %s/%s: building %s env images", idx, len(batches), len(batch) + ) + build_envs( + client, + batch, + force_rebuild=False, + max_workers=max_workers, + build_mode=build_mode, + ) + break + except BuildImageError as exc: + attempt += 1 + if attempt > max_retries: + logger.error( + "Batch %s/%s failed after %s attempts: %s", + idx, + len(batches), + max_retries, + exc, + ) + raise + logger.warning( + "Batch %s/%s failed (attempt %s/%s): %s; retrying", + idx, + len(batches), + attempt, + max_retries, + exc, + ) + + +def chunked(seq: Sequence, size: int) -> Iterator[List]: + for i in range(0, len(seq), size): + yield list(seq[i : i + size]) def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: @@ -171,6 +214,18 @@ def main() -> None: default=4, help="Parallel builds for env images", ) + parser.add_argument( + "--max-retries", + type=int, + default=2, + help="Retries per batch for env image builds", + ) + parser.add_argument( + "--build-batch-size", + type=int, + default=10, + help="Number of env images to build per batch", + ) 
parser.add_argument( "--build-mode", choices=["api", "cli"], @@ -210,7 +265,11 @@ def main() -> None: logger.info("Overrode ExecSpec architecture to %s", args.arch) build_env_images( - exec_specs, max_workers=args.max_workers, build_mode=args.build_mode + exec_specs, + max_workers=args.max_workers, + build_mode=args.build_mode, + max_retries=args.max_retries, + batch_size=args.build_batch_size, ) base_images = {spec.base_image_key for spec in exec_specs} From 328ef48eb8f4c747f50f1add57fc08a025d7b055 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 10:50:41 +0100 Subject: [PATCH 11/32] Remove local platform override; keep buildx batching/retries --- .github/workflows/build-swtbench-images.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index ec72a6bb..c375f519 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -211,12 +211,6 @@ jobs: echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV" echo "MAX_RETRIES=${MAX_RETRIES}" >> "$GITHUB_ENV" echo "BUILD_BATCH_SIZE=${BUILD_BATCH_SIZE}" >> "$GITHUB_ENV" - # Map to docker platform string - if [ "${EVAL_ARCH}" = "x86_64" ]; then - DOCKER_PLATFORM="linux/amd64" - else - DOCKER_PLATFORM="linux/${EVAL_ARCH}" - fi # Basic BuildKit disk guard similar to SWE-bench if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then @@ -245,7 +239,6 @@ jobs: fi echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}" - DOCKER_DEFAULT_PLATFORM="${DOCKER_PLATFORM}" \ DOCKER_BUILDKIT=1 \ BUILDKIT_PROGRESS=plain \ BUILDKIT_RESET_ON_FAILURE=1 \ From 510277069e7c30213d8add485fcf63bb54fef4d8 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 13:48:57 +0100 Subject: [PATCH 12/32] Skip rebuilding existing swtbench images --- benchmarks/swtbench/build_eval_env_images.py | 59 ++++++++++++++++---- 1 file changed, 49 insertions(+), 
10 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 02ff07fb..aa96740b 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -7,6 +7,7 @@ from typing import Iterable, Iterator, List, Sequence import docker +from docker.errors import ImageNotFound from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset @@ -98,17 +99,47 @@ def build_env_images( ) client = docker.from_env() + total_base = len({spec.base_image_key for spec in exec_specs}) + total_env = len({spec.env_image_key for spec in exec_specs}) + + base_missing: dict[str, bool] = {} + for spec in exec_specs: + key = spec.base_image_key + if key not in base_missing: + base_missing[key] = not image_exists(client, key) + missing_base_specs = [spec for spec in exec_specs if base_missing[spec.base_image_key]] + skipped_base = total_base - len({spec.base_image_key for spec in missing_base_specs}) + + if missing_base_specs: + logger.info( + "Building %s/%s base images (skipping %s already present)", + len({spec.base_image_key for spec in missing_base_specs}), + total_base, + skipped_base, + ) + build_base_images( + client, missing_base_specs, force_rebuild=False, build_mode=build_mode + ) + else: + logger.info("All %s base images already exist; skipping base builds", total_base) + + env_missing: dict[str, bool] = {} + for spec in exec_specs: + key = spec.env_image_key + if key not in env_missing: + env_missing[key] = not image_exists(client, key) + missing_env_specs = [spec for spec in exec_specs if env_missing[spec.env_image_key]] + if not missing_env_specs: + logger.info("All %s env images already exist; skipping env builds", total_env) + return + + batches = list(chunked(missing_env_specs, max(1, batch_size))) logger.info( - "Building %s base images and %s env images (mode=%s, workers=%s)", - len({spec.base_image_key for spec in 
exec_specs}), - len({spec.env_image_key for spec in exec_specs}), - build_mode, - max_workers, - ) - build_base_images(client, exec_specs, force_rebuild=False, build_mode=build_mode) - batches = list(chunked(exec_specs, max(1, batch_size))) - logger.info( - "Building env images in %s batches (batch_size=%s)", len(batches), batch_size + "Building %s/%s env images in %s batches (batch_size=%s)", + len({spec.env_image_key for spec in missing_env_specs}), + total_env, + len(batches), + batch_size, ) for idx, batch in enumerate(batches, start=1): attempt = 0 @@ -151,6 +182,14 @@ def chunked(seq: Sequence, size: int) -> Iterator[List]: yield list(seq[i : i + size]) +def image_exists(client: docker.DockerClient, tag: str) -> bool: + try: + client.images.get(tag) + return True + except ImageNotFound: + return False + + def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: """ Tag the provided images with the registry prefix and push them. From 9610efd369abfd9ca27ad686a992f921a25e5080 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 14:12:33 +0100 Subject: [PATCH 13/32] Reuse remote SWT-bench images and fix dataset cwd --- benchmarks/swtbench/build_eval_env_images.py | 108 ++++++++++++++----- 1 file changed, 83 insertions(+), 25 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index aa96740b..a8cc0e21 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -2,6 +2,7 @@ import argparse import json +import os import sys from pathlib import Path from typing import Iterable, Iterator, List, Sequence @@ -11,6 +12,7 @@ from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset +from benchmarks.utils.image_utils import image_exists as remote_image_exists from openhands.sdk import get_logger @@ -58,9 +60,14 @@ def load_exec_specs( from src.dataset import 
load_swebench_dataset # type: ignore[import-not-found] from src.exec_spec import make_exec_spec # type: ignore[import-not-found] - dataset_entries = load_swebench_dataset( - name=dataset, split=split, is_swt=False, filter_swt=filter_swt - ) + cwd = os.getcwd() + try: + os.chdir(swt_bench_dir) + dataset_entries = load_swebench_dataset( + name=dataset, split=split, is_swt=False, filter_swt=filter_swt + ) + finally: + os.chdir(cwd) by_id = {entry["instance_id"]: entry for entry in dataset_entries} specs = [] @@ -88,7 +95,8 @@ def build_env_images( build_mode: str, max_retries: int, batch_size: int, -) -> None: + image_prefix: str | None, +) -> tuple[set[str], set[str]]: """ Build base + environment images required by the provided ExecSpecs. """ @@ -101,14 +109,50 @@ def build_env_images( client = docker.from_env() total_base = len({spec.base_image_key for spec in exec_specs}) total_env = len({spec.env_image_key for spec in exec_specs}) + remote_prefix = image_prefix.rstrip("/") if image_prefix else None + + base_to_push: set[str] = set() + base_to_build_keys: set[str] = set() + + def prefixed(tag: str) -> str | None: + return f"{remote_prefix}/{tag}" if remote_prefix else None + + def ensure_local(tag: str) -> bool: + try: + client.images.get(tag) + return True + except ImageNotFound: + return False - base_missing: dict[str, bool] = {} + base_spec_by_key = {} for spec in exec_specs: key = spec.base_image_key - if key not in base_missing: - base_missing[key] = not image_exists(client, key) - missing_base_specs = [spec for spec in exec_specs if base_missing[spec.base_image_key]] - skipped_base = total_base - len({spec.base_image_key for spec in missing_base_specs}) + base_spec_by_key.setdefault(key, spec) + remote_tag = prefixed(key) + + if remote_tag and remote_image_exists(remote_tag): + logger.info("Base image %s already in registry; reusing", remote_tag) + if not ensure_local(key): + try: + img = client.images.pull(remote_tag) + if remote_tag != key: + img.tag(key) 
+ except Exception as exc: # pragma: no cover - best effort + logger.warning( + "Failed to pull %s (%s); will rebuild locally", remote_tag, exc + ) + base_to_build_keys.add(key) + continue + continue + + if ensure_local(key): + base_to_push.add(key) + continue + + base_to_build_keys.add(key) + + missing_base_specs = [base_spec_by_key[k] for k in base_to_build_keys] + skipped_base = total_base - len(base_to_build_keys) if missing_base_specs: logger.info( @@ -120,18 +164,32 @@ def build_env_images( build_base_images( client, missing_base_specs, force_rebuild=False, build_mode=build_mode ) + base_built = {spec.base_image_key for spec in missing_base_specs} + base_to_push.update(base_built) else: logger.info("All %s base images already exist; skipping base builds", total_base) - env_missing: dict[str, bool] = {} + env_to_push: set[str] = set() + missing_env_specs: list = [] + for spec in exec_specs: key = spec.env_image_key - if key not in env_missing: - env_missing[key] = not image_exists(client, key) - missing_env_specs = [spec for spec in exec_specs if env_missing[spec.env_image_key]] + remote_tag = prefixed(key) + + if remote_tag and remote_image_exists(remote_tag): + logger.info("Env image %s already in registry; skipping build", remote_tag) + continue + + if ensure_local(key): + logger.info("Env image %s already present locally; reusing", key) + env_to_push.add(key) + continue + + missing_env_specs.append(spec) + if not missing_env_specs: logger.info("All %s env images already exist; skipping env builds", total_env) - return + return base_to_push, env_to_push batches = list(chunked(missing_env_specs, max(1, batch_size))) logger.info( @@ -175,6 +233,9 @@ def build_env_images( max_retries, exc, ) + env_to_push.update({spec.env_image_key for spec in missing_env_specs}) + + return base_to_push, env_to_push def chunked(seq: Sequence, size: int) -> Iterator[List]: @@ -182,14 +243,6 @@ def chunked(seq: Sequence, size: int) -> Iterator[List]: yield list(seq[i : i + 
size]) -def image_exists(client: docker.DockerClient, tag: str) -> bool: - try: - client.images.get(tag) - return True - except ImageNotFound: - return False - - def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: """ Tag the provided images with the registry prefix and push them. @@ -303,12 +356,13 @@ def main() -> None: spec.arch = args.arch logger.info("Overrode ExecSpec architecture to %s", args.arch) - build_env_images( + base_to_push, env_to_push = build_env_images( exec_specs, max_workers=args.max_workers, build_mode=args.build_mode, max_retries=args.max_retries, batch_size=args.build_batch_size, + image_prefix=None if args.no_push else args.image_prefix, ) base_images = {spec.base_image_key for spec in exec_specs} @@ -316,8 +370,12 @@ def main() -> None: logger.info("Built images: %s base, %s env", len(base_images), len(env_images)) if not args.no_push: - pushed = tag_and_push(base_images | env_images, args.image_prefix) - logger.info("Pushed %s images", len(pushed)) + to_push = base_to_push | env_to_push + if to_push: + pushed = tag_and_push(to_push, args.image_prefix) + logger.info("Pushed %s images", len(pushed)) + else: + logger.info("No images need pushing; all present in registry") manifest = { "dataset": args.dataset, From 0e1ddce6ce3a97b45ccc654ca2fed8fad862f378 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 14:17:04 +0100 Subject: [PATCH 14/32] Only build/push when registry missing --- benchmarks/swtbench/build_eval_env_images.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index a8cc0e21..ac04d83b 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -145,10 +145,6 @@ def ensure_local(tag: str) -> bool: continue continue - if ensure_local(key): - base_to_push.add(key) - continue - base_to_build_keys.add(key) missing_base_specs = [base_spec_by_key[k] 
for k in base_to_build_keys] @@ -180,11 +176,6 @@ def ensure_local(tag: str) -> bool: logger.info("Env image %s already in registry; skipping build", remote_tag) continue - if ensure_local(key): - logger.info("Env image %s already present locally; reusing", key) - env_to_push.add(key) - continue - missing_env_specs.append(spec) if not missing_env_specs: From 7a0f1824f66aeced1e9fa0df7f44b3d865b64e25 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 14:18:16 +0100 Subject: [PATCH 15/32] Always pull remote base images; drop local fallback --- benchmarks/swtbench/build_eval_env_images.py | 29 +++++++------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index ac04d83b..66380c91 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -8,7 +8,6 @@ from typing import Iterable, Iterator, List, Sequence import docker -from docker.errors import ImageNotFound from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset @@ -117,13 +116,6 @@ def build_env_images( def prefixed(tag: str) -> str | None: return f"{remote_prefix}/{tag}" if remote_prefix else None - def ensure_local(tag: str) -> bool: - try: - client.images.get(tag) - return True - except ImageNotFound: - return False - base_spec_by_key = {} for spec in exec_specs: key = spec.base_image_key @@ -132,17 +124,16 @@ def ensure_local(tag: str) -> bool: if remote_tag and remote_image_exists(remote_tag): logger.info("Base image %s already in registry; reusing", remote_tag) - if not ensure_local(key): - try: - img = client.images.pull(remote_tag) - if remote_tag != key: - img.tag(key) - except Exception as exc: # pragma: no cover - best effort - logger.warning( - "Failed to pull %s (%s); will rebuild locally", remote_tag, exc - ) - base_to_build_keys.add(key) - continue + try: + img = 
client.images.pull(remote_tag) + if remote_tag != key: + img.tag(key) + except Exception as exc: # pragma: no cover - best effort + logger.warning( + "Failed to pull %s (%s); will rebuild locally", remote_tag, exc + ) + base_to_build_keys.add(key) + continue continue base_to_build_keys.add(key) From a32372954f8867422246b634ba7091f8f0f69e7b Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 15:38:45 +0100 Subject: [PATCH 16/32] Drop micromamba patch; fail if prebaked images missing --- benchmarks/swtbench/eval_infer.py | 6 +--- benchmarks/swtbench/image_utils.py | 53 ------------------------------ 2 files changed, 1 insertion(+), 58 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4be058fb..b7b468b0 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -17,10 +17,7 @@ import sys from pathlib import Path -from benchmarks.swtbench.image_utils import ( - ensure_swt_bench_repo, - patch_swt_bench_for_micromamba, -) +from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -198,7 +195,6 @@ def run_swtbench_evaluation( try: swt_bench_dir = ensure_swt_bench_repo() - patch_swt_bench_for_micromamba(swt_bench_dir) # Get the directory and filename of the predictions file predictions_path = Path(predictions_file).resolve() diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index f855b272..b76ac5ec 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -44,59 +44,6 @@ def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: return swt_bench_dir -def patch_swt_bench_for_micromamba( - swt_bench_dir: Path, solver_timeout_s: int = 300 -) -> None: - """ - Patch the cached swt-bench checkout to use micromamba 
with timeouts when - building environments. Idempotent: safe to call multiple times. - """ - dockerfiles_path = swt_bench_dir / "src" / "dockerfiles.py" - exec_spec_path = swt_bench_dir / "src" / "exec_spec.py" - - if not dockerfiles_path.exists() or not exec_spec_path.exists(): - logger.warning( - "swt-bench sources missing expected files; skipping micromamba patch " - "(dockerfiles: %s, exec_spec: %s)", - dockerfiles_path.exists(), - exec_spec_path.exists(), - ) - return - - dockerfiles_text = dockerfiles_path.read_text() - dockerfiles_updated = dockerfiles_text.replace( - "RUN conda config --append channels conda-forge\n\nRUN adduser", - "RUN conda config --append channels conda-forge\n" - "# Use micromamba for faster solver performance during env builds\n" - "RUN conda install -n base -c conda-forge -y micromamba \\\n" - " && ln -s /opt/miniconda3/bin/micromamba /usr/local/bin/micromamba\n" - "ENV MAMBA_ROOT_PREFIX=/opt/miniconda3\n" - "ENV MAMBA_EXE=/opt/miniconda3/bin/micromamba\n\n" - "RUN adduser", - ) - - exec_spec_text = exec_spec_path.read_text() - replacements = { - "conda create -n ": f"timeout {solver_timeout_s}s micromamba create -n ", - "conda create -c conda-forge -n ": f"timeout {solver_timeout_s}s micromamba create -c conda-forge -n ", - "conda env create --file": f"timeout {solver_timeout_s}s micromamba env create --file", - "conda env update -f": f"timeout {solver_timeout_s}s micromamba env update -f", - "conda install python=": f"timeout {solver_timeout_s}s micromamba install python=", - } - for old, new in replacements.items(): - exec_spec_text = exec_spec_text.replace(old, new) - - if dockerfiles_text != dockerfiles_updated: - dockerfiles_path.write_text(dockerfiles_updated) - logger.info("Patched swt-bench Dockerfile template to install micromamba.") - if exec_spec_path.read_text() != exec_spec_text: - exec_spec_path.write_text(exec_spec_text) - logger.info( - "Patched swt-bench exec_spec to use micromamba with a %ss timeout.", - 
solver_timeout_s, - ) - - def _load_instance_ids(output_jsonl: Path) -> list[str]: instance_ids: list[str] = [] seen = set() From 8ac449d22ef6e39e1a43d54cc758b0adbf64c261 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 15:58:24 +0100 Subject: [PATCH 17/32] Format with pre-commit --- benchmarks/swtbench/build_eval_env_images.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 66380c91..ffd684e9 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -154,7 +154,9 @@ def prefixed(tag: str) -> str | None: base_built = {spec.base_image_key for spec in missing_base_specs} base_to_push.update(base_built) else: - logger.info("All %s base images already exist; skipping base builds", total_base) + logger.info( + "All %s base images already exist; skipping base builds", total_base + ) env_to_push: set[str] = set() missing_env_specs: list = [] From 1f7248a59964e368f955078df2d44d41b4e789dc Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 17:04:38 +0100 Subject: [PATCH 18/32] Push eval images as they are built --- benchmarks/swtbench/build_eval_env_images.py | 28 ++++++++------------ 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index ffd684e9..b147c112 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -95,9 +95,12 @@ def build_env_images( max_retries: int, batch_size: int, image_prefix: str | None, -) -> tuple[set[str], set[str]]: +) -> None: """ Build base + environment images required by the provided ExecSpecs. + + Images are pushed immediately after each successful build when image_prefix is set, + so partial progress is kept if the workflow fails mid-run. 
""" from src.docker_build import ( # type: ignore[import-not-found] BuildImageError, @@ -110,7 +113,6 @@ def build_env_images( total_env = len({spec.env_image_key for spec in exec_specs}) remote_prefix = image_prefix.rstrip("/") if image_prefix else None - base_to_push: set[str] = set() base_to_build_keys: set[str] = set() def prefixed(tag: str) -> str | None: @@ -152,13 +154,13 @@ def prefixed(tag: str) -> str | None: client, missing_base_specs, force_rebuild=False, build_mode=build_mode ) base_built = {spec.base_image_key for spec in missing_base_specs} - base_to_push.update(base_built) + if image_prefix: + tag_and_push(base_built, image_prefix) else: logger.info( "All %s base images already exist; skipping base builds", total_base ) - env_to_push: set[str] = set() missing_env_specs: list = [] for spec in exec_specs: @@ -173,7 +175,7 @@ def prefixed(tag: str) -> str | None: if not missing_env_specs: logger.info("All %s env images already exist; skipping env builds", total_env) - return base_to_push, env_to_push + return batches = list(chunked(missing_env_specs, max(1, batch_size))) logger.info( @@ -197,6 +199,8 @@ def prefixed(tag: str) -> str | None: max_workers=max_workers, build_mode=build_mode, ) + if image_prefix: + tag_and_push({spec.env_image_key for spec in batch}, image_prefix) break except BuildImageError as exc: attempt += 1 @@ -217,9 +221,7 @@ def prefixed(tag: str) -> str | None: max_retries, exc, ) - env_to_push.update({spec.env_image_key for spec in missing_env_specs}) - - return base_to_push, env_to_push + return def chunked(seq: Sequence, size: int) -> Iterator[List]: @@ -340,7 +342,7 @@ def main() -> None: spec.arch = args.arch logger.info("Overrode ExecSpec architecture to %s", args.arch) - base_to_push, env_to_push = build_env_images( + build_env_images( exec_specs, max_workers=args.max_workers, build_mode=args.build_mode, @@ -353,14 +355,6 @@ def main() -> None: env_images = {spec.env_image_key for spec in exec_specs} logger.info("Built 
images: %s base, %s env", len(base_images), len(env_images)) - if not args.no_push: - to_push = base_to_push | env_to_push - if to_push: - pushed = tag_and_push(to_push, args.image_prefix) - logger.info("Pushed %s images", len(pushed)) - else: - logger.info("No images need pushing; all present in registry") - manifest = { "dataset": args.dataset, "split": args.split, From d10418cf018c95f6688cef1ebc30618abd602529 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 16 Jan 2026 16:35:50 +0100 Subject: [PATCH 19/32] Fallback to micromamba when prebaked swtbench eval images missing Co-authored-by: openhands --- benchmarks/swtbench/eval_infer.py | 51 ++++++++++++++-- benchmarks/swtbench/image_utils.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 5 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index d98dbdde..1476131e 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,10 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.image_utils import ensure_swt_bench_repo +from benchmarks.swtbench.image_utils import ( + ensure_swt_bench_repo, + pull_prebaked_eval_images, +) from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -246,13 +249,51 @@ def run_swtbench_evaluation( logger.info(f"Running SWT-Bench evaluation on {predictions_file}") try: + predictions_path = Path(predictions_file).resolve() + predictions_filename = predictions_path.name + swt_bench_dir = ensure_swt_bench_repo() - patch_swt_bench_for_micromamba(swt_bench_dir) + prebaked_ok, prebaked_details = pull_prebaked_eval_images( + predictions_path, dataset, split=os.getenv("SWT_BENCH_SPLIT", "test") + ) + if prebaked_ok: + logger.info( + "Using prebaked SWT-Bench eval images from %s (%s pulled).", + prebaked_details.get("prefix"), + 
len(prebaked_details.get("pulled", [])), + ) + else: + missing = prebaked_details.get("missing", []) + sample_missing = ", ".join( + (m.get("remote") or m.get("tag", "")) + + (f" [{m.get('reason')}]" if m.get("reason") else "") + for m in missing[:5] + ) + logger.warning( + "Prebaked SWT-Bench eval images unavailable; falling back to micromamba builds. " + "prefix=%s dataset=%s split=%s required=%s missing=%s sample_missing=%s auth=%s detail=%s", + prebaked_details.get("prefix"), + dataset, + prebaked_details.get("split"), + prebaked_details.get("required_count"), + len(missing), + sample_missing or "n/a", + "yes" if prebaked_details.get("used_auth") else "no", + prebaked_details.get("error") or "missing images", + ) + pull_errors = prebaked_details.get("pull_errors") or [] + if pull_errors: + logger.info( + "Pull/tag issues (truncated): %s", + "; ".join( + f"{err.get('remote')}: {err.get('reason')}" + + (f" ({err.get('error')})" if err.get("error") else "") + for err in pull_errors[:3] + ), + ) - # Get the directory and filename of the predictions file - predictions_path = Path(predictions_file).resolve() - predictions_filename = predictions_path.name + patch_swt_bench_for_micromamba(swt_bench_dir) # Copy predictions file to swt-bench directory swt_predictions_file = swt_bench_dir / predictions_filename diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index b76ac5ec..76f7d6fc 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -2,15 +2,18 @@ import json import logging +import os import subprocess import sys from pathlib import Path from typing import Iterable +from benchmarks.utils.image_utils import image_exists from openhands.sdk import get_logger logger = get_logger(__name__) +DEFAULT_EVAL_IMAGE_PREFIX = "ghcr.io/openhands/swtbench-eval" def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: @@ -126,6 +129,100 @@ def format_images_plain(images: Iterable[str]) -> str: return 
"\n".join(sorted(images)) +def _run_docker(cmd: list[str]) -> tuple[bool, str]: + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + return False, (result.stderr or result.stdout or "").strip() + return True, (result.stdout or "").strip() + + +def pull_prebaked_eval_images( + predictions_file: Path, + dataset: str, + split: str, + *, + image_prefix: str | None = None, + gh_username: str | None = None, + gh_pat: str | None = None, +) -> tuple[bool, dict]: + """ + Attempt to pull prebaked SWT-bench eval base/env images from a registry. + + Returns (all_available, details_dict). + """ + prefix = ( + image_prefix + or os.getenv("SWT_BENCH_EVAL_IMAGE_PREFIX") + or DEFAULT_EVAL_IMAGE_PREFIX + ).rstrip("/") + details: dict = { + "prefix": prefix, + "dataset": dataset, + "split": split, + } + + if not prefix: + details["error"] = "empty_prefix" + return False, details + + try: + base_images, env_images = compute_required_images( + predictions_file, dataset, split + ) + except Exception as exc: # pragma: no cover - network/FS issues + details["error"] = f"compute_failed: {exc}" + return False, details + + required = sorted(base_images | env_images) + details["required_count"] = len(required) + if not required: + details["error"] = "no_required_images" + return False, details + + gh_user = gh_username or os.getenv("GHCR_USERNAME") or os.getenv("GITHUB_ACTOR") + gh_token = gh_pat or os.getenv("GHCR_PAT") + + missing: list[dict] = [] + pulled: list[str] = [] + pull_errors: list[dict] = [] + + for tag in required: + remote_tag = f"{prefix}/{tag}" + exists = image_exists(remote_tag, gh_username=gh_user, gh_pat=gh_token) + if not exists: + missing.append({"remote": remote_tag, "tag": tag, "reason": "not_found"}) + continue + + ok, err = _run_docker(["docker", "pull", remote_tag]) + if not ok: + pull_errors.append( + { + "remote": remote_tag, + "tag": tag, + "reason": "pull_failed", + "error": err, + } + ) + missing.append({"remote": 
remote_tag, "tag": tag, "reason": "pull_failed"}) + continue + + ok, err = _run_docker(["docker", "tag", remote_tag, tag]) + if not ok: + pull_errors.append( + {"remote": remote_tag, "tag": tag, "reason": "tag_failed", "error": err} + ) + missing.append({"remote": remote_tag, "tag": tag, "reason": "tag_failed"}) + continue + + pulled.append(tag) + + details["missing"] = missing + details["pulled"] = pulled + details["pull_errors"] = pull_errors + details["used_auth"] = bool(gh_user and gh_token) + return len(missing) == 0, details + + def main() -> None: import argparse From ab15ea7586bf7bdc89fff1b026685d34cb67a24a Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 16 Jan 2026 16:42:27 +0100 Subject: [PATCH 20/32] Revert "Fallback to micromamba when prebaked swtbench eval images missing" This reverts commit d10418cf018c95f6688cef1ebc30618abd602529. --- benchmarks/swtbench/eval_infer.py | 51 ++-------------- benchmarks/swtbench/image_utils.py | 97 ------------------------------ 2 files changed, 5 insertions(+), 143 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 1476131e..d98dbdde 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,10 +18,7 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.image_utils import ( - ensure_swt_bench_repo, - pull_prebaked_eval_images, -) +from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -249,51 +246,13 @@ def run_swtbench_evaluation( logger.info(f"Running SWT-Bench evaluation on {predictions_file}") try: - predictions_path = Path(predictions_file).resolve() - predictions_filename = predictions_path.name - swt_bench_dir = ensure_swt_bench_repo() - prebaked_ok, prebaked_details = pull_prebaked_eval_images( - 
predictions_path, dataset, split=os.getenv("SWT_BENCH_SPLIT", "test") - ) - if prebaked_ok: - logger.info( - "Using prebaked SWT-Bench eval images from %s (%s pulled).", - prebaked_details.get("prefix"), - len(prebaked_details.get("pulled", [])), - ) - else: - missing = prebaked_details.get("missing", []) - sample_missing = ", ".join( - (m.get("remote") or m.get("tag", "")) - + (f" [{m.get('reason')}]" if m.get("reason") else "") - for m in missing[:5] - ) - logger.warning( - "Prebaked SWT-Bench eval images unavailable; falling back to micromamba builds. " - "prefix=%s dataset=%s split=%s required=%s missing=%s sample_missing=%s auth=%s detail=%s", - prebaked_details.get("prefix"), - dataset, - prebaked_details.get("split"), - prebaked_details.get("required_count"), - len(missing), - sample_missing or "n/a", - "yes" if prebaked_details.get("used_auth") else "no", - prebaked_details.get("error") or "missing images", - ) - pull_errors = prebaked_details.get("pull_errors") or [] - if pull_errors: - logger.info( - "Pull/tag issues (truncated): %s", - "; ".join( - f"{err.get('remote')}: {err.get('reason')}" - + (f" ({err.get('error')})" if err.get("error") else "") - for err in pull_errors[:3] - ), - ) + patch_swt_bench_for_micromamba(swt_bench_dir) - patch_swt_bench_for_micromamba(swt_bench_dir) + # Get the directory and filename of the predictions file + predictions_path = Path(predictions_file).resolve() + predictions_filename = predictions_path.name # Copy predictions file to swt-bench directory swt_predictions_file = swt_bench_dir / predictions_filename diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index 76f7d6fc..b76ac5ec 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -2,18 +2,15 @@ import json import logging -import os import subprocess import sys from pathlib import Path from typing import Iterable -from benchmarks.utils.image_utils import image_exists from openhands.sdk import 
get_logger logger = get_logger(__name__) -DEFAULT_EVAL_IMAGE_PREFIX = "ghcr.io/openhands/swtbench-eval" def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: @@ -129,100 +126,6 @@ def format_images_plain(images: Iterable[str]) -> str: return "\n".join(sorted(images)) -def _run_docker(cmd: list[str]) -> tuple[bool, str]: - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - return False, (result.stderr or result.stdout or "").strip() - return True, (result.stdout or "").strip() - - -def pull_prebaked_eval_images( - predictions_file: Path, - dataset: str, - split: str, - *, - image_prefix: str | None = None, - gh_username: str | None = None, - gh_pat: str | None = None, -) -> tuple[bool, dict]: - """ - Attempt to pull prebaked SWT-bench eval base/env images from a registry. - - Returns (all_available, details_dict). - """ - prefix = ( - image_prefix - or os.getenv("SWT_BENCH_EVAL_IMAGE_PREFIX") - or DEFAULT_EVAL_IMAGE_PREFIX - ).rstrip("/") - details: dict = { - "prefix": prefix, - "dataset": dataset, - "split": split, - } - - if not prefix: - details["error"] = "empty_prefix" - return False, details - - try: - base_images, env_images = compute_required_images( - predictions_file, dataset, split - ) - except Exception as exc: # pragma: no cover - network/FS issues - details["error"] = f"compute_failed: {exc}" - return False, details - - required = sorted(base_images | env_images) - details["required_count"] = len(required) - if not required: - details["error"] = "no_required_images" - return False, details - - gh_user = gh_username or os.getenv("GHCR_USERNAME") or os.getenv("GITHUB_ACTOR") - gh_token = gh_pat or os.getenv("GHCR_PAT") - - missing: list[dict] = [] - pulled: list[str] = [] - pull_errors: list[dict] = [] - - for tag in required: - remote_tag = f"{prefix}/{tag}" - exists = image_exists(remote_tag, gh_username=gh_user, gh_pat=gh_token) - if not exists: - missing.append({"remote": remote_tag, "tag": tag, 
"reason": "not_found"}) - continue - - ok, err = _run_docker(["docker", "pull", remote_tag]) - if not ok: - pull_errors.append( - { - "remote": remote_tag, - "tag": tag, - "reason": "pull_failed", - "error": err, - } - ) - missing.append({"remote": remote_tag, "tag": tag, "reason": "pull_failed"}) - continue - - ok, err = _run_docker(["docker", "tag", remote_tag, tag]) - if not ok: - pull_errors.append( - {"remote": remote_tag, "tag": tag, "reason": "tag_failed", "error": err} - ) - missing.append({"remote": remote_tag, "tag": tag, "reason": "tag_failed"}) - continue - - pulled.append(tag) - - details["missing"] = missing - details["pulled"] = pulled - details["pull_errors"] = pull_errors - details["used_auth"] = bool(gh_user and gh_token) - return len(missing) == 0, details - - def main() -> None: import argparse From af6d559c6e25577a9c1a6960d7b553063db8cddf Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 16 Jan 2026 16:44:44 +0100 Subject: [PATCH 21/32] Remove unused micromamba patching from swtbench eval Co-authored-by: openhands --- benchmarks/swtbench/eval_infer.py | 53 ------------------------------- 1 file changed, 53 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index d98dbdde..27d68e3e 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -28,57 +28,6 @@ logger = get_logger(__name__) -def patch_swt_bench_for_micromamba(swt_bench_dir: Path) -> None: - """ - Ensure the cached swt-bench checkout uses micromamba for env creation. - Applies small, idempotent text replacements to the upstream sources. 
- """ - solver_timeout_s = 600 - dockerfiles_path = swt_bench_dir / "src" / "dockerfiles.py" - exec_spec_path = swt_bench_dir / "src" / "exec_spec.py" - - if not dockerfiles_path.exists() or not exec_spec_path.exists(): - logger.warning( - "swt-bench sources missing expected files; skipping micromamba patch " - f"(dockerfiles: {dockerfiles_path.exists()}, exec_spec: {exec_spec_path.exists()})" - ) - return - - dockerfiles_text = dockerfiles_path.read_text() - dockerfiles_updated = dockerfiles_text.replace( - "RUN conda config --append channels conda-forge\n\nRUN adduser", - "RUN conda config --append channels conda-forge\n" - "# Use micromamba for faster solver performance during env builds\n" - "RUN conda install -n base -c conda-forge -y micromamba \\\n" - " && ln -s /opt/miniconda3/bin/micromamba /usr/local/bin/micromamba\n" - "ENV MAMBA_ROOT_PREFIX=/opt/miniconda3\n" - "ENV MAMBA_EXE=/opt/miniconda3/bin/micromamba\n\n" - "RUN adduser", - ) - - exec_spec_text = exec_spec_path.read_text() - replacements = { - "conda create -n ": f"timeout {solver_timeout_s}s micromamba create -n ", - "conda create -c conda-forge -n ": f"timeout {solver_timeout_s}s micromamba create -c conda-forge -n ", - "conda env create --file": f"timeout {solver_timeout_s}s micromamba env create --file", - "conda env update -f": f"timeout {solver_timeout_s}s micromamba env update -f", - "conda install python=": f"timeout {solver_timeout_s}s micromamba install python=", - } - for old, new in replacements.items(): - exec_spec_text = exec_spec_text.replace(old, new) - - if dockerfiles_text != dockerfiles_updated: - dockerfiles_path.write_text(dockerfiles_updated) - logger.info("Patched swt-bench Dockerfile template to install micromamba.") - if exec_spec_path.read_text() != exec_spec_text: - exec_spec_path.write_text(exec_spec_text) - logger.info( - "Patched swt-bench exec_spec to create/update envs with micromamba " - "and a %ss timeout on solver calls.", - solver_timeout_s, - ) - - def 
_load_prediction_instance_ids(predictions_file: Path) -> list[str]: instance_ids: list[str] = [] seen = set() @@ -248,8 +197,6 @@ def run_swtbench_evaluation( try: swt_bench_dir = ensure_swt_bench_repo() - patch_swt_bench_for_micromamba(swt_bench_dir) - # Get the directory and filename of the predictions file predictions_path = Path(predictions_file).resolve() predictions_filename = predictions_path.name From 1531ce5d90dc5733ef68bd27555d4ab0f3edf5e5 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 16 Jan 2026 17:08:59 +0100 Subject: [PATCH 22/32] Remove unused eval arch override path for swtbench prebaked images Co-authored-by: openhands --- .github/workflows/build-swtbench-images.yml | 12 ------------ benchmarks/swtbench/build_eval_env_images.py | 13 +------------ 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index c375f519..f48587ae 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -44,14 +44,6 @@ on: required: false default: 'ghcr.io/openhands/swtbench-eval' type: string - eval-arch: - description: 'Architecture for prebaked eval images' - required: false - default: 'x86_64' - type: choice - options: - - x86_64 - - arm64 max-retries: description: 'Retries per batch for eval env builds' required: false @@ -202,7 +194,6 @@ jobs: N_LIMIT="${{ inputs.n-limit || '0' }}" INSTANCE_IDS="${{ inputs.instance-ids }}" IMAGE_PREFIX="${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}" - EVAL_ARCH="${{ inputs.eval-arch || 'x86_64' }}" MAX_WORKERS="${{ inputs.max-workers || '4' }}" BUILD_MODE="${{ inputs.build-mode || 'cli' }}" MAX_RETRIES="${{ inputs.max-retries || '2' }}" @@ -234,9 +225,6 @@ jobs: else ARGS+=(--eval-limit "${N_LIMIT}") fi - if [ -n "${EVAL_ARCH}" ]; then - ARGS+=(--arch "${EVAL_ARCH}") - fi echo "Running prebaked eval image build: uv run swtbench-build-eval-images 
${ARGS[*]}" DOCKER_BUILDKIT=1 \ diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index b147c112..079ad66c 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -280,12 +280,6 @@ def main() -> None: default="ghcr.io/openhands/swtbench-eval", help="Registry prefix for pushed images", ) - parser.add_argument( - "--arch", - choices=["x86_64", "arm64", ""], - default="", - help="Force architecture for built images (defaults to host arch)", - ) parser.add_argument( "--max-workers", type=int, @@ -337,11 +331,6 @@ def main() -> None: exec_specs = load_exec_specs( swt_bench_dir, args.dataset, args.split, target_ids, filter_swt=True ) - if args.arch: - for spec in exec_specs: - spec.arch = args.arch - logger.info("Overrode ExecSpec architecture to %s", args.arch) - build_env_images( exec_specs, max_workers=args.max_workers, @@ -362,7 +351,7 @@ def main() -> None: "base_images": sorted(base_images), "env_images": sorted(env_images), "image_prefix": args.image_prefix, - "arch": args.arch or "host", + "arch": "host", } print(json.dumps(manifest, indent=2)) From b890251275f60c5b8b2b070342837a1d86c411c9 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 12:41:28 +0100 Subject: [PATCH 23/32] Add prebaked image pull fallback and force conda solver --- benchmarks/swtbench/eval_infer.py | 97 ++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 27d68e3e..2b8684a1 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,10 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.image_utils import ensure_swt_bench_repo +from benchmarks.swtbench.image_utils import ( + compute_required_images, + ensure_swt_bench_repo, +) from benchmarks.utils.laminar import LaminarService from 
benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -27,6 +30,8 @@ logger = get_logger(__name__) +PREBAKED_REGISTRY = "ghcr.io/openhands/swtbench-eval" + def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: instance_ids: list[str] = [] @@ -59,6 +64,65 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: return instance_ids +def try_pull_prebaked_images( + predictions_file: Path, + dataset: str, + split: str = "test", + registry: str = PREBAKED_REGISTRY, + *, + filter_swt: bool = True, + is_swt: bool = True, +) -> None: + """ + Best-effort pull of prebaked base/env images; no-op on failure. + """ + try: + base_images, env_images = compute_required_images( + predictions_file, + dataset, + split, + filter_swt=filter_swt, + is_swt=is_swt, + ) + except Exception as exc: # pragma: no cover - defensive + logger.warning("Skipping prebaked image pull (compute failed): %s", exc) + return + + tags = sorted(base_images | env_images) + if not tags: + logger.info("No prebaked images to pull (empty tag set)") + return + + registry = registry.rstrip("/") + for tag in tags: + remote = f"{registry}/{tag}" + logger.info("Attempting to pull prebaked image %s", remote) + try: + pull = subprocess.run( + ["docker", "pull", remote], + capture_output=True, + text=True, + ) + except FileNotFoundError: + logger.warning("Docker not available; skipping prebaked image pull") + return + + if pull.returncode != 0: + logger.warning("Failed to pull %s: %s", remote, pull.stderr.strip()) + continue + + # Tag the remote image with the local name expected by the harness. 
+ tag_res = subprocess.run( + ["docker", "tag", remote, tag], + capture_output=True, + text=True, + ) + if tag_res.returncode != 0: + logger.warning("Failed to tag %s as %s: %s", remote, tag, tag_res.stderr) + else: + logger.info("Pulled and tagged %s -> %s", remote, tag) + + def update_report_with_submitted_instances( report_path: Path, predictions_path: Path ) -> None: @@ -228,6 +292,8 @@ def run_swtbench_evaluation( # Set up environment with PYTHONPATH to include swt-bench directory env = os.environ.copy() env["PYTHONPATH"] = str(swt_bench_dir) + # Force classic conda solver (avoid libmamba plugin issues) + env.setdefault("CONDA_SOLVER", "classic") cmd = [ python_executable, @@ -302,6 +368,12 @@ def main() -> None: "(default: eth-sri/SWT-bench_Verified_bm25_27k_zsp)", ) + parser.add_argument( + "--dataset-split", + default="test", + help="Dataset split to use when computing prebaked images (default: test)", + ) + parser.add_argument( "--output-file", help="Output file for SWT-Bench format " @@ -326,6 +398,19 @@ def main() -> None: help="Number of workers to use when evaluating", ) + parser.add_argument( + "--no-prebaked-pull", + action="store_true", + help="Skip pulling prebaked GHCR SWT-Bench images before evaluation", + ) + + parser.add_argument( + "--prebaked-registry", + default=PREBAKED_REGISTRY, + help="Registry prefix for prebaked SWT-Bench images " + f"(default: {PREBAKED_REGISTRY})", + ) + args = parser.parse_args() # Validate input file @@ -346,12 +431,22 @@ def main() -> None: logger.info(f"Input file: {input_file}") logger.info(f"Output file: {output_file}") logger.info(f"Dataset: {args.dataset}") + logger.info(f"Dataset split: {args.dataset_split}") logger.info(f"Model name: {args.model_name}") try: # Convert format convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) + if not args.no_prebaked_pull: + try_pull_prebaked_images( + output_file, + args.dataset, + split=args.dataset_split, + registry=args.prebaked_registry, + 
is_swt=True, + ) + if not args.skip_evaluation: eval_phase_start = monotonic() # Run evaluation From 9daea9f23fffb6cca3560c19ceae52d75489b112 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 12:42:41 +0100 Subject: [PATCH 24/32] Simplify prebaked toggle via env vars --- benchmarks/swtbench/eval_infer.py | 37 ++++++++++++------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 2b8684a1..9fec69c6 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -368,12 +368,6 @@ def main() -> None: "(default: eth-sri/SWT-bench_Verified_bm25_27k_zsp)", ) - parser.add_argument( - "--dataset-split", - default="test", - help="Dataset split to use when computing prebaked images (default: test)", - ) - parser.add_argument( "--output-file", help="Output file for SWT-Bench format " @@ -398,19 +392,6 @@ def main() -> None: help="Number of workers to use when evaluating", ) - parser.add_argument( - "--no-prebaked-pull", - action="store_true", - help="Skip pulling prebaked GHCR SWT-Bench images before evaluation", - ) - - parser.add_argument( - "--prebaked-registry", - default=PREBAKED_REGISTRY, - help="Registry prefix for prebaked SWT-Bench images " - f"(default: {PREBAKED_REGISTRY})", - ) - args = parser.parse_args() # Validate input file @@ -431,21 +412,31 @@ def main() -> None: logger.info(f"Input file: {input_file}") logger.info(f"Output file: {output_file}") logger.info(f"Dataset: {args.dataset}") - logger.info(f"Dataset split: {args.dataset_split}") logger.info(f"Model name: {args.model_name}") try: # Convert format convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) - if not args.no_prebaked_pull: + # Default: attempt to use prebaked images; allow opting out via env. 
+ use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( + "1", + "true", + "yes", + ) + prebaked_registry = os.getenv("SWTBENCH_PREBAKED_REGISTRY", PREBAKED_REGISTRY) + prebaked_split = os.getenv("SWTBENCH_DATASET_SPLIT", "test") + + if use_prebaked: try_pull_prebaked_images( output_file, args.dataset, - split=args.dataset_split, - registry=args.prebaked_registry, + split=prebaked_split, + registry=prebaked_registry, is_swt=True, ) + else: + logger.info("SWTBENCH_FORCE_CONDA set; skipping prebaked image pull") if not args.skip_evaluation: eval_phase_start = monotonic() From 9d0a410cda1dc9ab75a670ffa7381d619a157a9d Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 15:59:41 +0100 Subject: [PATCH 25/32] Add legacy opt-out path for swtbench prebaked pull --- benchmarks/swtbench/eval_infer.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 9fec69c6..aab7c617 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -242,6 +242,7 @@ def run_swtbench_evaluation( predictions_file: str, dataset: str = "eth-sri/SWT-bench_Verified_bm25_27k_zsp", workers: str = "12", + use_legacy: bool = False, ) -> None: """ Run SWT-Bench evaluation on the predictions file. 
@@ -256,7 +257,10 @@ def run_swtbench_evaluation( dataset: SWT-Bench dataset to evaluate against workers: Number of workers to use for evaluation """ - logger.info(f"Running SWT-Bench evaluation on {predictions_file}") + mode = "legacy-conda" if use_legacy else "prebaked-images" + logger.info( + "Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode + ) try: swt_bench_dir = ensure_swt_bench_repo() @@ -292,8 +296,6 @@ def run_swtbench_evaluation( # Set up environment with PYTHONPATH to include swt-bench directory env = os.environ.copy() env["PYTHONPATH"] = str(swt_bench_dir) - # Force classic conda solver (avoid libmamba plugin issues) - env.setdefault("CONDA_SOLVER", "classic") cmd = [ python_executable, @@ -419,11 +421,12 @@ def main() -> None: convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) # Default: attempt to use prebaked images; allow opting out via env. - use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( + force_conda = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ( "1", "true", "yes", ) + use_prebaked = not force_conda prebaked_registry = os.getenv("SWTBENCH_PREBAKED_REGISTRY", PREBAKED_REGISTRY) prebaked_split = os.getenv("SWTBENCH_DATASET_SPLIT", "test") @@ -436,12 +439,20 @@ def main() -> None: is_swt=True, ) else: - logger.info("SWTBENCH_FORCE_CONDA set; skipping prebaked image pull") + logger.info( + "SWTBENCH_FORCE_CONDA set; skipping prebaked image pull " + "and using legacy (pre-mamba) evaluation flow" + ) if not args.skip_evaluation: eval_phase_start = monotonic() # Run evaluation - run_swtbench_evaluation(str(output_file), args.dataset, args.workers) + run_swtbench_evaluation( + str(output_file), + args.dataset, + args.workers, + use_legacy=force_conda, + ) eval_phase_end = monotonic() cleanup_phase_start = monotonic() From b7c608e0211209dc12e210f771ccaf52196aa80c Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:01:48 +0100 Subject: [PATCH 26/32] 
Simplify prebaked toggle env handling --- benchmarks/swtbench/eval_infer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index aab7c617..84b00487 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -420,13 +420,12 @@ def main() -> None: # Convert format convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) - # Default: attempt to use prebaked images; allow opting out via env. - force_conda = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ( + # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow. + use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( "1", "true", "yes", ) - use_prebaked = not force_conda prebaked_registry = os.getenv("SWTBENCH_PREBAKED_REGISTRY", PREBAKED_REGISTRY) prebaked_split = os.getenv("SWTBENCH_DATASET_SPLIT", "test") @@ -451,7 +450,7 @@ def main() -> None: str(output_file), args.dataset, args.workers, - use_legacy=force_conda, + use_legacy=not use_prebaked, ) eval_phase_end = monotonic() From 5167fbdc55ab8d46809e1070e023915c0b2cdd8d Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:05:20 +0100 Subject: [PATCH 27/32] Hardcode prebaked params and simplify toggle --- benchmarks/swtbench/eval_infer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 84b00487..99ee4e98 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -421,13 +421,9 @@ def main() -> None: convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow. 
- use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( - "1", - "true", - "yes", - ) - prebaked_registry = os.getenv("SWTBENCH_PREBAKED_REGISTRY", PREBAKED_REGISTRY) - prebaked_split = os.getenv("SWTBENCH_DATASET_SPLIT", "test") + use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ("1", "true", "yes") + prebaked_registry = PREBAKED_REGISTRY + prebaked_split = "test" if use_prebaked: try_pull_prebaked_images( From 1a5a7da8ec9523711249f4b22f449b1c78bbb017 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:08:37 +0100 Subject: [PATCH 28/32] Read legacy toggle from env in eval runner --- benchmarks/swtbench/eval_infer.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 99ee4e98..04fcf541 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -242,7 +242,6 @@ def run_swtbench_evaluation( predictions_file: str, dataset: str = "eth-sri/SWT-bench_Verified_bm25_27k_zsp", workers: str = "12", - use_legacy: bool = False, ) -> None: """ Run SWT-Bench evaluation on the predictions file. 
@@ -257,6 +256,7 @@ def run_swtbench_evaluation( dataset: SWT-Bench dataset to evaluate against workers: Number of workers to use for evaluation """ + use_legacy = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ("1", "true", "yes") mode = "legacy-conda" if use_legacy else "prebaked-images" logger.info( "Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode @@ -442,12 +442,7 @@ def main() -> None: if not args.skip_evaluation: eval_phase_start = monotonic() # Run evaluation - run_swtbench_evaluation( - str(output_file), - args.dataset, - args.workers, - use_legacy=not use_prebaked, - ) + run_swtbench_evaluation(str(output_file), args.dataset, args.workers) eval_phase_end = monotonic() cleanup_phase_start = monotonic() From 1e6f82fad60e6111c65671d51df021b7342680a3 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:25:55 +0100 Subject: [PATCH 29/32] Default SWT image computation to SWT dataset --- benchmarks/swtbench/image_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index b76ac5ec..933d78ed 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -71,7 +71,6 @@ def compute_required_images( split: str, *, filter_swt: bool = True, - is_swt: bool = False, ) -> tuple[set[str], set[str]]: """ Compute the base/env image tags required to evaluate the given predictions file. 
@@ -91,7 +90,7 @@ def compute_required_images( from src.exec_spec import make_exec_spec # type: ignore[import-not-found] dataset_entries = load_swebench_dataset( - name=dataset, split=split, is_swt=is_swt, filter_swt=filter_swt + name=dataset, split=split, is_swt=True, filter_swt=filter_swt ) entries_by_id = {entry["instance_id"]: entry for entry in dataset_entries} From 8974cdfb1cfc97da32c0d38ee35b3488038596a6 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:28:48 +0100 Subject: [PATCH 30/32] Hide SWT filter flag and hardcode SWT dataset mode --- benchmarks/swtbench/eval_infer.py | 6 ------ benchmarks/swtbench/image_utils.py | 10 +--------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 04fcf541..bdd67ac4 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -69,9 +69,6 @@ def try_pull_prebaked_images( dataset: str, split: str = "test", registry: str = PREBAKED_REGISTRY, - *, - filter_swt: bool = True, - is_swt: bool = True, ) -> None: """ Best-effort pull of prebaked base/env images; no-op on failure. 
@@ -81,8 +78,6 @@ def try_pull_prebaked_images( predictions_file, dataset, split, - filter_swt=filter_swt, - is_swt=is_swt, ) except Exception as exc: # pragma: no cover - defensive logger.warning("Skipping prebaked image pull (compute failed): %s", exc) @@ -431,7 +426,6 @@ def main() -> None: args.dataset, split=prebaked_split, registry=prebaked_registry, - is_swt=True, ) else: logger.info( diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index 933d78ed..e7aae1f4 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -69,8 +69,6 @@ def compute_required_images( output_jsonl: Path, dataset: str, split: str, - *, - filter_swt: bool = True, ) -> tuple[set[str], set[str]]: """ Compute the base/env image tags required to evaluate the given predictions file. @@ -90,7 +88,7 @@ def compute_required_images( from src.exec_spec import make_exec_spec # type: ignore[import-not-found] dataset_entries = load_swebench_dataset( - name=dataset, split=split, is_swt=True, filter_swt=filter_swt + name=dataset, split=split, is_swt=True, filter_swt=True ) entries_by_id = {entry["instance_id"]: entry for entry in dataset_entries} @@ -134,11 +132,6 @@ def main() -> None: parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl") parser.add_argument("--dataset", required=True, help="Dataset name") parser.add_argument("--split", default="test", help="Dataset split") - parser.add_argument( - "--no-filter-swt", - action="store_true", - help="Disable SWT filtering when loading the dataset", - ) parser.add_argument( "--format", choices=["plain", "json"], @@ -151,7 +144,6 @@ def main() -> None: args.output_jsonl, args.dataset, args.split, - filter_swt=not args.no_filter_swt, ) payload = { "base": sorted(base_images), From bde150201a48461cfaf83c98c64bcbcd9a4ca639 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:32:00 +0100 Subject: [PATCH 31/32] Use prebaked pull defaults --- 
benchmarks/swtbench/eval_infer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index bdd67ac4..db364a85 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -417,15 +417,10 @@ def main() -> None: # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow. use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ("1", "true", "yes") - prebaked_registry = PREBAKED_REGISTRY - prebaked_split = "test" if use_prebaked: try_pull_prebaked_images( output_file, args.dataset, - split=prebaked_split, - registry=prebaked_registry, ) else: logger.info( From 968c4b38277dccff37411274f01ab08431d57bb2 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 17:43:25 +0100 Subject: [PATCH 32/32] Run pre-commit formatting --- benchmarks/swtbench/eval_infer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index db364a85..9dc7062b 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -253,9 +253,7 @@ def run_swtbench_evaluation( """ use_legacy = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ("1", "true", "yes") mode = "legacy-conda" if use_legacy else "prebaked-images" - logger.info( - "Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode - ) + logger.info("Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode) try: swt_bench_dir = ensure_swt_bench_repo() @@ -416,7 +414,11 @@ def main() -> None: convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow. 
- use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ("1", "true", "yes") + use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( + "1", + "true", + "yes", + ) if use_prebaked: try_pull_prebaked_images( output_file,