From b6ed901db87356b15cb73795181b856acd9ecd75 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 14 Jan 2026 23:33:43 +0100 Subject: [PATCH 01/32] Add prebaked SWT-bench eval image build support Co-authored-by: openhands --- .../workflows/build-swtbench-eval-images.yml | 116 +++++++++ benchmarks/swtbench/build_eval_env_images.py | 230 ++++++++++++++++++ benchmarks/swtbench/eval_infer.py | 27 +- benchmarks/swtbench/image_utils.py | 224 +++++++++++++++++ pyproject.toml | 2 + uv.lock | 77 +++--- 6 files changed, 618 insertions(+), 58 deletions(-) create mode 100644 .github/workflows/build-swtbench-eval-images.yml create mode 100644 benchmarks/swtbench/build_eval_env_images.py create mode 100644 benchmarks/swtbench/image_utils.py diff --git a/.github/workflows/build-swtbench-eval-images.yml b/.github/workflows/build-swtbench-eval-images.yml new file mode 100644 index 00000000..d669777d --- /dev/null +++ b/.github/workflows/build-swtbench-eval-images.yml @@ -0,0 +1,116 @@ +name: Build SWT-Bench Eval Images + +on: + workflow_dispatch: + inputs: + dataset: + description: "Dataset name" + required: true + default: "eth-sri/SWT-bench_Verified_bm25_27k_zsp" + type: string + split: + description: "Dataset split" + required: true + default: "test" + type: string + eval-limit: + description: "Number of instances to match inference sampling (0 to disable)" + required: false + default: "1" + type: string + instance-ids: + description: "Comma-separated instance IDs to force (overrides eval-limit)" + required: false + default: "" + type: string + image-prefix: + description: "Registry prefix for pushed images" + required: false + default: "ghcr.io/openhands/swtbench-eval" + type: string + max-workers: + description: "Maximum parallel env builds" + required: false + default: "4" + type: string + build-mode: + description: "swt-bench build mode" + required: false + default: "api" + type: choice + options: + - api + - cli + +concurrency: + group: build-swtbench-eval-${{ 
github.ref }} + cancel-in-progress: false + +jobs: + build: + runs-on: + labels: blacksmith-32vcpu-ubuntu-2204 + permissions: + contents: read + packages: write + actions: read + + steps: + - name: Checkout repository + uses: actions/checkout@v6 + with: + submodules: recursive + + - name: Set up Docker Buildx + uses: useblacksmith/setup-docker-builder@v1 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: true + + - name: Install dependencies + run: make build + + - name: Build and push prebaked eval env images + env: + DATASET: ${{ inputs.dataset }} + SPLIT: ${{ inputs.split }} + EVAL_LIMIT: ${{ inputs.eval-limit }} + INSTANCE_IDS: ${{ inputs.instance-ids }} + IMAGE_PREFIX: ${{ inputs.image-prefix }} + MAX_WORKERS: ${{ inputs.max-workers }} + BUILD_MODE: ${{ inputs.build-mode }} + run: | + set -euo pipefail + ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") + if [ -n "${INSTANCE_IDS}" ]; then + ARGS+=(--instance-ids "${INSTANCE_IDS}") + else + ARGS+=(--eval-limit "${EVAL_LIMIT}") + fi + uv run swtbench-build-eval-images "${ARGS[@]}" + + - name: Make image package public (best-effort) + if: github.repository_owner == 'OpenHands' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_PREFIX: ${{ inputs.image-prefix }} + run: | + set -euo pipefail + NAME=$(echo "${IMAGE_PREFIX}" | awk -F/ '{print $NF}') + if [ -z "$NAME" ]; then + echo "No package name derived from IMAGE_PREFIX=${IMAGE_PREFIX}, skipping visibility update" + exit 0 + fi + gh api -X PATCH \ + -H "Accept: application/vnd.github+json" \ + /user/packages/container/${NAME}/visibility \ + -f visibility=public || echo "Warning: failed to set package visibility" diff --git 
a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py new file mode 100644 index 00000000..4652a2bb --- /dev/null +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Iterable + +import docker + +from benchmarks.swtbench.image_utils import ( + ensure_swt_bench_repo, + patch_swt_bench_for_micromamba, +) +from benchmarks.utils.dataset import get_dataset +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def select_instance_ids( + dataset: str, + split: str, + eval_limit: int | None, + selected_instances_file: str | None, + instance_ids: list[str] | None, +) -> list[str]: + """ + Select the instance IDs that match the inference sampling logic. + """ + if instance_ids: + return instance_ids + + df = get_dataset( + dataset_name=dataset, + split=split, + eval_limit=eval_limit, + selected_instances_file=selected_instances_file, + ) + ids = df["instance_id"].tolist() + if not ids: + raise RuntimeError("No instances selected for image build.") + logger.info("Selected %s instances for image build", len(ids)) + return ids + + +def load_exec_specs( + swt_bench_dir: Path, + dataset: str, + split: str, + instance_ids: Iterable[str], + filter_swt: bool = True, +) -> list: + """ + Load ExecSpec objects for the provided instance IDs. 
+ """ + sys.path.insert(0, str(swt_bench_dir / "src")) + sys.path.insert(0, str(swt_bench_dir)) + from src.dataset import load_swebench_dataset # type: ignore[import-not-found] + from src.exec_spec import make_exec_spec # type: ignore[import-not-found] + + dataset_entries = load_swebench_dataset( + name=dataset, split=split, is_swt=False, filter_swt=filter_swt + ) + by_id = {entry["instance_id"]: entry for entry in dataset_entries} + + specs = [] + missing = [] + for iid in instance_ids: + if iid not in by_id: + missing.append(iid) + continue + specs.append(make_exec_spec(by_id[iid])) + + if missing: + logger.warning( + "Skipped %s missing instance_ids not found in dataset: %s", + len(missing), + ", ".join(missing[:5]), + ) + if not specs: + raise RuntimeError("No ExecSpecs available after filtering instance IDs.") + return specs + + +def build_env_images(exec_specs: list, max_workers: int, build_mode: str) -> None: + """ + Build base + environment images required by the provided ExecSpecs. + """ + from src.docker_build import ( # type: ignore[import-not-found] + build_base_images, + build_env_images as build_envs, + ) + + client = docker.from_env() + logger.info( + "Building %s base images and %s env images (mode=%s, workers=%s)", + len({spec.base_image_key for spec in exec_specs}), + len({spec.env_image_key for spec in exec_specs}), + build_mode, + max_workers, + ) + build_base_images(client, exec_specs, force_rebuild=False, build_mode=build_mode) + build_envs( + client, + exec_specs, + force_rebuild=False, + max_workers=max_workers, + build_mode=build_mode, + ) + + +def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: + """ + Tag the provided images with the registry prefix and push them. 
+ """ + pushed: list[str] = [] + prefix = prefix.rstrip("/") + for image in images: + target = f"{prefix}/{image}" + logger.info("Pushing %s -> %s", image, target) + subprocess_run(["docker", "tag", image, target]) + subprocess_run(["docker", "push", target]) + pushed.append(target) + return pushed + + +def subprocess_run(cmd: list[str]) -> None: + import subprocess + + result = subprocess.run(cmd, text=True, capture_output=True) + if result.returncode != 0: + logger.error("Command failed (%s): %s", " ".join(cmd), result.stderr) + raise RuntimeError(f"Command failed: {' '.join(cmd)}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Build and push prebaked SWT-bench eval env images." + ) + parser.add_argument("--dataset", required=True, help="Dataset name") + parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument( + "--eval-limit", + type=int, + default=1, + help="Match inference sampling by limiting instances (0 to disable)", + ) + parser.add_argument( + "--instance-ids", + default="", + help="Comma-separated instance IDs to force (overrides eval-limit)", + ) + parser.add_argument( + "--selected-instances-file", + default="", + help="Optional selected instances file used during inference", + ) + parser.add_argument( + "--image-prefix", + default="ghcr.io/openhands/swtbench-eval", + help="Registry prefix for pushed images", + ) + parser.add_argument( + "--max-workers", + type=int, + default=4, + help="Parallel builds for env images", + ) + parser.add_argument( + "--build-mode", + choices=["api", "cli"], + default="api", + help="swt-bench build mode", + ) + parser.add_argument( + "--no-push", + action="store_true", + help="Build images locally without pushing to the registry", + ) + args = parser.parse_args() + + instance_ids = ( + [iid for iid in args.instance_ids.split(",") if iid] + if args.instance_ids + else None + ) + eval_limit = None if instance_ids else args.eval_limit + selected_file = 
args.selected_instances_file or None + + swt_bench_dir = ensure_swt_bench_repo() + patch_swt_bench_for_micromamba(swt_bench_dir) + + target_ids = select_instance_ids( + dataset=args.dataset, + split=args.split, + eval_limit=eval_limit, + selected_instances_file=selected_file, + instance_ids=instance_ids, + ) + exec_specs = load_exec_specs( + swt_bench_dir, args.dataset, args.split, target_ids, filter_swt=True + ) + + build_env_images( + exec_specs, max_workers=args.max_workers, build_mode=args.build_mode + ) + + base_images = {spec.base_image_key for spec in exec_specs} + env_images = {spec.env_image_key for spec in exec_specs} + logger.info("Built images: %s base, %s env", len(base_images), len(env_images)) + + if not args.no_push: + pushed = tag_and_push(base_images | env_images, args.image_prefix) + logger.info("Pushed %s images", len(pushed)) + + manifest = { + "dataset": args.dataset, + "split": args.split, + "instances": target_ids, + "base_images": sorted(base_images), + "env_images": sorted(env_images), + "image_prefix": args.image_prefix, + } + print(json.dumps(manifest, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 94cb120a..4be058fb 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -17,6 +17,10 @@ import sys from pathlib import Path +from benchmarks.swtbench.image_utils import ( + ensure_swt_bench_repo, + patch_swt_bench_for_micromamba, +) from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -193,27 +197,8 @@ def run_swtbench_evaluation( logger.info(f"Running SWT-Bench evaluation on {predictions_file}") try: - # Use a global cache directory for SWT-Bench source - cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench" - swt_bench_dir = cache_dir / "swt-bench" - - # Clone SWT-Bench 
repository if it doesn't exist - if not swt_bench_dir.exists(): - logger.info("Setting up SWT-Bench source in global cache...") - cache_dir.mkdir(parents=True, exist_ok=True) - - logger.info("Cloning SWT-Bench repository...") - clone_cmd = [ - "git", - "clone", - "https://github.com/logic-star-ai/swt-bench.git", - str(swt_bench_dir), - ] - result = subprocess.run(clone_cmd, text=True) - if result.returncode != 0: - raise subprocess.CalledProcessError(result.returncode, clone_cmd) - - logger.info(f"SWT-Bench source installed at {swt_bench_dir}") + swt_bench_dir = ensure_swt_bench_repo() + patch_swt_bench_for_micromamba(swt_bench_dir) # Get the directory and filename of the predictions file predictions_path = Path(predictions_file).resolve() diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py new file mode 100644 index 00000000..f855b272 --- /dev/null +++ b/benchmarks/swtbench/image_utils.py @@ -0,0 +1,224 @@ +from __future__ import annotations + +import json +import logging +import subprocess +import sys +from pathlib import Path +from typing import Iterable + +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: + """ + Ensure the SWT-bench sources are available locally. + + Returns the repository path under the cache directory. 
+ """ + cache_dir = cache_dir or Path.home() / ".cache" / "openhands" / "swt-bench" + swt_bench_dir = cache_dir / "swt-bench" + + if swt_bench_dir.exists(): + return swt_bench_dir + + cache_dir.mkdir(parents=True, exist_ok=True) + logger.info("Cloning SWT-Bench repository into %s", swt_bench_dir) + result = subprocess.run( + [ + "git", + "clone", + "https://github.com/logic-star-ai/swt-bench.git", + str(swt_bench_dir), + ], + text=True, + capture_output=True, + ) + if result.returncode != 0: + logger.error("Failed to clone swt-bench: %s", result.stderr) + raise RuntimeError("Unable to clone swt-bench repository") + + return swt_bench_dir + + +def patch_swt_bench_for_micromamba( + swt_bench_dir: Path, solver_timeout_s: int = 300 +) -> None: + """ + Patch the cached swt-bench checkout to use micromamba with timeouts when + building environments. Idempotent: safe to call multiple times. + """ + dockerfiles_path = swt_bench_dir / "src" / "dockerfiles.py" + exec_spec_path = swt_bench_dir / "src" / "exec_spec.py" + + if not dockerfiles_path.exists() or not exec_spec_path.exists(): + logger.warning( + "swt-bench sources missing expected files; skipping micromamba patch " + "(dockerfiles: %s, exec_spec: %s)", + dockerfiles_path.exists(), + exec_spec_path.exists(), + ) + return + + dockerfiles_text = dockerfiles_path.read_text() + dockerfiles_updated = dockerfiles_text.replace( + "RUN conda config --append channels conda-forge\n\nRUN adduser", + "RUN conda config --append channels conda-forge\n" + "# Use micromamba for faster solver performance during env builds\n" + "RUN conda install -n base -c conda-forge -y micromamba \\\n" + " && ln -s /opt/miniconda3/bin/micromamba /usr/local/bin/micromamba\n" + "ENV MAMBA_ROOT_PREFIX=/opt/miniconda3\n" + "ENV MAMBA_EXE=/opt/miniconda3/bin/micromamba\n\n" + "RUN adduser", + ) + + exec_spec_text = exec_spec_path.read_text() + replacements = { + "conda create -n ": f"timeout {solver_timeout_s}s micromamba create -n ", + "conda create -c 
conda-forge -n ": f"timeout {solver_timeout_s}s micromamba create -c conda-forge -n ", + "conda env create --file": f"timeout {solver_timeout_s}s micromamba env create --file", + "conda env update -f": f"timeout {solver_timeout_s}s micromamba env update -f", + "conda install python=": f"timeout {solver_timeout_s}s micromamba install python=", + } + for old, new in replacements.items(): + exec_spec_text = exec_spec_text.replace(old, new) + + if dockerfiles_text != dockerfiles_updated: + dockerfiles_path.write_text(dockerfiles_updated) + logger.info("Patched swt-bench Dockerfile template to install micromamba.") + if exec_spec_path.read_text() != exec_spec_text: + exec_spec_path.write_text(exec_spec_text) + logger.info( + "Patched swt-bench exec_spec to use micromamba with a %ss timeout.", + solver_timeout_s, + ) + + +def _load_instance_ids(output_jsonl: Path) -> list[str]: + instance_ids: list[str] = [] + seen = set() + with output_jsonl.open("r", encoding="utf-8") as infile: + for line_num, line in enumerate(infile, 1): + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError: + logger.debug("Skipping invalid JSON on line %s", line_num) + continue + instance_id = data.get("instance_id") + if not instance_id or instance_id in seen: + continue + seen.add(instance_id) + instance_ids.append(instance_id) + return instance_ids + + +def compute_required_images( + output_jsonl: Path, + dataset: str, + split: str, + *, + filter_swt: bool = True, + is_swt: bool = False, +) -> tuple[set[str], set[str]]: + """ + Compute the base/env image tags required to evaluate the given predictions file. + + Returns (base_image_tags, env_image_tags). 
+ """ + instance_ids = _load_instance_ids(output_jsonl) + if not instance_ids: + raise ValueError(f"No instance_ids found in {output_jsonl}") + + swt_bench_dir = ensure_swt_bench_repo() + sys.path.insert(0, str(swt_bench_dir / "src")) + sys.path.insert(0, str(swt_bench_dir)) + + # Delay import until after sys.path manipulation so we use the cached checkout. + from src.dataset import load_swebench_dataset # type: ignore[import-not-found] + from src.exec_spec import make_exec_spec # type: ignore[import-not-found] + + dataset_entries = load_swebench_dataset( + name=dataset, split=split, is_swt=is_swt, filter_swt=filter_swt + ) + entries_by_id = {entry["instance_id"]: entry for entry in dataset_entries} + + missing = [iid for iid in instance_ids if iid not in entries_by_id] + if missing: + logger.warning( + "Predictions reference %s instance_ids not present in dataset: %s", + len(missing), + ", ".join(missing[:5]), + ) + + specs = [ + make_exec_spec(entries_by_id[iid]) + for iid in instance_ids + if iid in entries_by_id + ] + if not specs: + raise RuntimeError("No ExecSpecs produced; cannot compute required images.") + + base_images = {spec.base_image_key for spec in specs} + env_images = {spec.env_image_key for spec in specs} + logger.info( + "Computed %s base images and %s env images for %s instances", + len(base_images), + len(env_images), + len(specs), + ) + return base_images, env_images + + +def format_images_plain(images: Iterable[str]) -> str: + return "\n".join(sorted(images)) + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + description="List SWT-bench base/env images required for a predictions file." 
+ ) + parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl") + parser.add_argument("--dataset", required=True, help="Dataset name") + parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument( + "--no-filter-swt", + action="store_true", + help="Disable SWT filtering when loading the dataset", + ) + parser.add_argument( + "--format", + choices=["plain", "json"], + default="plain", + help="Output format", + ) + args = parser.parse_args() + + base_images, env_images = compute_required_images( + args.output_jsonl, + args.dataset, + args.split, + filter_swt=not args.no_filter_swt, + ) + payload = { + "base": sorted(base_images), + "env": sorted(env_images), + } + + if args.format == "json": + print(json.dumps(payload)) + else: + print(format_images_plain(payload["base"] + payload["env"])) + + +if __name__ == "__main__": + # Configure root logging for ad-hoc usage + logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") + main() diff --git a/pyproject.toml b/pyproject.toml index 0ecd0736..a3bf7b10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,8 @@ swebench-infer = "benchmarks.swebench.run_infer:main" swtbench-infer = "benchmarks.swtbench.run_infer:main" swebench-eval = "benchmarks.swebench.eval_infer:main" swtbench-eval = "benchmarks.swtbench.eval_infer:main" +swtbench-list-images = "benchmarks.swtbench.image_utils:main" +swtbench-build-eval-images = "benchmarks.swtbench.build_eval_env_images:main" gaia-infer = "benchmarks.gaia.run_infer:main" gaia-eval = "benchmarks.gaia.eval_infer:main" commit0-infer = "benchmarks.commit0.run_infer:main" diff --git a/uv.lock b/uv.lock index 9639461b..f8c7cb1e 100644 --- a/uv.lock +++ b/uv.lock @@ -947,11 +947,11 @@ wheels = [ [[package]] name = "filelock" -version = "3.19.1" +version = "3.20.3" source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/65/ce7f1b70157833bf3cb851b556a37d4547ceafc158aa9b34b36782f23696/filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1", size = 19485, upload-time = "2026-01-09T17:55:05.421Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, + { url = "https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1", size = 16701, upload-time = "2026-01-09T17:55:04.334Z" }, ] [[package]] @@ -1678,11 +1678,11 @@ wheels = [ [[package]] name = "libtmux" -version = "0.46.2" +version = "0.53.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9c/aa/7e1dcaa097156d6f3a7d8669be4389dced997feeb81744e3ff4681d65ee8/libtmux-0.46.2.tar.gz", hash = "sha256:9a398fec5d714129c8344555d466e1a903dfc0f741ba07aabe75a8ceb25c5dda", size = 346887, upload-time = "2025-05-26T19:40:04.096Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/28/e2b252817cb181aec2f42fe2d1d7fac5ec9c4d15bfb2b8ea4bd1179e4244/libtmux-0.53.0.tar.gz", hash = "sha256:1d19af4cea0c19543954d7e7317c7025c0739b029cccbe3b843212fae238f1bd", size = 405001, upload-time = "2025-12-14T11:59:11.337Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/d6/2f/9d207039fcfa00d3b30e4d765f062fbcc42c873c7518a8cfebb3eafd00e0/libtmux-0.46.2-py3-none-any.whl", hash = "sha256:6c32dbf22bde8e5e33b2714a4295f6e838dc640f337cd4c085a044f6828c7793", size = 60873, upload-time = "2025-05-26T19:40:02.284Z" }, + { url = "https://files.pythonhosted.org/packages/0e/d0/2e8bc5caa639ebb9f8801ba0be7070a28d48d8ed60e2a428d40f71fb88b8/libtmux-0.53.0-py3-none-any.whl", hash = "sha256:024b7ae6a12aae55358e8feb914c8632b3ab9bd61c0987c53559643c6a58ee4f", size = 77582, upload-time = "2025-12-14T11:59:09.739Z" }, ] [[package]] @@ -2269,7 +2269,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2405,11 +2405,12 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "deprecation" }, { name = "fastmcp" }, + { name = "filelock" }, { name = "httpx" }, { name = "litellm" }, { name = "lmnr" }, @@ -2430,10 +2431,11 @@ requires-dist = [ { name = "boto3", marker = "extra == 'boto3'", specifier = ">=1.35.0" }, { name = "deprecation", specifier = ">=2.1.0" }, { name = "fastmcp", specifier = ">=2.11.3" }, + { name = "filelock", specifier = ">=3.20.1" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "litellm", specifier = ">=1.80.10" }, { name = "lmnr", specifier = ">=0.7.24" }, - { name = "pydantic", specifier = ">=2.11.7" }, + { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "tenacity", specifier = ">=9.1.2" }, @@ -2443,7 +2445,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ 
{ name = "bashlex" }, @@ -2464,7 +2466,7 @@ requires-dist = [ { name = "browser-use", specifier = ">=0.8.0" }, { name = "cachetools" }, { name = "func-timeout", specifier = ">=4.3.5" }, - { name = "libtmux", specifier = ">=0.46.2" }, + { name = "libtmux", specifier = ">=0.53.0" }, { name = "openhands-sdk", editable = "vendor/software-agent-sdk/openhands-sdk" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "tom-swe", specifier = ">=1.0.3" }, @@ -2472,7 +2474,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" }, @@ -3138,21 +3140,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 
2340133, upload-time = "2025-11-04T13:40:46.66Z" }, - { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, - { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, - { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, - { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, - { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, - { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, - { url = 
"https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, - { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, - { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, - { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, - { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, - { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, 
upload-time = "2025-11-04T13:40:42.809Z" }, - { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, + { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, @@ -3161,16 +3149,31 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, - { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:05.804Z" }, - { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, - { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = 
"2025-11-04T13:41:09.827Z" }, - { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:12.379Z" }, - { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:14.627Z" }, - { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:16.868Z" }, - { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, - { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, - { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, - { url = 
"https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, + { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, + { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, + { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = 
"2025-11-04T13:41:14.627Z" }, + { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, + { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, + { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, + { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, + { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, + { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, + { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, + { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, + { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = 
"2025-11-04T13:41:42.323Z" }, + { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, + { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, + { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, + { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, + { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, + { url = 
"https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, + { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, From f00bc1eb0e1975a3f1fed34687b597b290fb8d60 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 14 Jan 2026 23:37:17 +0100 Subject: [PATCH 02/32] Add arch override for SWT-bench eval image builds Co-authored-by: openhands --- benchmarks/swtbench/build_eval_env_images.py | 11 +++++++++++ 1 
file changed, 11 insertions(+) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 4652a2bb..b6c07d0e 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -162,6 +162,12 @@ def main() -> None: default="ghcr.io/openhands/swtbench-eval", help="Registry prefix for pushed images", ) + parser.add_argument( + "--arch", + choices=["x86_64", "arm64", ""], + default="", + help="Force architecture for built images (defaults to host arch)", + ) parser.add_argument( "--max-workers", type=int, @@ -202,6 +208,10 @@ def main() -> None: exec_specs = load_exec_specs( swt_bench_dir, args.dataset, args.split, target_ids, filter_swt=True ) + if args.arch: + for spec in exec_specs: + spec.arch = args.arch + logger.info("Overrode ExecSpec architecture to %s", args.arch) build_env_images( exec_specs, max_workers=args.max_workers, build_mode=args.build_mode @@ -222,6 +232,7 @@ def main() -> None: "base_images": sorted(base_images), "env_images": sorted(env_images), "image_prefix": args.image_prefix, + "arch": args.arch or "host", } print(json.dumps(manifest, indent=2)) From ee1b5a683825bc07af369cb4bd699c4b7f387710 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 14 Jan 2026 23:39:00 +0100 Subject: [PATCH 03/32] Let swtbench workflow build prebaked eval env images Co-authored-by: openhands --- .github/workflows/build-swtbench-images.yml | 45 +++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index 1ca70c1b..e819178d 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -34,6 +34,24 @@ on: description: 'Software Agent SDK commit/ref to use' required: true type: string + build-eval-env: + description: 'Also build prebaked SWT-bench eval env images (default: false)' + required: false + default: 'false' + 
type: string + eval-image-prefix: + description: 'Registry prefix for prebaked eval images' + required: false + default: 'ghcr.io/openhands/swtbench-eval' + type: string + eval-arch: + description: 'Architecture for prebaked eval images' + required: false + default: 'x86_64' + type: choice + options: + - x86_64 + - arm64 concurrency: group: build-swt-bench-${{ github.ref }} @@ -158,6 +176,33 @@ jobs: DOCKER_BUILDKIT: 1 BUILDKIT_PROGRESS: plain + - name: Build prebaked eval env images + if: ${{ inputs.build-eval-env == 'true' }} + run: | + set -euo pipefail + + DATASET="${{ inputs.dataset || 'princeton-nlp/SWE-bench_Verified' }}" + SPLIT="${{ inputs.split || 'test' }}" + N_LIMIT="${{ inputs.n-limit || '0' }}" + INSTANCE_IDS="${{ inputs.instance-ids }}" + IMAGE_PREFIX="${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}" + EVAL_ARCH="${{ inputs.eval-arch || 'x86_64' }}" + MAX_WORKERS="${{ inputs.max-workers || '4' }}" + BUILD_MODE="${{ inputs.build-mode || 'api' }}" + + ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") + if [ -n "${INSTANCE_IDS}" ]; then + ARGS+=(--instance-ids "${INSTANCE_IDS}") + else + ARGS+=(--eval-limit "${N_LIMIT}") + fi + if [ -n "${EVAL_ARCH}" ]; then + ARGS+=(--arch "${EVAL_ARCH}") + fi + + echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}" + uv run swtbench-build-eval-images "${ARGS[@]}" + - name: Archive build logs if: always() run: | From c0d1432620e2c388c0392f4a8d9e6fc8819ec2c1 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 14 Jan 2026 23:44:31 +0100 Subject: [PATCH 04/32] Expose prebaked eval images publicly in swtbench workflow Co-authored-by: openhands --- .github/workflows/build-swtbench-images.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index e819178d..7f887b2b 
100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -244,6 +244,23 @@ jobs: echo "**Successful:** $SUCCESS_COUNT ✅" >> "$GITHUB_STEP_SUMMARY" echo "**Failed:** $FAIL_COUNT ❌" >> "$GITHUB_STEP_SUMMARY" + - name: Make prebaked eval image package public (best-effort) + if: ${{ inputs.build-eval-env == 'true' }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + IMAGE_PREFIX: ${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }} + run: | + set -euo pipefail + NAME=$(echo "${IMAGE_PREFIX}" | awk -F/ '{print $NF}') + if [ -z "$NAME" ]; then + echo "No package name derived from IMAGE_PREFIX=${IMAGE_PREFIX}, skipping visibility update" + exit 0 + fi + gh api -X PATCH \ + -H "Accept: application/vnd.github+json" \ + /orgs/OpenHands/packages/container/${NAME}/visibility \ + -f visibility=public || echo "Warning: failed to set package visibility" + - name: Comment on tracker issue if: success() run: | From ce669ad0a5552a70a06707e644bd4864ef73dd5b Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 00:10:57 +0100 Subject: [PATCH 05/32] Make micromamba patch optional for eval image builds --- benchmarks/swtbench/build_eval_env_images.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index b6c07d0e..aa269b61 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -8,10 +8,7 @@ import docker -from benchmarks.swtbench.image_utils import ( - ensure_swt_bench_repo, - patch_swt_bench_for_micromamba, -) +from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset from openhands.sdk import get_logger @@ -185,6 +182,12 @@ def main() -> None: action="store_true", help="Build images locally without pushing to the registry", ) + parser.add_argument( + 
"--use-micromamba", + action="store_true", + help="Patch swt-bench to use micromamba when building images " + "(changes env hash; off by default)", + ) args = parser.parse_args() instance_ids = ( @@ -196,7 +199,10 @@ def main() -> None: selected_file = args.selected_instances_file or None swt_bench_dir = ensure_swt_bench_repo() - patch_swt_bench_for_micromamba(swt_bench_dir) + if args.use_micromamba: + from benchmarks.swtbench.image_utils import patch_swt_bench_for_micromamba + + patch_swt_bench_for_micromamba(swt_bench_dir) target_ids = select_instance_ids( dataset=args.dataset, From e523d95cc073a36749c1eef0fa11ca42bd68a7de Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 09:50:19 +0100 Subject: [PATCH 06/32] Drop micromamba fallback in eval image build --- benchmarks/swtbench/build_eval_env_images.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index aa269b61..58dc6e4e 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -182,12 +182,6 @@ def main() -> None: action="store_true", help="Build images locally without pushing to the registry", ) - parser.add_argument( - "--use-micromamba", - action="store_true", - help="Patch swt-bench to use micromamba when building images " - "(changes env hash; off by default)", - ) args = parser.parse_args() instance_ids = ( @@ -199,10 +193,6 @@ def main() -> None: selected_file = args.selected_instances_file or None swt_bench_dir = ensure_swt_bench_repo() - if args.use_micromamba: - from benchmarks.swtbench.image_utils import patch_swt_bench_for_micromamba - - patch_swt_bench_for_micromamba(swt_bench_dir) target_ids = select_instance_ids( dataset=args.dataset, From 655d2ce4d61fc20a82d8ec84eab087922747d3f8 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 09:53:55 +0100 Subject: [PATCH 07/32] Remove redundant eval-only 
workflow --- .../workflows/build-swtbench-eval-images.yml | 116 ------------------ 1 file changed, 116 deletions(-) delete mode 100644 .github/workflows/build-swtbench-eval-images.yml diff --git a/.github/workflows/build-swtbench-eval-images.yml b/.github/workflows/build-swtbench-eval-images.yml deleted file mode 100644 index d669777d..00000000 --- a/.github/workflows/build-swtbench-eval-images.yml +++ /dev/null @@ -1,116 +0,0 @@ -name: Build SWT-Bench Eval Images - -on: - workflow_dispatch: - inputs: - dataset: - description: "Dataset name" - required: true - default: "eth-sri/SWT-bench_Verified_bm25_27k_zsp" - type: string - split: - description: "Dataset split" - required: true - default: "test" - type: string - eval-limit: - description: "Number of instances to match inference sampling (0 to disable)" - required: false - default: "1" - type: string - instance-ids: - description: "Comma-separated instance IDs to force (overrides eval-limit)" - required: false - default: "" - type: string - image-prefix: - description: "Registry prefix for pushed images" - required: false - default: "ghcr.io/openhands/swtbench-eval" - type: string - max-workers: - description: "Maximum parallel env builds" - required: false - default: "4" - type: string - build-mode: - description: "swt-bench build mode" - required: false - default: "api" - type: choice - options: - - api - - cli - -concurrency: - group: build-swtbench-eval-${{ github.ref }} - cancel-in-progress: false - -jobs: - build: - runs-on: - labels: blacksmith-32vcpu-ubuntu-2204 - permissions: - contents: read - packages: write - actions: read - - steps: - - name: Checkout repository - uses: actions/checkout@v6 - with: - submodules: recursive - - - name: Set up Docker Buildx - uses: useblacksmith/setup-docker-builder@v1 - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Install uv - 
uses: astral-sh/setup-uv@v7 - with: - enable-cache: true - - - name: Install dependencies - run: make build - - - name: Build and push prebaked eval env images - env: - DATASET: ${{ inputs.dataset }} - SPLIT: ${{ inputs.split }} - EVAL_LIMIT: ${{ inputs.eval-limit }} - INSTANCE_IDS: ${{ inputs.instance-ids }} - IMAGE_PREFIX: ${{ inputs.image-prefix }} - MAX_WORKERS: ${{ inputs.max-workers }} - BUILD_MODE: ${{ inputs.build-mode }} - run: | - set -euo pipefail - ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") - if [ -n "${INSTANCE_IDS}" ]; then - ARGS+=(--instance-ids "${INSTANCE_IDS}") - else - ARGS+=(--eval-limit "${EVAL_LIMIT}") - fi - uv run swtbench-build-eval-images "${ARGS[@]}" - - - name: Make image package public (best-effort) - if: github.repository_owner == 'OpenHands' - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - IMAGE_PREFIX: ${{ inputs.image-prefix }} - run: | - set -euo pipefail - NAME=$(echo "${IMAGE_PREFIX}" | awk -F/ '{print $NF}') - if [ -z "$NAME" ]; then - echo "No package name derived from IMAGE_PREFIX=${IMAGE_PREFIX}, skipping visibility update" - exit 0 - fi - gh api -X PATCH \ - -H "Accept: application/vnd.github+json" \ - /user/packages/container/${NAME}/visibility \ - -f visibility=public || echo "Warning: failed to set package visibility" From cfab9a960bc9675209f1557afdcf74cd8d110f21 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 09:56:54 +0100 Subject: [PATCH 08/32] Add verbose diagnostics around eval image build step --- .github/workflows/build-swtbench-images.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index 7f887b2b..ca07d32a 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -181,6 +181,12 @@ jobs: run: | set -euo pipefail + echo 
"Starting prebaked eval env image build at $(date -u)" + echo "Runner: $(uname -a)" + df -h + docker system df || true + docker info || true + DATASET="${{ inputs.dataset || 'princeton-nlp/SWE-bench_Verified' }}" SPLIT="${{ inputs.split || 'test' }}" N_LIMIT="${{ inputs.n-limit || '0' }}" @@ -201,7 +207,11 @@ jobs: fi echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}" - uv run swtbench-build-eval-images "${ARGS[@]}" + PYTHONUNBUFFERED=1 uv run swtbench-build-eval-images "${ARGS[@]}" | tee build_eval_env.log + + echo "Completed prebaked eval env image build at $(date -u)" + docker ps -a || true + docker system df || true - name: Archive build logs if: always() From 154a1592ab47e793908fb3abfed6cce21e9dadce Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 10:34:08 +0100 Subject: [PATCH 09/32] Use buildx/cli path for eval images and add runner diagnostics --- .github/workflows/build-swtbench-images.yml | 28 +++++++++++++++++++- benchmarks/swtbench/build_eval_env_images.py | 2 +- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index ca07d32a..c1a524ab 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -194,7 +194,29 @@ jobs: IMAGE_PREFIX="${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}" EVAL_ARCH="${{ inputs.eval-arch || 'x86_64' }}" MAX_WORKERS="${{ inputs.max-workers || '4' }}" - BUILD_MODE="${{ inputs.build-mode || 'api' }}" + BUILD_MODE="${{ inputs.build-mode || 'cli' }}" + # Map to docker platform string + if [ "${EVAL_ARCH}" = "x86_64" ]; then + DOCKER_PLATFORM="linux/amd64" + else + DOCKER_PLATFORM="linux/${EVAL_ARCH}" + fi + + # Basic BuildKit disk guard similar to SWE-bench + if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then + LINE=$(tail -n1 /tmp/buildkit_df) + TOTAL=$(echo "$LINE" | awk '{print $2}') + 
USED=$(echo "$LINE" | awk '{print $3}') + FREE=$(echo "$LINE" | awk '{print $4}') + if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then + PCT=$(( 100 * USED / TOTAL )) + echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes" + else + echo "Warning: unable to parse df output for /var/lib/buildkit" + fi + else + echo "Warning: /var/lib/buildkit not found; skipping disk check" + fi ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") if [ -n "${INSTANCE_IDS}" ]; then @@ -207,6 +229,10 @@ jobs: fi echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}" + DOCKER_DEFAULT_PLATFORM="${DOCKER_PLATFORM}" \ + DOCKER_BUILDKIT=1 \ + BUILDKIT_PROGRESS=plain \ + BUILDKIT_RESET_ON_FAILURE=1 \ PYTHONUNBUFFERED=1 uv run swtbench-build-eval-images "${ARGS[@]}" | tee build_eval_env.log echo "Completed prebaked eval env image build at $(date -u)" diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 58dc6e4e..c7040e7b 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -174,7 +174,7 @@ def main() -> None: parser.add_argument( "--build-mode", choices=["api", "cli"], - default="api", + default="cli", help="swt-bench build mode", ) parser.add_argument( From 6380c419cdcfcfbde5521612256f4f9493cd0deb Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 10:43:35 +0100 Subject: [PATCH 10/32] Add batching/retries and buildx settings to SWT-bench eval builds --- .github/workflows/build-swtbench-images.yml | 18 ++++- benchmarks/swtbench/build_eval_env_images.py | 77 +++++++++++++++++--- 2 files changed, 85 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index c1a524ab..ec72a6bb 100644 --- a/.github/workflows/build-swtbench-images.yml +++ 
b/.github/workflows/build-swtbench-images.yml @@ -52,6 +52,16 @@ on: options: - x86_64 - arm64 + max-retries: + description: 'Retries per batch for eval env builds' + required: false + default: '2' + type: string + build-batch-size: + description: 'Env images per batch for eval env builds' + required: false + default: '10' + type: string concurrency: group: build-swt-bench-${{ github.ref }} @@ -195,6 +205,12 @@ jobs: EVAL_ARCH="${{ inputs.eval-arch || 'x86_64' }}" MAX_WORKERS="${{ inputs.max-workers || '4' }}" BUILD_MODE="${{ inputs.build-mode || 'cli' }}" + MAX_RETRIES="${{ inputs.max-retries || '2' }}" + BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '10' }}" + + echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV" + echo "MAX_RETRIES=${MAX_RETRIES}" >> "$GITHUB_ENV" + echo "BUILD_BATCH_SIZE=${BUILD_BATCH_SIZE}" >> "$GITHUB_ENV" # Map to docker platform string if [ "${EVAL_ARCH}" = "x86_64" ]; then DOCKER_PLATFORM="linux/amd64" @@ -218,7 +234,7 @@ jobs: echo "Warning: /var/lib/buildkit not found; skipping disk check" fi - ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}") + ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}" --max-retries "${MAX_RETRIES}" --build-batch-size "${BUILD_BATCH_SIZE}") if [ -n "${INSTANCE_IDS}" ]; then ARGS+=(--instance-ids "${INSTANCE_IDS}") else diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index c7040e7b..02ff07fb 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -4,7 +4,7 @@ import json import sys from pathlib import Path -from typing import Iterable +from typing import Iterable, Iterator, List, Sequence import docker @@ -81,11 +81,18 @@ def load_exec_specs( return specs -def build_env_images(exec_specs: list, max_workers: int, build_mode: str) 
-> None: +def build_env_images( + exec_specs: list, + max_workers: int, + build_mode: str, + max_retries: int, + batch_size: int, +) -> None: """ Build base + environment images required by the provided ExecSpecs. """ from src.docker_build import ( # type: ignore[import-not-found] + BuildImageError, build_base_images, build_env_images as build_envs, ) @@ -99,13 +106,49 @@ def build_env_images(exec_specs: list, max_workers: int, build_mode: str) -> Non max_workers, ) build_base_images(client, exec_specs, force_rebuild=False, build_mode=build_mode) - build_envs( - client, - exec_specs, - force_rebuild=False, - max_workers=max_workers, - build_mode=build_mode, + batches = list(chunked(exec_specs, max(1, batch_size))) + logger.info( + "Building env images in %s batches (batch_size=%s)", len(batches), batch_size ) + for idx, batch in enumerate(batches, start=1): + attempt = 0 + while True: + try: + logger.info( + "Batch %s/%s: building %s env images", idx, len(batches), len(batch) + ) + build_envs( + client, + batch, + force_rebuild=False, + max_workers=max_workers, + build_mode=build_mode, + ) + break + except BuildImageError as exc: + attempt += 1 + if attempt > max_retries: + logger.error( + "Batch %s/%s failed after %s attempts: %s", + idx, + len(batches), + max_retries, + exc, + ) + raise + logger.warning( + "Batch %s/%s failed (attempt %s/%s): %s; retrying", + idx, + len(batches), + attempt, + max_retries, + exc, + ) + + +def chunked(seq: Sequence, size: int) -> Iterator[List]: + for i in range(0, len(seq), size): + yield list(seq[i : i + size]) def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: @@ -171,6 +214,18 @@ def main() -> None: default=4, help="Parallel builds for env images", ) + parser.add_argument( + "--max-retries", + type=int, + default=2, + help="Retries per batch for env image builds", + ) + parser.add_argument( + "--build-batch-size", + type=int, + default=10, + help="Number of env images to build per batch", + ) 
parser.add_argument( "--build-mode", choices=["api", "cli"], @@ -210,7 +265,11 @@ def main() -> None: logger.info("Overrode ExecSpec architecture to %s", args.arch) build_env_images( - exec_specs, max_workers=args.max_workers, build_mode=args.build_mode + exec_specs, + max_workers=args.max_workers, + build_mode=args.build_mode, + max_retries=args.max_retries, + batch_size=args.build_batch_size, ) base_images = {spec.base_image_key for spec in exec_specs} From 328ef48eb8f4c747f50f1add57fc08a025d7b055 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 10:50:41 +0100 Subject: [PATCH 11/32] Remove local platform override; keep buildx batching/retries --- .github/workflows/build-swtbench-images.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index ec72a6bb..c375f519 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -211,12 +211,6 @@ jobs: echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV" echo "MAX_RETRIES=${MAX_RETRIES}" >> "$GITHUB_ENV" echo "BUILD_BATCH_SIZE=${BUILD_BATCH_SIZE}" >> "$GITHUB_ENV" - # Map to docker platform string - if [ "${EVAL_ARCH}" = "x86_64" ]; then - DOCKER_PLATFORM="linux/amd64" - else - DOCKER_PLATFORM="linux/${EVAL_ARCH}" - fi # Basic BuildKit disk guard similar to SWE-bench if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then @@ -245,7 +239,6 @@ jobs: fi echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}" - DOCKER_DEFAULT_PLATFORM="${DOCKER_PLATFORM}" \ DOCKER_BUILDKIT=1 \ BUILDKIT_PROGRESS=plain \ BUILDKIT_RESET_ON_FAILURE=1 \ From 510277069e7c30213d8add485fcf63bb54fef4d8 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 13:48:57 +0100 Subject: [PATCH 12/32] Skip rebuilding existing swtbench images --- benchmarks/swtbench/build_eval_env_images.py | 59 ++++++++++++++++---- 1 file changed, 49 insertions(+), 
10 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 02ff07fb..aa96740b 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -7,6 +7,7 @@ from typing import Iterable, Iterator, List, Sequence import docker +from docker.errors import ImageNotFound from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset @@ -98,17 +99,47 @@ def build_env_images( ) client = docker.from_env() + total_base = len({spec.base_image_key for spec in exec_specs}) + total_env = len({spec.env_image_key for spec in exec_specs}) + + base_missing: dict[str, bool] = {} + for spec in exec_specs: + key = spec.base_image_key + if key not in base_missing: + base_missing[key] = not image_exists(client, key) + missing_base_specs = [spec for spec in exec_specs if base_missing[spec.base_image_key]] + skipped_base = total_base - len({spec.base_image_key for spec in missing_base_specs}) + + if missing_base_specs: + logger.info( + "Building %s/%s base images (skipping %s already present)", + len({spec.base_image_key for spec in missing_base_specs}), + total_base, + skipped_base, + ) + build_base_images( + client, missing_base_specs, force_rebuild=False, build_mode=build_mode + ) + else: + logger.info("All %s base images already exist; skipping base builds", total_base) + + env_missing: dict[str, bool] = {} + for spec in exec_specs: + key = spec.env_image_key + if key not in env_missing: + env_missing[key] = not image_exists(client, key) + missing_env_specs = [spec for spec in exec_specs if env_missing[spec.env_image_key]] + if not missing_env_specs: + logger.info("All %s env images already exist; skipping env builds", total_env) + return + + batches = list(chunked(missing_env_specs, max(1, batch_size))) logger.info( - "Building %s base images and %s env images (mode=%s, workers=%s)", - len({spec.base_image_key for spec in 
exec_specs}), - len({spec.env_image_key for spec in exec_specs}), - build_mode, - max_workers, - ) - build_base_images(client, exec_specs, force_rebuild=False, build_mode=build_mode) - batches = list(chunked(exec_specs, max(1, batch_size))) - logger.info( - "Building env images in %s batches (batch_size=%s)", len(batches), batch_size + "Building %s/%s env images in %s batches (batch_size=%s)", + len({spec.env_image_key for spec in missing_env_specs}), + total_env, + len(batches), + batch_size, ) for idx, batch in enumerate(batches, start=1): attempt = 0 @@ -151,6 +182,14 @@ def chunked(seq: Sequence, size: int) -> Iterator[List]: yield list(seq[i : i + size]) +def image_exists(client: docker.DockerClient, tag: str) -> bool: + try: + client.images.get(tag) + return True + except ImageNotFound: + return False + + def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: """ Tag the provided images with the registry prefix and push them. From 9610efd369abfd9ca27ad686a992f921a25e5080 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 14:12:33 +0100 Subject: [PATCH 13/32] Reuse remote SWT-bench images and fix dataset cwd --- benchmarks/swtbench/build_eval_env_images.py | 108 ++++++++++++++----- 1 file changed, 83 insertions(+), 25 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index aa96740b..a8cc0e21 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -2,6 +2,7 @@ import argparse import json +import os import sys from pathlib import Path from typing import Iterable, Iterator, List, Sequence @@ -11,6 +12,7 @@ from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset +from benchmarks.utils.image_utils import image_exists as remote_image_exists from openhands.sdk import get_logger @@ -58,9 +60,14 @@ def load_exec_specs( from src.dataset import 
load_swebench_dataset # type: ignore[import-not-found] from src.exec_spec import make_exec_spec # type: ignore[import-not-found] - dataset_entries = load_swebench_dataset( - name=dataset, split=split, is_swt=False, filter_swt=filter_swt - ) + cwd = os.getcwd() + try: + os.chdir(swt_bench_dir) + dataset_entries = load_swebench_dataset( + name=dataset, split=split, is_swt=False, filter_swt=filter_swt + ) + finally: + os.chdir(cwd) by_id = {entry["instance_id"]: entry for entry in dataset_entries} specs = [] @@ -88,7 +95,8 @@ def build_env_images( build_mode: str, max_retries: int, batch_size: int, -) -> None: + image_prefix: str | None, +) -> tuple[set[str], set[str]]: """ Build base + environment images required by the provided ExecSpecs. """ @@ -101,14 +109,50 @@ def build_env_images( client = docker.from_env() total_base = len({spec.base_image_key for spec in exec_specs}) total_env = len({spec.env_image_key for spec in exec_specs}) + remote_prefix = image_prefix.rstrip("/") if image_prefix else None + + base_to_push: set[str] = set() + base_to_build_keys: set[str] = set() + + def prefixed(tag: str) -> str | None: + return f"{remote_prefix}/{tag}" if remote_prefix else None + + def ensure_local(tag: str) -> bool: + try: + client.images.get(tag) + return True + except ImageNotFound: + return False - base_missing: dict[str, bool] = {} + base_spec_by_key = {} for spec in exec_specs: key = spec.base_image_key - if key not in base_missing: - base_missing[key] = not image_exists(client, key) - missing_base_specs = [spec for spec in exec_specs if base_missing[spec.base_image_key]] - skipped_base = total_base - len({spec.base_image_key for spec in missing_base_specs}) + base_spec_by_key.setdefault(key, spec) + remote_tag = prefixed(key) + + if remote_tag and remote_image_exists(remote_tag): + logger.info("Base image %s already in registry; reusing", remote_tag) + if not ensure_local(key): + try: + img = client.images.pull(remote_tag) + if remote_tag != key: + img.tag(key) 
+ except Exception as exc: # pragma: no cover - best effort + logger.warning( + "Failed to pull %s (%s); will rebuild locally", remote_tag, exc + ) + base_to_build_keys.add(key) + continue + continue + + if ensure_local(key): + base_to_push.add(key) + continue + + base_to_build_keys.add(key) + + missing_base_specs = [base_spec_by_key[k] for k in base_to_build_keys] + skipped_base = total_base - len(base_to_build_keys) if missing_base_specs: logger.info( @@ -120,18 +164,32 @@ def build_env_images( build_base_images( client, missing_base_specs, force_rebuild=False, build_mode=build_mode ) + base_built = {spec.base_image_key for spec in missing_base_specs} + base_to_push.update(base_built) else: logger.info("All %s base images already exist; skipping base builds", total_base) - env_missing: dict[str, bool] = {} + env_to_push: set[str] = set() + missing_env_specs: list = [] + for spec in exec_specs: key = spec.env_image_key - if key not in env_missing: - env_missing[key] = not image_exists(client, key) - missing_env_specs = [spec for spec in exec_specs if env_missing[spec.env_image_key]] + remote_tag = prefixed(key) + + if remote_tag and remote_image_exists(remote_tag): + logger.info("Env image %s already in registry; skipping build", remote_tag) + continue + + if ensure_local(key): + logger.info("Env image %s already present locally; reusing", key) + env_to_push.add(key) + continue + + missing_env_specs.append(spec) + if not missing_env_specs: logger.info("All %s env images already exist; skipping env builds", total_env) - return + return base_to_push, env_to_push batches = list(chunked(missing_env_specs, max(1, batch_size))) logger.info( @@ -175,6 +233,9 @@ def build_env_images( max_retries, exc, ) + env_to_push.update({spec.env_image_key for spec in missing_env_specs}) + + return base_to_push, env_to_push def chunked(seq: Sequence, size: int) -> Iterator[List]: @@ -182,14 +243,6 @@ def chunked(seq: Sequence, size: int) -> Iterator[List]: yield list(seq[i : i + 
size]) -def image_exists(client: docker.DockerClient, tag: str) -> bool: - try: - client.images.get(tag) - return True - except ImageNotFound: - return False - - def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: """ Tag the provided images with the registry prefix and push them. @@ -303,12 +356,13 @@ def main() -> None: spec.arch = args.arch logger.info("Overrode ExecSpec architecture to %s", args.arch) - build_env_images( + base_to_push, env_to_push = build_env_images( exec_specs, max_workers=args.max_workers, build_mode=args.build_mode, max_retries=args.max_retries, batch_size=args.build_batch_size, + image_prefix=None if args.no_push else args.image_prefix, ) base_images = {spec.base_image_key for spec in exec_specs} @@ -316,8 +370,12 @@ def main() -> None: logger.info("Built images: %s base, %s env", len(base_images), len(env_images)) if not args.no_push: - pushed = tag_and_push(base_images | env_images, args.image_prefix) - logger.info("Pushed %s images", len(pushed)) + to_push = base_to_push | env_to_push + if to_push: + pushed = tag_and_push(to_push, args.image_prefix) + logger.info("Pushed %s images", len(pushed)) + else: + logger.info("No images need pushing; all present in registry") manifest = { "dataset": args.dataset, From 0e1ddce6ce3a97b45ccc654ca2fed8fad862f378 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 14:17:04 +0100 Subject: [PATCH 14/32] Only build/push when registry missing --- benchmarks/swtbench/build_eval_env_images.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index a8cc0e21..ac04d83b 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -145,10 +145,6 @@ def ensure_local(tag: str) -> bool: continue continue - if ensure_local(key): - base_to_push.add(key) - continue - base_to_build_keys.add(key) missing_base_specs = [base_spec_by_key[k] 
for k in base_to_build_keys] @@ -180,11 +176,6 @@ def ensure_local(tag: str) -> bool: logger.info("Env image %s already in registry; skipping build", remote_tag) continue - if ensure_local(key): - logger.info("Env image %s already present locally; reusing", key) - env_to_push.add(key) - continue - missing_env_specs.append(spec) if not missing_env_specs: From 7a0f1824f66aeced1e9fa0df7f44b3d865b64e25 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 14:18:16 +0100 Subject: [PATCH 15/32] Always pull remote base images; drop local fallback --- benchmarks/swtbench/build_eval_env_images.py | 29 +++++++------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index ac04d83b..66380c91 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -8,7 +8,6 @@ from typing import Iterable, Iterator, List, Sequence import docker -from docker.errors import ImageNotFound from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset @@ -117,13 +116,6 @@ def build_env_images( def prefixed(tag: str) -> str | None: return f"{remote_prefix}/{tag}" if remote_prefix else None - def ensure_local(tag: str) -> bool: - try: - client.images.get(tag) - return True - except ImageNotFound: - return False - base_spec_by_key = {} for spec in exec_specs: key = spec.base_image_key @@ -132,17 +124,16 @@ def ensure_local(tag: str) -> bool: if remote_tag and remote_image_exists(remote_tag): logger.info("Base image %s already in registry; reusing", remote_tag) - if not ensure_local(key): - try: - img = client.images.pull(remote_tag) - if remote_tag != key: - img.tag(key) - except Exception as exc: # pragma: no cover - best effort - logger.warning( - "Failed to pull %s (%s); will rebuild locally", remote_tag, exc - ) - base_to_build_keys.add(key) - continue + try: + img = 
client.images.pull(remote_tag) + if remote_tag != key: + img.tag(key) + except Exception as exc: # pragma: no cover - best effort + logger.warning( + "Failed to pull %s (%s); will rebuild locally", remote_tag, exc + ) + base_to_build_keys.add(key) + continue continue base_to_build_keys.add(key) From a32372954f8867422246b634ba7091f8f0f69e7b Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 15:38:45 +0100 Subject: [PATCH 16/32] Drop micromamba patch; fail if prebaked images missing --- benchmarks/swtbench/eval_infer.py | 6 +--- benchmarks/swtbench/image_utils.py | 53 ------------------------------ 2 files changed, 1 insertion(+), 58 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 4be058fb..b7b468b0 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -17,10 +17,7 @@ import sys from pathlib import Path -from benchmarks.swtbench.image_utils import ( - ensure_swt_bench_repo, - patch_swt_bench_for_micromamba, -) +from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -198,7 +195,6 @@ def run_swtbench_evaluation( try: swt_bench_dir = ensure_swt_bench_repo() - patch_swt_bench_for_micromamba(swt_bench_dir) # Get the directory and filename of the predictions file predictions_path = Path(predictions_file).resolve() diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index f855b272..b76ac5ec 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -44,59 +44,6 @@ def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: return swt_bench_dir -def patch_swt_bench_for_micromamba( - swt_bench_dir: Path, solver_timeout_s: int = 300 -) -> None: - """ - Patch the cached swt-bench checkout to use micromamba 
with timeouts when - building environments. Idempotent: safe to call multiple times. - """ - dockerfiles_path = swt_bench_dir / "src" / "dockerfiles.py" - exec_spec_path = swt_bench_dir / "src" / "exec_spec.py" - - if not dockerfiles_path.exists() or not exec_spec_path.exists(): - logger.warning( - "swt-bench sources missing expected files; skipping micromamba patch " - "(dockerfiles: %s, exec_spec: %s)", - dockerfiles_path.exists(), - exec_spec_path.exists(), - ) - return - - dockerfiles_text = dockerfiles_path.read_text() - dockerfiles_updated = dockerfiles_text.replace( - "RUN conda config --append channels conda-forge\n\nRUN adduser", - "RUN conda config --append channels conda-forge\n" - "# Use micromamba for faster solver performance during env builds\n" - "RUN conda install -n base -c conda-forge -y micromamba \\\n" - " && ln -s /opt/miniconda3/bin/micromamba /usr/local/bin/micromamba\n" - "ENV MAMBA_ROOT_PREFIX=/opt/miniconda3\n" - "ENV MAMBA_EXE=/opt/miniconda3/bin/micromamba\n\n" - "RUN adduser", - ) - - exec_spec_text = exec_spec_path.read_text() - replacements = { - "conda create -n ": f"timeout {solver_timeout_s}s micromamba create -n ", - "conda create -c conda-forge -n ": f"timeout {solver_timeout_s}s micromamba create -c conda-forge -n ", - "conda env create --file": f"timeout {solver_timeout_s}s micromamba env create --file", - "conda env update -f": f"timeout {solver_timeout_s}s micromamba env update -f", - "conda install python=": f"timeout {solver_timeout_s}s micromamba install python=", - } - for old, new in replacements.items(): - exec_spec_text = exec_spec_text.replace(old, new) - - if dockerfiles_text != dockerfiles_updated: - dockerfiles_path.write_text(dockerfiles_updated) - logger.info("Patched swt-bench Dockerfile template to install micromamba.") - if exec_spec_path.read_text() != exec_spec_text: - exec_spec_path.write_text(exec_spec_text) - logger.info( - "Patched swt-bench exec_spec to use micromamba with a %ss timeout.", - 
solver_timeout_s, - ) - - def _load_instance_ids(output_jsonl: Path) -> list[str]: instance_ids: list[str] = [] seen = set() From 8ac449d22ef6e39e1a43d54cc758b0adbf64c261 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 15:58:24 +0100 Subject: [PATCH 17/32] Format with pre-commit --- benchmarks/swtbench/build_eval_env_images.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 66380c91..ffd684e9 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -154,7 +154,9 @@ def prefixed(tag: str) -> str | None: base_built = {spec.base_image_key for spec in missing_base_specs} base_to_push.update(base_built) else: - logger.info("All %s base images already exist; skipping base builds", total_base) + logger.info( + "All %s base images already exist; skipping base builds", total_base + ) env_to_push: set[str] = set() missing_env_specs: list = [] From 1f7248a59964e368f955078df2d44d41b4e789dc Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Thu, 15 Jan 2026 17:04:38 +0100 Subject: [PATCH 18/32] Push eval images as they are built --- benchmarks/swtbench/build_eval_env_images.py | 28 ++++++++------------ 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index ffd684e9..b147c112 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -95,9 +95,12 @@ def build_env_images( max_retries: int, batch_size: int, image_prefix: str | None, -) -> tuple[set[str], set[str]]: +) -> None: """ Build base + environment images required by the provided ExecSpecs. + + Images are pushed immediately after each successful build when image_prefix is set, + so partial progress is kept if the workflow fails mid-run. 
""" from src.docker_build import ( # type: ignore[import-not-found] BuildImageError, @@ -110,7 +113,6 @@ def build_env_images( total_env = len({spec.env_image_key for spec in exec_specs}) remote_prefix = image_prefix.rstrip("/") if image_prefix else None - base_to_push: set[str] = set() base_to_build_keys: set[str] = set() def prefixed(tag: str) -> str | None: @@ -152,13 +154,13 @@ def prefixed(tag: str) -> str | None: client, missing_base_specs, force_rebuild=False, build_mode=build_mode ) base_built = {spec.base_image_key for spec in missing_base_specs} - base_to_push.update(base_built) + if image_prefix: + tag_and_push(base_built, image_prefix) else: logger.info( "All %s base images already exist; skipping base builds", total_base ) - env_to_push: set[str] = set() missing_env_specs: list = [] for spec in exec_specs: @@ -173,7 +175,7 @@ def prefixed(tag: str) -> str | None: if not missing_env_specs: logger.info("All %s env images already exist; skipping env builds", total_env) - return base_to_push, env_to_push + return batches = list(chunked(missing_env_specs, max(1, batch_size))) logger.info( @@ -197,6 +199,8 @@ def prefixed(tag: str) -> str | None: max_workers=max_workers, build_mode=build_mode, ) + if image_prefix: + tag_and_push({spec.env_image_key for spec in batch}, image_prefix) break except BuildImageError as exc: attempt += 1 @@ -217,9 +221,7 @@ def prefixed(tag: str) -> str | None: max_retries, exc, ) - env_to_push.update({spec.env_image_key for spec in missing_env_specs}) - - return base_to_push, env_to_push + return def chunked(seq: Sequence, size: int) -> Iterator[List]: @@ -340,7 +342,7 @@ def main() -> None: spec.arch = args.arch logger.info("Overrode ExecSpec architecture to %s", args.arch) - base_to_push, env_to_push = build_env_images( + build_env_images( exec_specs, max_workers=args.max_workers, build_mode=args.build_mode, @@ -353,14 +355,6 @@ def main() -> None: env_images = {spec.env_image_key for spec in exec_specs} logger.info("Built 
images: %s base, %s env", len(base_images), len(env_images)) - if not args.no_push: - to_push = base_to_push | env_to_push - if to_push: - pushed = tag_and_push(to_push, args.image_prefix) - logger.info("Pushed %s images", len(pushed)) - else: - logger.info("No images need pushing; all present in registry") - manifest = { "dataset": args.dataset, "split": args.split, From d10418cf018c95f6688cef1ebc30618abd602529 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 16 Jan 2026 16:35:50 +0100 Subject: [PATCH 19/32] Fallback to micromamba when prebaked swtbench eval images missing Co-authored-by: openhands --- benchmarks/swtbench/eval_infer.py | 51 ++++++++++++++-- benchmarks/swtbench/image_utils.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 5 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index d98dbdde..1476131e 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,10 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.image_utils import ensure_swt_bench_repo +from benchmarks.swtbench.image_utils import ( + ensure_swt_bench_repo, + pull_prebaked_eval_images, +) from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -246,13 +249,51 @@ def run_swtbench_evaluation( logger.info(f"Running SWT-Bench evaluation on {predictions_file}") try: + predictions_path = Path(predictions_file).resolve() + predictions_filename = predictions_path.name + swt_bench_dir = ensure_swt_bench_repo() - patch_swt_bench_for_micromamba(swt_bench_dir) + prebaked_ok, prebaked_details = pull_prebaked_eval_images( + predictions_path, dataset, split=os.getenv("SWT_BENCH_SPLIT", "test") + ) + if prebaked_ok: + logger.info( + "Using prebaked SWT-Bench eval images from %s (%s pulled).", + prebaked_details.get("prefix"), + 
len(prebaked_details.get("pulled", [])), + ) + else: + missing = prebaked_details.get("missing", []) + sample_missing = ", ".join( + (m.get("remote") or m.get("tag", "")) + + (f" [{m.get('reason')}]" if m.get("reason") else "") + for m in missing[:5] + ) + logger.warning( + "Prebaked SWT-Bench eval images unavailable; falling back to micromamba builds. " + "prefix=%s dataset=%s split=%s required=%s missing=%s sample_missing=%s auth=%s detail=%s", + prebaked_details.get("prefix"), + dataset, + prebaked_details.get("split"), + prebaked_details.get("required_count"), + len(missing), + sample_missing or "n/a", + "yes" if prebaked_details.get("used_auth") else "no", + prebaked_details.get("error") or "missing images", + ) + pull_errors = prebaked_details.get("pull_errors") or [] + if pull_errors: + logger.info( + "Pull/tag issues (truncated): %s", + "; ".join( + f"{err.get('remote')}: {err.get('reason')}" + + (f" ({err.get('error')})" if err.get("error") else "") + for err in pull_errors[:3] + ), + ) - # Get the directory and filename of the predictions file - predictions_path = Path(predictions_file).resolve() - predictions_filename = predictions_path.name + patch_swt_bench_for_micromamba(swt_bench_dir) # Copy predictions file to swt-bench directory swt_predictions_file = swt_bench_dir / predictions_filename diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index b76ac5ec..76f7d6fc 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -2,15 +2,18 @@ import json import logging +import os import subprocess import sys from pathlib import Path from typing import Iterable +from benchmarks.utils.image_utils import image_exists from openhands.sdk import get_logger logger = get_logger(__name__) +DEFAULT_EVAL_IMAGE_PREFIX = "ghcr.io/openhands/swtbench-eval" def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: @@ -126,6 +129,100 @@ def format_images_plain(images: Iterable[str]) -> str: return 
"\n".join(sorted(images)) +def _run_docker(cmd: list[str]) -> tuple[bool, str]: + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + return False, (result.stderr or result.stdout or "").strip() + return True, (result.stdout or "").strip() + + +def pull_prebaked_eval_images( + predictions_file: Path, + dataset: str, + split: str, + *, + image_prefix: str | None = None, + gh_username: str | None = None, + gh_pat: str | None = None, +) -> tuple[bool, dict]: + """ + Attempt to pull prebaked SWT-bench eval base/env images from a registry. + + Returns (all_available, details_dict). + """ + prefix = ( + image_prefix + or os.getenv("SWT_BENCH_EVAL_IMAGE_PREFIX") + or DEFAULT_EVAL_IMAGE_PREFIX + ).rstrip("/") + details: dict = { + "prefix": prefix, + "dataset": dataset, + "split": split, + } + + if not prefix: + details["error"] = "empty_prefix" + return False, details + + try: + base_images, env_images = compute_required_images( + predictions_file, dataset, split + ) + except Exception as exc: # pragma: no cover - network/FS issues + details["error"] = f"compute_failed: {exc}" + return False, details + + required = sorted(base_images | env_images) + details["required_count"] = len(required) + if not required: + details["error"] = "no_required_images" + return False, details + + gh_user = gh_username or os.getenv("GHCR_USERNAME") or os.getenv("GITHUB_ACTOR") + gh_token = gh_pat or os.getenv("GHCR_PAT") + + missing: list[dict] = [] + pulled: list[str] = [] + pull_errors: list[dict] = [] + + for tag in required: + remote_tag = f"{prefix}/{tag}" + exists = image_exists(remote_tag, gh_username=gh_user, gh_pat=gh_token) + if not exists: + missing.append({"remote": remote_tag, "tag": tag, "reason": "not_found"}) + continue + + ok, err = _run_docker(["docker", "pull", remote_tag]) + if not ok: + pull_errors.append( + { + "remote": remote_tag, + "tag": tag, + "reason": "pull_failed", + "error": err, + } + ) + missing.append({"remote": 
remote_tag, "tag": tag, "reason": "pull_failed"}) + continue + + ok, err = _run_docker(["docker", "tag", remote_tag, tag]) + if not ok: + pull_errors.append( + {"remote": remote_tag, "tag": tag, "reason": "tag_failed", "error": err} + ) + missing.append({"remote": remote_tag, "tag": tag, "reason": "tag_failed"}) + continue + + pulled.append(tag) + + details["missing"] = missing + details["pulled"] = pulled + details["pull_errors"] = pull_errors + details["used_auth"] = bool(gh_user and gh_token) + return len(missing) == 0, details + + def main() -> None: import argparse From ab15ea7586bf7bdc89fff1b026685d34cb67a24a Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 16 Jan 2026 16:42:27 +0100 Subject: [PATCH 20/32] Revert "Fallback to micromamba when prebaked swtbench eval images missing" This reverts commit d10418cf018c95f6688cef1ebc30618abd602529. --- benchmarks/swtbench/eval_infer.py | 51 ++-------------- benchmarks/swtbench/image_utils.py | 97 ------------------------------ 2 files changed, 5 insertions(+), 143 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 1476131e..d98dbdde 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,10 +18,7 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.image_utils import ( - ensure_swt_bench_repo, - pull_prebaked_eval_images, -) +from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.laminar import LaminarService from benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -249,51 +246,13 @@ def run_swtbench_evaluation( logger.info(f"Running SWT-Bench evaluation on {predictions_file}") try: - predictions_path = Path(predictions_file).resolve() - predictions_filename = predictions_path.name - swt_bench_dir = ensure_swt_bench_repo() - prebaked_ok, prebaked_details = pull_prebaked_eval_images( - 
predictions_path, dataset, split=os.getenv("SWT_BENCH_SPLIT", "test") - ) - if prebaked_ok: - logger.info( - "Using prebaked SWT-Bench eval images from %s (%s pulled).", - prebaked_details.get("prefix"), - len(prebaked_details.get("pulled", [])), - ) - else: - missing = prebaked_details.get("missing", []) - sample_missing = ", ".join( - (m.get("remote") or m.get("tag", "")) - + (f" [{m.get('reason')}]" if m.get("reason") else "") - for m in missing[:5] - ) - logger.warning( - "Prebaked SWT-Bench eval images unavailable; falling back to micromamba builds. " - "prefix=%s dataset=%s split=%s required=%s missing=%s sample_missing=%s auth=%s detail=%s", - prebaked_details.get("prefix"), - dataset, - prebaked_details.get("split"), - prebaked_details.get("required_count"), - len(missing), - sample_missing or "n/a", - "yes" if prebaked_details.get("used_auth") else "no", - prebaked_details.get("error") or "missing images", - ) - pull_errors = prebaked_details.get("pull_errors") or [] - if pull_errors: - logger.info( - "Pull/tag issues (truncated): %s", - "; ".join( - f"{err.get('remote')}: {err.get('reason')}" - + (f" ({err.get('error')})" if err.get("error") else "") - for err in pull_errors[:3] - ), - ) + patch_swt_bench_for_micromamba(swt_bench_dir) - patch_swt_bench_for_micromamba(swt_bench_dir) + # Get the directory and filename of the predictions file + predictions_path = Path(predictions_file).resolve() + predictions_filename = predictions_path.name # Copy predictions file to swt-bench directory swt_predictions_file = swt_bench_dir / predictions_filename diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index 76f7d6fc..b76ac5ec 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -2,18 +2,15 @@ import json import logging -import os import subprocess import sys from pathlib import Path from typing import Iterable -from benchmarks.utils.image_utils import image_exists from openhands.sdk import 
get_logger logger = get_logger(__name__) -DEFAULT_EVAL_IMAGE_PREFIX = "ghcr.io/openhands/swtbench-eval" def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path: @@ -129,100 +126,6 @@ def format_images_plain(images: Iterable[str]) -> str: return "\n".join(sorted(images)) -def _run_docker(cmd: list[str]) -> tuple[bool, str]: - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - return False, (result.stderr or result.stdout or "").strip() - return True, (result.stdout or "").strip() - - -def pull_prebaked_eval_images( - predictions_file: Path, - dataset: str, - split: str, - *, - image_prefix: str | None = None, - gh_username: str | None = None, - gh_pat: str | None = None, -) -> tuple[bool, dict]: - """ - Attempt to pull prebaked SWT-bench eval base/env images from a registry. - - Returns (all_available, details_dict). - """ - prefix = ( - image_prefix - or os.getenv("SWT_BENCH_EVAL_IMAGE_PREFIX") - or DEFAULT_EVAL_IMAGE_PREFIX - ).rstrip("/") - details: dict = { - "prefix": prefix, - "dataset": dataset, - "split": split, - } - - if not prefix: - details["error"] = "empty_prefix" - return False, details - - try: - base_images, env_images = compute_required_images( - predictions_file, dataset, split - ) - except Exception as exc: # pragma: no cover - network/FS issues - details["error"] = f"compute_failed: {exc}" - return False, details - - required = sorted(base_images | env_images) - details["required_count"] = len(required) - if not required: - details["error"] = "no_required_images" - return False, details - - gh_user = gh_username or os.getenv("GHCR_USERNAME") or os.getenv("GITHUB_ACTOR") - gh_token = gh_pat or os.getenv("GHCR_PAT") - - missing: list[dict] = [] - pulled: list[str] = [] - pull_errors: list[dict] = [] - - for tag in required: - remote_tag = f"{prefix}/{tag}" - exists = image_exists(remote_tag, gh_username=gh_user, gh_pat=gh_token) - if not exists: - missing.append({"remote": remote_tag, "tag": tag, 
"reason": "not_found"}) - continue - - ok, err = _run_docker(["docker", "pull", remote_tag]) - if not ok: - pull_errors.append( - { - "remote": remote_tag, - "tag": tag, - "reason": "pull_failed", - "error": err, - } - ) - missing.append({"remote": remote_tag, "tag": tag, "reason": "pull_failed"}) - continue - - ok, err = _run_docker(["docker", "tag", remote_tag, tag]) - if not ok: - pull_errors.append( - {"remote": remote_tag, "tag": tag, "reason": "tag_failed", "error": err} - ) - missing.append({"remote": remote_tag, "tag": tag, "reason": "tag_failed"}) - continue - - pulled.append(tag) - - details["missing"] = missing - details["pulled"] = pulled - details["pull_errors"] = pull_errors - details["used_auth"] = bool(gh_user and gh_token) - return len(missing) == 0, details - - def main() -> None: import argparse From af6d559c6e25577a9c1a6960d7b553063db8cddf Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 16 Jan 2026 16:44:44 +0100 Subject: [PATCH 21/32] Remove unused micromamba patching from swtbench eval Co-authored-by: openhands --- benchmarks/swtbench/eval_infer.py | 53 ------------------------------- 1 file changed, 53 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index d98dbdde..27d68e3e 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -28,57 +28,6 @@ logger = get_logger(__name__) -def patch_swt_bench_for_micromamba(swt_bench_dir: Path) -> None: - """ - Ensure the cached swt-bench checkout uses micromamba for env creation. - Applies small, idempotent text replacements to the upstream sources. 
- """ - solver_timeout_s = 600 - dockerfiles_path = swt_bench_dir / "src" / "dockerfiles.py" - exec_spec_path = swt_bench_dir / "src" / "exec_spec.py" - - if not dockerfiles_path.exists() or not exec_spec_path.exists(): - logger.warning( - "swt-bench sources missing expected files; skipping micromamba patch " - f"(dockerfiles: {dockerfiles_path.exists()}, exec_spec: {exec_spec_path.exists()})" - ) - return - - dockerfiles_text = dockerfiles_path.read_text() - dockerfiles_updated = dockerfiles_text.replace( - "RUN conda config --append channels conda-forge\n\nRUN adduser", - "RUN conda config --append channels conda-forge\n" - "# Use micromamba for faster solver performance during env builds\n" - "RUN conda install -n base -c conda-forge -y micromamba \\\n" - " && ln -s /opt/miniconda3/bin/micromamba /usr/local/bin/micromamba\n" - "ENV MAMBA_ROOT_PREFIX=/opt/miniconda3\n" - "ENV MAMBA_EXE=/opt/miniconda3/bin/micromamba\n\n" - "RUN adduser", - ) - - exec_spec_text = exec_spec_path.read_text() - replacements = { - "conda create -n ": f"timeout {solver_timeout_s}s micromamba create -n ", - "conda create -c conda-forge -n ": f"timeout {solver_timeout_s}s micromamba create -c conda-forge -n ", - "conda env create --file": f"timeout {solver_timeout_s}s micromamba env create --file", - "conda env update -f": f"timeout {solver_timeout_s}s micromamba env update -f", - "conda install python=": f"timeout {solver_timeout_s}s micromamba install python=", - } - for old, new in replacements.items(): - exec_spec_text = exec_spec_text.replace(old, new) - - if dockerfiles_text != dockerfiles_updated: - dockerfiles_path.write_text(dockerfiles_updated) - logger.info("Patched swt-bench Dockerfile template to install micromamba.") - if exec_spec_path.read_text() != exec_spec_text: - exec_spec_path.write_text(exec_spec_text) - logger.info( - "Patched swt-bench exec_spec to create/update envs with micromamba " - "and a %ss timeout on solver calls.", - solver_timeout_s, - ) - - def 
_load_prediction_instance_ids(predictions_file: Path) -> list[str]: instance_ids: list[str] = [] seen = set() @@ -248,8 +197,6 @@ def run_swtbench_evaluation( try: swt_bench_dir = ensure_swt_bench_repo() - patch_swt_bench_for_micromamba(swt_bench_dir) - # Get the directory and filename of the predictions file predictions_path = Path(predictions_file).resolve() predictions_filename = predictions_path.name From 1531ce5d90dc5733ef68bd27555d4ab0f3edf5e5 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 16 Jan 2026 17:08:59 +0100 Subject: [PATCH 22/32] Remove unused eval arch override path for swtbench prebaked images Co-authored-by: openhands --- .github/workflows/build-swtbench-images.yml | 12 ------------ benchmarks/swtbench/build_eval_env_images.py | 13 +------------ 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index c375f519..f48587ae 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -44,14 +44,6 @@ on: required: false default: 'ghcr.io/openhands/swtbench-eval' type: string - eval-arch: - description: 'Architecture for prebaked eval images' - required: false - default: 'x86_64' - type: choice - options: - - x86_64 - - arm64 max-retries: description: 'Retries per batch for eval env builds' required: false @@ -202,7 +194,6 @@ jobs: N_LIMIT="${{ inputs.n-limit || '0' }}" INSTANCE_IDS="${{ inputs.instance-ids }}" IMAGE_PREFIX="${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}" - EVAL_ARCH="${{ inputs.eval-arch || 'x86_64' }}" MAX_WORKERS="${{ inputs.max-workers || '4' }}" BUILD_MODE="${{ inputs.build-mode || 'cli' }}" MAX_RETRIES="${{ inputs.max-retries || '2' }}" @@ -234,9 +225,6 @@ jobs: else ARGS+=(--eval-limit "${N_LIMIT}") fi - if [ -n "${EVAL_ARCH}" ]; then - ARGS+=(--arch "${EVAL_ARCH}") - fi echo "Running prebaked eval image build: uv run swtbench-build-eval-images 
${ARGS[*]}" DOCKER_BUILDKIT=1 \ diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index b147c112..079ad66c 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -280,12 +280,6 @@ def main() -> None: default="ghcr.io/openhands/swtbench-eval", help="Registry prefix for pushed images", ) - parser.add_argument( - "--arch", - choices=["x86_64", "arm64", ""], - default="", - help="Force architecture for built images (defaults to host arch)", - ) parser.add_argument( "--max-workers", type=int, @@ -337,11 +331,6 @@ def main() -> None: exec_specs = load_exec_specs( swt_bench_dir, args.dataset, args.split, target_ids, filter_swt=True ) - if args.arch: - for spec in exec_specs: - spec.arch = args.arch - logger.info("Overrode ExecSpec architecture to %s", args.arch) - build_env_images( exec_specs, max_workers=args.max_workers, @@ -362,7 +351,7 @@ def main() -> None: "base_images": sorted(base_images), "env_images": sorted(env_images), "image_prefix": args.image_prefix, - "arch": args.arch or "host", + "arch": "host", } print(json.dumps(manifest, indent=2)) From b890251275f60c5b8b2b070342837a1d86c411c9 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 12:41:28 +0100 Subject: [PATCH 23/32] Add prebaked image pull fallback and force conda solver --- benchmarks/swtbench/eval_infer.py | 97 ++++++++++++++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 27d68e3e..2b8684a1 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,7 +18,10 @@ from pathlib import Path from time import monotonic -from benchmarks.swtbench.image_utils import ensure_swt_bench_repo +from benchmarks.swtbench.image_utils import ( + compute_required_images, + ensure_swt_bench_repo, +) from benchmarks.utils.laminar import LaminarService from 
benchmarks.utils.patch_utils import remove_files_from_patch from benchmarks.utils.report_costs import generate_cost_report @@ -27,6 +30,8 @@ logger = get_logger(__name__) +PREBAKED_REGISTRY = "ghcr.io/openhands/swtbench-eval" + def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: instance_ids: list[str] = [] @@ -59,6 +64,65 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]: return instance_ids +def try_pull_prebaked_images( + predictions_file: Path, + dataset: str, + split: str = "test", + registry: str = PREBAKED_REGISTRY, + *, + filter_swt: bool = True, + is_swt: bool = True, +) -> None: + """ + Best-effort pull of prebaked base/env images; no-op on failure. + """ + try: + base_images, env_images = compute_required_images( + predictions_file, + dataset, + split, + filter_swt=filter_swt, + is_swt=is_swt, + ) + except Exception as exc: # pragma: no cover - defensive + logger.warning("Skipping prebaked image pull (compute failed): %s", exc) + return + + tags = sorted(base_images | env_images) + if not tags: + logger.info("No prebaked images to pull (empty tag set)") + return + + registry = registry.rstrip("/") + for tag in tags: + remote = f"{registry}/{tag}" + logger.info("Attempting to pull prebaked image %s", remote) + try: + pull = subprocess.run( + ["docker", "pull", remote], + capture_output=True, + text=True, + ) + except FileNotFoundError: + logger.warning("Docker not available; skipping prebaked image pull") + return + + if pull.returncode != 0: + logger.warning("Failed to pull %s: %s", remote, pull.stderr.strip()) + continue + + # Tag the remote image with the local name expected by the harness. 
+ tag_res = subprocess.run( + ["docker", "tag", remote, tag], + capture_output=True, + text=True, + ) + if tag_res.returncode != 0: + logger.warning("Failed to tag %s as %s: %s", remote, tag, tag_res.stderr) + else: + logger.info("Pulled and tagged %s -> %s", remote, tag) + + def update_report_with_submitted_instances( report_path: Path, predictions_path: Path ) -> None: @@ -228,6 +292,8 @@ def run_swtbench_evaluation( # Set up environment with PYTHONPATH to include swt-bench directory env = os.environ.copy() env["PYTHONPATH"] = str(swt_bench_dir) + # Force classic conda solver (avoid libmamba plugin issues) + env.setdefault("CONDA_SOLVER", "classic") cmd = [ python_executable, @@ -302,6 +368,12 @@ def main() -> None: "(default: eth-sri/SWT-bench_Verified_bm25_27k_zsp)", ) + parser.add_argument( + "--dataset-split", + default="test", + help="Dataset split to use when computing prebaked images (default: test)", + ) + parser.add_argument( "--output-file", help="Output file for SWT-Bench format " @@ -326,6 +398,19 @@ def main() -> None: help="Number of workers to use when evaluating", ) + parser.add_argument( + "--no-prebaked-pull", + action="store_true", + help="Skip pulling prebaked GHCR SWT-Bench images before evaluation", + ) + + parser.add_argument( + "--prebaked-registry", + default=PREBAKED_REGISTRY, + help="Registry prefix for prebaked SWT-Bench images " + f"(default: {PREBAKED_REGISTRY})", + ) + args = parser.parse_args() # Validate input file @@ -346,12 +431,22 @@ def main() -> None: logger.info(f"Input file: {input_file}") logger.info(f"Output file: {output_file}") logger.info(f"Dataset: {args.dataset}") + logger.info(f"Dataset split: {args.dataset_split}") logger.info(f"Model name: {args.model_name}") try: # Convert format convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) + if not args.no_prebaked_pull: + try_pull_prebaked_images( + output_file, + args.dataset, + split=args.dataset_split, + registry=args.prebaked_registry, + 
is_swt=True, + ) + if not args.skip_evaluation: eval_phase_start = monotonic() # Run evaluation From 9daea9f23fffb6cca3560c19ceae52d75489b112 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 12:42:41 +0100 Subject: [PATCH 24/32] Simplify prebaked toggle via env vars --- benchmarks/swtbench/eval_infer.py | 37 ++++++++++++------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 2b8684a1..9fec69c6 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -368,12 +368,6 @@ def main() -> None: "(default: eth-sri/SWT-bench_Verified_bm25_27k_zsp)", ) - parser.add_argument( - "--dataset-split", - default="test", - help="Dataset split to use when computing prebaked images (default: test)", - ) - parser.add_argument( "--output-file", help="Output file for SWT-Bench format " @@ -398,19 +392,6 @@ def main() -> None: help="Number of workers to use when evaluating", ) - parser.add_argument( - "--no-prebaked-pull", - action="store_true", - help="Skip pulling prebaked GHCR SWT-Bench images before evaluation", - ) - - parser.add_argument( - "--prebaked-registry", - default=PREBAKED_REGISTRY, - help="Registry prefix for prebaked SWT-Bench images " - f"(default: {PREBAKED_REGISTRY})", - ) - args = parser.parse_args() # Validate input file @@ -431,21 +412,31 @@ def main() -> None: logger.info(f"Input file: {input_file}") logger.info(f"Output file: {output_file}") logger.info(f"Dataset: {args.dataset}") - logger.info(f"Dataset split: {args.dataset_split}") logger.info(f"Model name: {args.model_name}") try: # Convert format convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) - if not args.no_prebaked_pull: + # Default: attempt to use prebaked images; allow opting out via env. 
+ use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( + "1", + "true", + "yes", + ) + prebaked_registry = os.getenv("SWTBENCH_PREBAKED_REGISTRY", PREBAKED_REGISTRY) + prebaked_split = os.getenv("SWTBENCH_DATASET_SPLIT", "test") + + if use_prebaked: try_pull_prebaked_images( output_file, args.dataset, - split=args.dataset_split, - registry=args.prebaked_registry, + split=prebaked_split, + registry=prebaked_registry, is_swt=True, ) + else: + logger.info("SWTBENCH_FORCE_CONDA set; skipping prebaked image pull") if not args.skip_evaluation: eval_phase_start = monotonic() From 9d0a410cda1dc9ab75a670ffa7381d619a157a9d Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 15:59:41 +0100 Subject: [PATCH 25/32] Add legacy opt-out path for swtbench prebaked pull --- benchmarks/swtbench/eval_infer.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 9fec69c6..aab7c617 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -242,6 +242,7 @@ def run_swtbench_evaluation( predictions_file: str, dataset: str = "eth-sri/SWT-bench_Verified_bm25_27k_zsp", workers: str = "12", + use_legacy: bool = False, ) -> None: """ Run SWT-Bench evaluation on the predictions file. 
@@ -256,7 +257,10 @@ def run_swtbench_evaluation( dataset: SWT-Bench dataset to evaluate against workers: Number of workers to use for evaluation """ - logger.info(f"Running SWT-Bench evaluation on {predictions_file}") + mode = "legacy-conda" if use_legacy else "prebaked-images" + logger.info( + "Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode + ) try: swt_bench_dir = ensure_swt_bench_repo() @@ -292,8 +296,6 @@ def run_swtbench_evaluation( # Set up environment with PYTHONPATH to include swt-bench directory env = os.environ.copy() env["PYTHONPATH"] = str(swt_bench_dir) - # Force classic conda solver (avoid libmamba plugin issues) - env.setdefault("CONDA_SOLVER", "classic") cmd = [ python_executable, @@ -419,11 +421,12 @@ def main() -> None: convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) # Default: attempt to use prebaked images; allow opting out via env. - use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( + force_conda = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ( "1", "true", "yes", ) + use_prebaked = not force_conda prebaked_registry = os.getenv("SWTBENCH_PREBAKED_REGISTRY", PREBAKED_REGISTRY) prebaked_split = os.getenv("SWTBENCH_DATASET_SPLIT", "test") @@ -436,12 +439,20 @@ def main() -> None: is_swt=True, ) else: - logger.info("SWTBENCH_FORCE_CONDA set; skipping prebaked image pull") + logger.info( + "SWTBENCH_FORCE_CONDA set; skipping prebaked image pull " + "and using legacy (pre-mamba) evaluation flow" + ) if not args.skip_evaluation: eval_phase_start = monotonic() # Run evaluation - run_swtbench_evaluation(str(output_file), args.dataset, args.workers) + run_swtbench_evaluation( + str(output_file), + args.dataset, + args.workers, + use_legacy=force_conda, + ) eval_phase_end = monotonic() cleanup_phase_start = monotonic() From b7c608e0211209dc12e210f771ccaf52196aa80c Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:01:48 +0100 Subject: [PATCH 26/32] 
Simplify prebaked toggle env handling --- benchmarks/swtbench/eval_infer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index aab7c617..84b00487 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -420,13 +420,12 @@ def main() -> None: # Convert format convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) - # Default: attempt to use prebaked images; allow opting out via env. - force_conda = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ( + # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow. + use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( "1", "true", "yes", ) - use_prebaked = not force_conda prebaked_registry = os.getenv("SWTBENCH_PREBAKED_REGISTRY", PREBAKED_REGISTRY) prebaked_split = os.getenv("SWTBENCH_DATASET_SPLIT", "test") @@ -451,7 +450,7 @@ def main() -> None: str(output_file), args.dataset, args.workers, - use_legacy=force_conda, + use_legacy=not use_prebaked, ) eval_phase_end = monotonic() From 5167fbdc55ab8d46809e1070e023915c0b2cdd8d Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:05:20 +0100 Subject: [PATCH 27/32] Hardcode prebaked params and simplify toggle --- benchmarks/swtbench/eval_infer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 84b00487..99ee4e98 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -421,13 +421,9 @@ def main() -> None: convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow. 
- use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( - "1", - "true", - "yes", - ) - prebaked_registry = os.getenv("SWTBENCH_PREBAKED_REGISTRY", PREBAKED_REGISTRY) - prebaked_split = os.getenv("SWTBENCH_DATASET_SPLIT", "test") + use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ("1", "true", "yes") + prebaked_registry = PREBAKED_REGISTRY + prebaked_split = "test" if use_prebaked: try_pull_prebaked_images( From 1a5a7da8ec9523711249f4b22f449b1c78bbb017 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:08:37 +0100 Subject: [PATCH 28/32] Read legacy toggle from env in eval runner --- benchmarks/swtbench/eval_infer.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 99ee4e98..04fcf541 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -242,7 +242,6 @@ def run_swtbench_evaluation( predictions_file: str, dataset: str = "eth-sri/SWT-bench_Verified_bm25_27k_zsp", workers: str = "12", - use_legacy: bool = False, ) -> None: """ Run SWT-Bench evaluation on the predictions file. 
@@ -257,6 +256,7 @@ def run_swtbench_evaluation( dataset: SWT-Bench dataset to evaluate against workers: Number of workers to use for evaluation """ + use_legacy = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ("1", "true", "yes") mode = "legacy-conda" if use_legacy else "prebaked-images" logger.info( "Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode @@ -442,12 +442,7 @@ def main() -> None: if not args.skip_evaluation: eval_phase_start = monotonic() # Run evaluation - run_swtbench_evaluation( - str(output_file), - args.dataset, - args.workers, - use_legacy=not use_prebaked, - ) + run_swtbench_evaluation(str(output_file), args.dataset, args.workers) eval_phase_end = monotonic() cleanup_phase_start = monotonic() From 1e6f82fad60e6111c65671d51df021b7342680a3 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:25:55 +0100 Subject: [PATCH 29/32] Default SWT image computation to SWT dataset --- benchmarks/swtbench/image_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index b76ac5ec..933d78ed 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -71,7 +71,6 @@ def compute_required_images( split: str, *, filter_swt: bool = True, - is_swt: bool = False, ) -> tuple[set[str], set[str]]: """ Compute the base/env image tags required to evaluate the given predictions file. 
@@ -91,7 +90,7 @@ def compute_required_images( from src.exec_spec import make_exec_spec # type: ignore[import-not-found] dataset_entries = load_swebench_dataset( - name=dataset, split=split, is_swt=is_swt, filter_swt=filter_swt + name=dataset, split=split, is_swt=True, filter_swt=filter_swt ) entries_by_id = {entry["instance_id"]: entry for entry in dataset_entries} From 8974cdfb1cfc97da32c0d38ee35b3488038596a6 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:28:48 +0100 Subject: [PATCH 30/32] Hide SWT filter flag and hardcode SWT dataset mode --- benchmarks/swtbench/eval_infer.py | 6 ------ benchmarks/swtbench/image_utils.py | 10 +--------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 04fcf541..bdd67ac4 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -69,9 +69,6 @@ def try_pull_prebaked_images( dataset: str, split: str = "test", registry: str = PREBAKED_REGISTRY, - *, - filter_swt: bool = True, - is_swt: bool = True, ) -> None: """ Best-effort pull of prebaked base/env images; no-op on failure. 
@@ -81,8 +78,6 @@ def try_pull_prebaked_images( predictions_file, dataset, split, - filter_swt=filter_swt, - is_swt=is_swt, ) except Exception as exc: # pragma: no cover - defensive logger.warning("Skipping prebaked image pull (compute failed): %s", exc) @@ -431,7 +426,6 @@ def main() -> None: args.dataset, split=prebaked_split, registry=prebaked_registry, - is_swt=True, ) else: logger.info( diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py index 933d78ed..e7aae1f4 100644 --- a/benchmarks/swtbench/image_utils.py +++ b/benchmarks/swtbench/image_utils.py @@ -69,8 +69,6 @@ def compute_required_images( output_jsonl: Path, dataset: str, split: str, - *, - filter_swt: bool = True, ) -> tuple[set[str], set[str]]: """ Compute the base/env image tags required to evaluate the given predictions file. @@ -90,7 +88,7 @@ def compute_required_images( from src.exec_spec import make_exec_spec # type: ignore[import-not-found] dataset_entries = load_swebench_dataset( - name=dataset, split=split, is_swt=True, filter_swt=filter_swt + name=dataset, split=split, is_swt=True, filter_swt=True ) entries_by_id = {entry["instance_id"]: entry for entry in dataset_entries} @@ -134,11 +132,6 @@ def main() -> None: parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl") parser.add_argument("--dataset", required=True, help="Dataset name") parser.add_argument("--split", default="test", help="Dataset split") - parser.add_argument( - "--no-filter-swt", - action="store_true", - help="Disable SWT filtering when loading the dataset", - ) parser.add_argument( "--format", choices=["plain", "json"], @@ -151,7 +144,6 @@ def main() -> None: args.output_jsonl, args.dataset, args.split, - filter_swt=not args.no_filter_swt, ) payload = { "base": sorted(base_images), From bde150201a48461cfaf83c98c64bcbcd9a4ca639 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 16:32:00 +0100 Subject: [PATCH 31/32] Use prebaked pull defaults --- 
benchmarks/swtbench/eval_infer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index bdd67ac4..db364a85 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -417,15 +417,10 @@ def main() -> None: # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow. use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ("1", "true", "yes") - prebaked_registry = PREBAKED_REGISTRY - prebaked_split = "test" if use_prebaked: try_pull_prebaked_images( output_file, args.dataset, - split=prebaked_split, - registry=prebaked_registry, ) else: logger.info( From 968c4b38277dccff37411274f01ab08431d57bb2 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Sat, 17 Jan 2026 17:43:25 +0100 Subject: [PATCH 32/32] Run pre-commit formatting --- benchmarks/swtbench/eval_infer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index db364a85..9dc7062b 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -253,9 +253,7 @@ def run_swtbench_evaluation( """ use_legacy = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ("1", "true", "yes") mode = "legacy-conda" if use_legacy else "prebaked-images" - logger.info( - "Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode - ) + logger.info("Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode) try: swt_bench_dir = ensure_swt_bench_repo() @@ -416,7 +414,11 @@ def main() -> None: convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) # Default: use prebaked images; SWTBENCH_FORCE_CONDA opts into legacy flow. 
- use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ("1", "true", "yes") + use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in ( + "1", + "true", + "yes", + ) if use_prebaked: try_pull_prebaked_images( output_file,