diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml
index 1ca70c1b..f48587ae 100644
--- a/.github/workflows/build-swtbench-images.yml
+++ b/.github/workflows/build-swtbench-images.yml
@@ -34,6 +34,26 @@ on:
         description: 'Software Agent SDK commit/ref to use'
         required: true
         type: string
+      build-eval-env:
+        description: 'Also build prebaked SWT-bench eval env images (default: false)'
+        required: false
+        default: 'false'
+        type: string
+      eval-image-prefix:
+        description: 'Registry prefix for prebaked eval images'
+        required: false
+        default: 'ghcr.io/openhands/swtbench-eval'
+        type: string
+      max-retries:
+        description: 'Retries per batch for eval env builds'
+        required: false
+        default: '2'
+        type: string
+      build-batch-size:
+        description: 'Env images per batch for eval env builds'
+        required: false
+        default: '10'
+        type: string

 concurrency:
   group: build-swt-bench-${{ github.ref }}
@@ -158,6 +178,64 @@ jobs:
           DOCKER_BUILDKIT: 1
           BUILDKIT_PROGRESS: plain

+      - name: Build prebaked eval env images
+        if: ${{ inputs.build-eval-env == 'true' }}
+        run: |
+          set -euo pipefail
+
+          echo "Starting prebaked eval env image build at $(date -u)"
+          echo "Runner: $(uname -a)"
+          df -h
+          docker system df || true
+          docker info || true
+
+          DATASET="${{ inputs.dataset || 'princeton-nlp/SWE-bench_Verified' }}"
+          SPLIT="${{ inputs.split || 'test' }}"
+          N_LIMIT="${{ inputs.n-limit || '0' }}"
+          INSTANCE_IDS="${{ inputs.instance-ids }}"
+          IMAGE_PREFIX="${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}"
+          MAX_WORKERS="${{ inputs.max-workers || '4' }}"
+          BUILD_MODE="${{ inputs.build-mode || 'cli' }}"
+          MAX_RETRIES="${{ inputs.max-retries || '2' }}"
+          BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '10' }}"
+
+          echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV"
+          echo "MAX_RETRIES=${MAX_RETRIES}" >> "$GITHUB_ENV"
+          echo "BUILD_BATCH_SIZE=${BUILD_BATCH_SIZE}" >> "$GITHUB_ENV"
+
+          # Basic BuildKit disk guard similar to SWE-bench
+          if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then
+            LINE=$(tail -n1 /tmp/buildkit_df)
+            TOTAL=$(echo "$LINE" | awk '{print $2}')
+            USED=$(echo "$LINE" | awk '{print $3}')
+            FREE=$(echo "$LINE" | awk '{print $4}')
+            if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then
+              PCT=$(( 100 * USED / TOTAL ))
+              echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes"
+            else
+              echo "Warning: unable to parse df output for /var/lib/buildkit"
+            fi
+          else
+            echo "Warning: /var/lib/buildkit not found; skipping disk check"
+          fi
+
+          ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}" --max-retries "${MAX_RETRIES}" --build-batch-size "${BUILD_BATCH_SIZE}")
+          if [ -n "${INSTANCE_IDS}" ]; then
+            ARGS+=(--instance-ids "${INSTANCE_IDS}")
+          else
+            ARGS+=(--eval-limit "${N_LIMIT}")
+          fi
+
+          echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}"
+          DOCKER_BUILDKIT=1 \
+            BUILDKIT_PROGRESS=plain \
+            BUILDKIT_RESET_ON_FAILURE=1 \
+            PYTHONUNBUFFERED=1 uv run swtbench-build-eval-images "${ARGS[@]}" | tee build_eval_env.log
+
+          echo "Completed prebaked eval env image build at $(date -u)"
+          docker ps -a || true
+          docker system df || true
+
       - name: Archive build logs
         if: always()
         run: |
@@ -199,6 +277,23 @@
           echo "**Successful:** $SUCCESS_COUNT ✅" >> "$GITHUB_STEP_SUMMARY"
           echo "**Failed:** $FAIL_COUNT ❌" >> "$GITHUB_STEP_SUMMARY"

+      - name: Make prebaked eval image package public (best-effort)
+        if: ${{ inputs.build-eval-env == 'true' }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          IMAGE_PREFIX: ${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}
+        run: |
+          set -euo pipefail
+          NAME=$(echo "${IMAGE_PREFIX}" | awk -F/ '{print $NF}')
+          if [ -z "$NAME" ]; then
+            echo "No package name derived from IMAGE_PREFIX=${IMAGE_PREFIX}, skipping visibility update"
+            exit 0
+          fi
+          gh api -X PATCH \
+            -H "Accept: application/vnd.github+json" \
+            /orgs/OpenHands/packages/container/${NAME}/visibility \
+            -f visibility=public || echo "Warning: failed to set package visibility"
+
       - name: Comment on tracker issue
         if: success()
         run: |
diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py
new file mode 100644
index 00000000..079ad66c
--- /dev/null
+++ b/benchmarks/swtbench/build_eval_env_images.py
@@ -0,0 +1,360 @@
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Iterable, Iterator, List, Sequence
+
+import docker
+
+from benchmarks.swtbench.image_utils import ensure_swt_bench_repo
+from benchmarks.utils.dataset import get_dataset
+from benchmarks.utils.image_utils import image_exists as remote_image_exists
+from openhands.sdk import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def select_instance_ids(
+    dataset: str,
+    split: str,
+    eval_limit: int | None,
+    selected_instances_file: str | None,
+    instance_ids: list[str] | None,
+) -> list[str]:
+    """
+    Select the instance IDs that match the inference sampling logic.
+    """
+    if instance_ids:
+        return instance_ids
+
+    df = get_dataset(
+        dataset_name=dataset,
+        split=split,
+        eval_limit=eval_limit,
+        selected_instances_file=selected_instances_file,
+    )
+    ids = df["instance_id"].tolist()
+    if not ids:
+        raise RuntimeError("No instances selected for image build.")
+    logger.info("Selected %s instances for image build", len(ids))
+    return ids
+
+
+def load_exec_specs(
+    swt_bench_dir: Path,
+    dataset: str,
+    split: str,
+    instance_ids: Iterable[str],
+    filter_swt: bool = True,
+) -> list:
+    """
+    Load ExecSpec objects for the provided instance IDs.
+    """
+    sys.path.insert(0, str(swt_bench_dir / "src"))
+    sys.path.insert(0, str(swt_bench_dir))
+    from src.dataset import load_swebench_dataset  # type: ignore[import-not-found]
+    from src.exec_spec import make_exec_spec  # type: ignore[import-not-found]
+
+    cwd = os.getcwd()
+    try:
+        os.chdir(swt_bench_dir)
+        dataset_entries = load_swebench_dataset(
+            name=dataset, split=split, is_swt=False, filter_swt=filter_swt
+        )
+    finally:
+        os.chdir(cwd)
+    by_id = {entry["instance_id"]: entry for entry in dataset_entries}
+
+    specs = []
+    missing = []
+    for iid in instance_ids:
+        if iid not in by_id:
+            missing.append(iid)
+            continue
+        specs.append(make_exec_spec(by_id[iid]))
+
+    if missing:
+        logger.warning(
+            "Skipped %s missing instance_ids not found in dataset: %s",
+            len(missing),
+            ", ".join(missing[:5]),
+        )
+    if not specs:
+        raise RuntimeError("No ExecSpecs available after filtering instance IDs.")
+    return specs
+
+
+def build_env_images(
+    exec_specs: list,
+    max_workers: int,
+    build_mode: str,
+    max_retries: int,
+    batch_size: int,
+    image_prefix: str | None,
+) -> None:
+    """
+    Build base + environment images required by the provided ExecSpecs.
+
+    Images are pushed immediately after each successful build when image_prefix is set,
+    so partial progress is kept if the workflow fails mid-run.
+ """ + from src.docker_build import ( # type: ignore[import-not-found] + BuildImageError, + build_base_images, + build_env_images as build_envs, + ) + + client = docker.from_env() + total_base = len({spec.base_image_key for spec in exec_specs}) + total_env = len({spec.env_image_key for spec in exec_specs}) + remote_prefix = image_prefix.rstrip("/") if image_prefix else None + + base_to_build_keys: set[str] = set() + + def prefixed(tag: str) -> str | None: + return f"{remote_prefix}/{tag}" if remote_prefix else None + + base_spec_by_key = {} + for spec in exec_specs: + key = spec.base_image_key + base_spec_by_key.setdefault(key, spec) + remote_tag = prefixed(key) + + if remote_tag and remote_image_exists(remote_tag): + logger.info("Base image %s already in registry; reusing", remote_tag) + try: + img = client.images.pull(remote_tag) + if remote_tag != key: + img.tag(key) + except Exception as exc: # pragma: no cover - best effort + logger.warning( + "Failed to pull %s (%s); will rebuild locally", remote_tag, exc + ) + base_to_build_keys.add(key) + continue + continue + + base_to_build_keys.add(key) + + missing_base_specs = [base_spec_by_key[k] for k in base_to_build_keys] + skipped_base = total_base - len(base_to_build_keys) + + if missing_base_specs: + logger.info( + "Building %s/%s base images (skipping %s already present)", + len({spec.base_image_key for spec in missing_base_specs}), + total_base, + skipped_base, + ) + build_base_images( + client, missing_base_specs, force_rebuild=False, build_mode=build_mode + ) + base_built = {spec.base_image_key for spec in missing_base_specs} + if image_prefix: + tag_and_push(base_built, image_prefix) + else: + logger.info( + "All %s base images already exist; skipping base builds", total_base + ) + + missing_env_specs: list = [] + + for spec in exec_specs: + key = spec.env_image_key + remote_tag = prefixed(key) + + if remote_tag and remote_image_exists(remote_tag): + logger.info("Env image %s already in registry; skipping build", remote_tag) + continue + + missing_env_specs.append(spec) + + if not missing_env_specs: + logger.info("All %s env images already exist; skipping env builds", total_env) + return + + batches = list(chunked(missing_env_specs, max(1, batch_size))) + logger.info( + "Building %s/%s env images in %s batches (batch_size=%s)", + len({spec.env_image_key for spec in missing_env_specs}), + total_env, + len(batches), + batch_size, + ) + for idx, batch in enumerate(batches, start=1): + attempt = 0 + while True: + try: + logger.info( + "Batch %s/%s: building %s env images", idx, len(batches), len(batch) + ) + build_envs( + client, + batch, + force_rebuild=False, + max_workers=max_workers, + build_mode=build_mode, + ) + if image_prefix: + tag_and_push({spec.env_image_key for spec in batch}, image_prefix) + break + except BuildImageError as exc: + attempt += 1 + if attempt > max_retries: + logger.error( + "Batch %s/%s failed after %s attempts: %s", + idx, + len(batches), + max_retries, + exc, + ) + raise + logger.warning( + "Batch %s/%s failed (attempt %s/%s): %s; retrying", + idx, + len(batches), + attempt, + max_retries, + exc, + ) + return + + +def chunked(seq: Sequence, size: int) -> Iterator[List]: + for i in range(0, len(seq), size): + yield list(seq[i : i + size]) + + +def tag_and_push(images: Iterable[str], prefix: str) -> list[str]: + """ + Tag the provided images with the registry prefix and push them. 
+ """ + pushed: list[str] = [] + prefix = prefix.rstrip("/") + for image in images: + target = f"{prefix}/{image}" + logger.info("Pushing %s -> %s", image, target) + subprocess_run(["docker", "tag", image, target]) + subprocess_run(["docker", "push", target]) + pushed.append(target) + return pushed + + +def subprocess_run(cmd: list[str]) -> None: + import subprocess + + result = subprocess.run(cmd, text=True, capture_output=True) + if result.returncode != 0: + logger.error("Command failed (%s): %s", " ".join(cmd), result.stderr) + raise RuntimeError(f"Command failed: {' '.join(cmd)}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Build and push prebaked SWT-bench eval env images." + ) + parser.add_argument("--dataset", required=True, help="Dataset name") + parser.add_argument("--split", default="test", help="Dataset split") + parser.add_argument( + "--eval-limit", + type=int, + default=1, + help="Match inference sampling by limiting instances (0 to disable)", + ) + parser.add_argument( + "--instance-ids", + default="", + help="Comma-separated instance IDs to force (overrides eval-limit)", + ) + parser.add_argument( + "--selected-instances-file", + default="", + help="Optional selected instances file used during inference", + ) + parser.add_argument( + "--image-prefix", + default="ghcr.io/openhands/swtbench-eval", + help="Registry prefix for pushed images", + ) + parser.add_argument( + "--max-workers", + type=int, + default=4, + help="Parallel builds for env images", + ) + parser.add_argument( + "--max-retries", + type=int, + default=2, + help="Retries per batch for env image builds", + ) + parser.add_argument( + "--build-batch-size", + type=int, + default=10, + help="Number of env images to build per batch", + ) + parser.add_argument( + "--build-mode", + choices=["api", "cli"], + default="cli", + help="swt-bench build mode", + ) + parser.add_argument( + "--no-push", + action="store_true", + help="Build images locally without pushing to the registry", + ) + args = parser.parse_args() + + instance_ids = ( + [iid for iid in args.instance_ids.split(",") if iid] + if args.instance_ids + else None + ) + eval_limit = None if instance_ids else args.eval_limit + selected_file = args.selected_instances_file or None + + swt_bench_dir = ensure_swt_bench_repo() + + target_ids = select_instance_ids( + dataset=args.dataset, + split=args.split, + eval_limit=eval_limit, + selected_instances_file=selected_file, + instance_ids=instance_ids, + ) + exec_specs = load_exec_specs( + swt_bench_dir, args.dataset, args.split, target_ids, filter_swt=True + ) + build_env_images( + exec_specs, + max_workers=args.max_workers, + build_mode=args.build_mode, + max_retries=args.max_retries, + batch_size=args.build_batch_size, + image_prefix=None if args.no_push else args.image_prefix, + ) + + base_images = {spec.base_image_key for spec in exec_specs} + env_images = {spec.env_image_key for spec in exec_specs} + logger.info("Built images: %s base, %s env", len(base_images), len(env_images)) + + manifest = { + "dataset": args.dataset, + "split": args.split, + "instances": target_ids, + "base_images": sorted(base_images), + "env_images": sorted(env_images), + "image_prefix": args.image_prefix, + "arch": "host", + } + print(json.dumps(manifest, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index 7501bf23..9dc7062b 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -18,6 
 from pathlib import Path
 from time import monotonic

+from benchmarks.swtbench.image_utils import (
+    compute_required_images,
+    ensure_swt_bench_repo,
+)
 from benchmarks.utils.laminar import LaminarService
 from benchmarks.utils.patch_utils import remove_files_from_patch
 from benchmarks.utils.report_costs import generate_cost_report
@@ -26,56 +30,7 @@

 logger = get_logger(__name__)

-
-def patch_swt_bench_for_micromamba(swt_bench_dir: Path) -> None:
-    """
-    Ensure the cached swt-bench checkout uses micromamba for env creation.
-    Applies small, idempotent text replacements to the upstream sources.
-    """
-    solver_timeout_s = 600
-    dockerfiles_path = swt_bench_dir / "src" / "dockerfiles.py"
-    exec_spec_path = swt_bench_dir / "src" / "exec_spec.py"
-
-    if not dockerfiles_path.exists() or not exec_spec_path.exists():
-        logger.warning(
-            "swt-bench sources missing expected files; skipping micromamba patch "
-            f"(dockerfiles: {dockerfiles_path.exists()}, exec_spec: {exec_spec_path.exists()})"
-        )
-        return
-
-    dockerfiles_text = dockerfiles_path.read_text()
-    dockerfiles_updated = dockerfiles_text.replace(
-        "RUN conda config --append channels conda-forge\n\nRUN adduser",
-        "RUN conda config --append channels conda-forge\n"
-        "# Use micromamba for faster solver performance during env builds\n"
-        "RUN conda install -n base -c conda-forge -y micromamba \\\n"
-        " && ln -s /opt/miniconda3/bin/micromamba /usr/local/bin/micromamba\n"
-        "ENV MAMBA_ROOT_PREFIX=/opt/miniconda3\n"
-        "ENV MAMBA_EXE=/opt/miniconda3/bin/micromamba\n\n"
-        "RUN adduser",
-    )
-
-    exec_spec_text = exec_spec_path.read_text()
-    replacements = {
-        "conda create -n ": f"timeout {solver_timeout_s}s micromamba create -n ",
-        "conda create -c conda-forge -n ": f"timeout {solver_timeout_s}s micromamba create -c conda-forge -n ",
-        "conda env create --file": f"timeout {solver_timeout_s}s micromamba env create --file",
-        "conda env update -f": f"timeout {solver_timeout_s}s micromamba env update -f",
-        "conda install python=": f"timeout {solver_timeout_s}s micromamba install python=",
-    }
-    for old, new in replacements.items():
-        exec_spec_text = exec_spec_text.replace(old, new)
-
-    if dockerfiles_text != dockerfiles_updated:
-        dockerfiles_path.write_text(dockerfiles_updated)
-        logger.info("Patched swt-bench Dockerfile template to install micromamba.")
-    if exec_spec_path.read_text() != exec_spec_text:
-        exec_spec_path.write_text(exec_spec_text)
-        logger.info(
-            "Patched swt-bench exec_spec to create/update envs with micromamba "
-            "and a %ss timeout on solver calls.",
-            solver_timeout_s,
-        )
+PREBAKED_REGISTRY = "ghcr.io/openhands/swtbench-eval"


 def _load_prediction_instance_ids(predictions_file: Path) -> list[str]:
@@ -109,6 +64,60 @@ def _load_prediction_instance_ids(predictions_file: Path) -> list[str]:
     return instance_ids


+def try_pull_prebaked_images(
+    predictions_file: Path,
+    dataset: str,
+    split: str = "test",
+    registry: str = PREBAKED_REGISTRY,
+) -> None:
+    """
+    Best-effort pull of prebaked base/env images; no-op on failure.
+ """ + try: + base_images, env_images = compute_required_images( + predictions_file, + dataset, + split, + ) + except Exception as exc: # pragma: no cover - defensive + logger.warning("Skipping prebaked image pull (compute failed): %s", exc) + return + + tags = sorted(base_images | env_images) + if not tags: + logger.info("No prebaked images to pull (empty tag set)") + return + + registry = registry.rstrip("/") + for tag in tags: + remote = f"{registry}/{tag}" + logger.info("Attempting to pull prebaked image %s", remote) + try: + pull = subprocess.run( + ["docker", "pull", remote], + capture_output=True, + text=True, + ) + except FileNotFoundError: + logger.warning("Docker not available; skipping prebaked image pull") + return + + if pull.returncode != 0: + logger.warning("Failed to pull %s: %s", remote, pull.stderr.strip()) + continue + + # Tag the remote image with the local name expected by the harness. + tag_res = subprocess.run( + ["docker", "tag", remote, tag], + capture_output=True, + text=True, + ) + if tag_res.returncode != 0: + logger.warning("Failed to tag %s as %s: %s", remote, tag, tag_res.stderr) + else: + logger.info("Pulled and tagged %s -> %s", remote, tag) + + def update_report_with_submitted_instances( report_path: Path, predictions_path: Path ) -> None: @@ -242,32 +251,12 @@ def run_swtbench_evaluation( dataset: SWT-Bench dataset to evaluate against workers: Number of workers to use for evaluation """ - logger.info(f"Running SWT-Bench evaluation on {predictions_file}") + use_legacy = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() in ("1", "true", "yes") + mode = "legacy-conda" if use_legacy else "prebaked-images" + logger.info("Running SWT-Bench evaluation on %s (mode=%s)", predictions_file, mode) try: - # Use a global cache directory for SWT-Bench source - cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench" - swt_bench_dir = cache_dir / "swt-bench" - - # Clone SWT-Bench repository if it doesn't exist - if not swt_bench_dir.exists(): - logger.info("Setting up SWT-Bench source in global cache...") - cache_dir.mkdir(parents=True, exist_ok=True) - - logger.info("Cloning SWT-Bench repository...") - clone_cmd = [ - "git", - "clone", - "https://github.com/logic-star-ai/swt-bench.git", - str(swt_bench_dir), - ] - result = subprocess.run(clone_cmd, text=True) - if result.returncode != 0: - raise subprocess.CalledProcessError(result.returncode, clone_cmd) - - logger.info(f"SWT-Bench source installed at {swt_bench_dir}") - - patch_swt_bench_for_micromamba(swt_bench_dir) + swt_bench_dir = ensure_swt_bench_repo() # Get the directory and filename of the predictions file predictions_path = Path(predictions_file).resolve() @@ -424,6 +413,23 @@ def main() -> None: # Convert format convert_to_swtbench_format(str(input_file), str(output_file), args.model_name) + # Default: use prebaked images; SWTbenCH_FORCE_CONDA opts into legacy flow. 
+    use_prebaked = os.getenv("SWTBENCH_FORCE_CONDA", "").lower() not in (
+        "1",
+        "true",
+        "yes",
+    )
+    if use_prebaked:
+        try_pull_prebaked_images(
+            output_file,
+            args.dataset,
+        )
+    else:
+        logger.info(
+            "SWTBENCH_FORCE_CONDA set; skipping prebaked image pull "
+            "and using legacy (pre-mamba) evaluation flow"
+        )
+
     if not args.skip_evaluation:
         eval_phase_start = monotonic()
         # Run evaluation
diff --git a/benchmarks/swtbench/image_utils.py b/benchmarks/swtbench/image_utils.py
new file mode 100644
index 00000000..e7aae1f4
--- /dev/null
+++ b/benchmarks/swtbench/image_utils.py
@@ -0,0 +1,162 @@
+from __future__ import annotations
+
+import json
+import logging
+import subprocess
+import sys
+from pathlib import Path
+from typing import Iterable
+
+from openhands.sdk import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def ensure_swt_bench_repo(cache_dir: Path | None = None) -> Path:
+    """
+    Ensure the SWT-bench sources are available locally.
+
+    Returns the repository path under the cache directory.
+    """
+    cache_dir = cache_dir or Path.home() / ".cache" / "openhands" / "swt-bench"
+    swt_bench_dir = cache_dir / "swt-bench"
+
+    if swt_bench_dir.exists():
+        return swt_bench_dir
+
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    logger.info("Cloning SWT-Bench repository into %s", swt_bench_dir)
+    result = subprocess.run(
+        [
+            "git",
+            "clone",
+            "https://github.com/logic-star-ai/swt-bench.git",
+            str(swt_bench_dir),
+        ],
+        text=True,
+        capture_output=True,
+    )
+    if result.returncode != 0:
+        logger.error("Failed to clone swt-bench: %s", result.stderr)
+        raise RuntimeError("Unable to clone swt-bench repository")
+
+    return swt_bench_dir
+
+
+def _load_instance_ids(output_jsonl: Path) -> list[str]:
+    instance_ids: list[str] = []
+    seen = set()
+    with output_jsonl.open("r", encoding="utf-8") as infile:
+        for line_num, line in enumerate(infile, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+            except json.JSONDecodeError:
+                logger.debug("Skipping invalid JSON on line %s", line_num)
+                continue
+            instance_id = data.get("instance_id")
+            if not instance_id or instance_id in seen:
+                continue
+            seen.add(instance_id)
+            instance_ids.append(instance_id)
+    return instance_ids
+
+
+def compute_required_images(
+    output_jsonl: Path,
+    dataset: str,
+    split: str,
+) -> tuple[set[str], set[str]]:
+    """
+    Compute the base/env image tags required to evaluate the given predictions file.
+
+    Returns (base_image_tags, env_image_tags).
+    """
+    instance_ids = _load_instance_ids(output_jsonl)
+    if not instance_ids:
+        raise ValueError(f"No instance_ids found in {output_jsonl}")
+
+    swt_bench_dir = ensure_swt_bench_repo()
+    sys.path.insert(0, str(swt_bench_dir / "src"))
+    sys.path.insert(0, str(swt_bench_dir))
+
+    # Delay import until after sys.path manipulation so we use the cached checkout.
+    from src.dataset import load_swebench_dataset  # type: ignore[import-not-found]
+    from src.exec_spec import make_exec_spec  # type: ignore[import-not-found]
+
+    dataset_entries = load_swebench_dataset(
+        name=dataset, split=split, is_swt=True, filter_swt=True
+    )
+    entries_by_id = {entry["instance_id"]: entry for entry in dataset_entries}
+
+    missing = [iid for iid in instance_ids if iid not in entries_by_id]
+    if missing:
+        logger.warning(
+            "Predictions reference %s instance_ids not present in dataset: %s",
+            len(missing),
+            ", ".join(missing[:5]),
+        )
+
+    specs = [
+        make_exec_spec(entries_by_id[iid])
+        for iid in instance_ids
+        if iid in entries_by_id
+    ]
+    if not specs:
+        raise RuntimeError("No ExecSpecs produced; cannot compute required images.")
+
+    base_images = {spec.base_image_key for spec in specs}
+    env_images = {spec.env_image_key for spec in specs}
+    logger.info(
+        "Computed %s base images and %s env images for %s instances",
+        len(base_images),
+        len(env_images),
+        len(specs),
+    )
+    return base_images, env_images
+
+
+def format_images_plain(images: Iterable[str]) -> str:
+    return "\n".join(sorted(images))
+
+
+def main() -> None:
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="List SWT-bench base/env images required for a predictions file."
+    )
+    parser.add_argument("output_jsonl", type=Path, help="Path to output.jsonl")
+    parser.add_argument("--dataset", required=True, help="Dataset name")
+    parser.add_argument("--split", default="test", help="Dataset split")
+    parser.add_argument(
+        "--format",
+        choices=["plain", "json"],
+        default="plain",
+        help="Output format",
+    )
+    args = parser.parse_args()
+
+    base_images, env_images = compute_required_images(
+        args.output_jsonl,
+        args.dataset,
+        args.split,
+    )
+    payload = {
+        "base": sorted(base_images),
+        "env": sorted(env_images),
+    }
+
+    if args.format == "json":
+        print(json.dumps(payload))
+    else:
+        print(format_images_plain(payload["base"] + payload["env"]))
+
+
+if __name__ == "__main__":
+    # Configure root logging for ad-hoc usage
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 8c842312..f65dfaf3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,8 @@ swebench-infer = "benchmarks.swebench.run_infer:main"
 swtbench-infer = "benchmarks.swtbench.run_infer:main"
 swebench-eval = "benchmarks.swebench.eval_infer:main"
 swtbench-eval = "benchmarks.swtbench.eval_infer:main"
+swtbench-list-images = "benchmarks.swtbench.image_utils:main"
+swtbench-build-eval-images = "benchmarks.swtbench.build_eval_env_images:main"
 gaia-infer = "benchmarks.gaia.run_infer:main"
 gaia-eval = "benchmarks.gaia.eval_infer:main"
 commit0-infer = "benchmarks.commit0.run_infer:main"
diff --git a/uv.lock b/uv.lock
index e7351742..7b04cc65 100644
--- a/uv.lock
+++ b/uv.lock
@@ -947,11 +947,11 @@ wheels = [

 [[package]]
 name = "filelock"
-version = "3.19.1"
+version = "3.20.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1d/65/ce7f1b70157833bf3cb851b556a37d4547ceafc158aa9b34b36782f23696/filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1", size = 19485, upload-time = "2026-01-09T17:55:05.421Z" }
size = 19485, upload-time = "2026-01-09T17:55:05.421Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, + { url = "https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1", size = 16701, upload-time = "2026-01-09T17:55:04.334Z" }, ] [[package]] @@ -1678,11 +1678,11 @@ wheels = [ [[package]] name = "libtmux" -version = "0.46.2" +version = "0.53.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9c/aa/7e1dcaa097156d6f3a7d8669be4389dced997feeb81744e3ff4681d65ee8/libtmux-0.46.2.tar.gz", hash = "sha256:9a398fec5d714129c8344555d466e1a903dfc0f741ba07aabe75a8ceb25c5dda", size = 346887, upload-time = "2025-05-26T19:40:04.096Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/28/e2b252817cb181aec2f42fe2d1d7fac5ec9c4d15bfb2b8ea4bd1179e4244/libtmux-0.53.0.tar.gz", hash = "sha256:1d19af4cea0c19543954d7e7317c7025c0739b029cccbe3b843212fae238f1bd", size = 405001, upload-time = "2025-12-14T11:59:11.337Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d6/2f/9d207039fcfa00d3b30e4d765f062fbcc42c873c7518a8cfebb3eafd00e0/libtmux-0.46.2-py3-none-any.whl", hash = "sha256:6c32dbf22bde8e5e33b2714a4295f6e838dc640f337cd4c085a044f6828c7793", size = 60873, upload-time = "2025-05-26T19:40:02.284Z" }, + { url = "https://files.pythonhosted.org/packages/0e/d0/2e8bc5caa639ebb9f8801ba0be7070a28d48d8ed60e2a428d40f71fb88b8/libtmux-0.53.0-py3-none-any.whl", hash = "sha256:024b7ae6a12aae55358e8feb914c8632b3ab9bd61c0987c53559643c6a58ee4f", size = 77582, upload-time = "2025-12-14T11:59:09.739Z" }, ] [[package]] @@ -2269,7 +2269,7 @@ wheels = [ [[package]] name = "openhands-agent-server" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-agent-server" } dependencies = [ { name = "aiosqlite" }, @@ -2407,11 +2407,12 @@ dev = [ [[package]] name = "openhands-sdk" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-sdk" } dependencies = [ { name = "deprecation" }, { name = "fastmcp" }, + { name = "filelock" }, { name = "httpx" }, { name = "litellm" }, { name = "lmnr" }, @@ -2432,10 +2433,11 @@ requires-dist = [ { name = "boto3", marker = "extra == 'boto3'", specifier = ">=1.35.0" }, { name = "deprecation", specifier = ">=2.1.0" }, { name = "fastmcp", specifier = ">=2.11.3" }, + { name = "filelock", specifier = ">=3.20.1" }, { name = "httpx", specifier = ">=0.27.0" }, { name = "litellm", specifier = ">=1.80.10" }, { name = "lmnr", specifier = ">=0.7.24" }, - { name = "pydantic", specifier = ">=2.11.7" }, + { name = "pydantic", specifier = ">=2.12.5" }, { name = "python-frontmatter", specifier = ">=1.1.0" }, { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "tenacity", specifier = ">=9.1.2" }, @@ -2445,7 +2447,7 @@ provides-extras = ["boto3"] [[package]] name = "openhands-tools" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-tools" } dependencies = [ { name = "bashlex" }, @@ -2466,7 +2468,7 @@ requires-dist = [ { name = "browser-use", specifier = ">=0.8.0" }, { name = "cachetools" 
}, { name = "func-timeout", specifier = ">=4.3.5" }, - { name = "libtmux", specifier = ">=0.46.2" }, + { name = "libtmux", specifier = ">=0.53.0" }, { name = "openhands-sdk", editable = "vendor/software-agent-sdk/openhands-sdk" }, { name = "pydantic", specifier = ">=2.11.7" }, { name = "tom-swe", specifier = ">=1.0.3" }, @@ -2474,7 +2476,7 @@ requires-dist = [ [[package]] name = "openhands-workspace" -version = "1.7.2" +version = "1.8.1" source = { editable = "vendor/software-agent-sdk/openhands-workspace" } dependencies = [ { name = "openhands-agent-server" },