Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b6ed901
Add prebaked SWT-bench eval image build support
simonrosenberg Jan 14, 2026
f00bc1e
Add arch override for SWT-bench eval image builds
simonrosenberg Jan 14, 2026
ee1b5a6
Let swtbench workflow build prebaked eval env images
simonrosenberg Jan 14, 2026
c0d1432
Expose prebaked eval images publicly in swtbench workflow
simonrosenberg Jan 14, 2026
ce669ad
Make micromamba patch optional for eval image builds
simonrosenberg Jan 14, 2026
e523d95
Drop micromamba fallback in eval image build
simonrosenberg Jan 15, 2026
655d2ce
Remove redundant eval-only workflow
simonrosenberg Jan 15, 2026
cfab9a9
Add verbose diagnostics around eval image build step
simonrosenberg Jan 15, 2026
154a159
Use buildx/cli path for eval images and add runner diagnostics
simonrosenberg Jan 15, 2026
6380c41
Add batching/retries and buildx settings to SWT-bench eval builds
simonrosenberg Jan 15, 2026
328ef48
Remove local platform override; keep buildx batching/retries
simonrosenberg Jan 15, 2026
5102770
Skip rebuilding existing swtbench images
simonrosenberg Jan 15, 2026
9610efd
Reuse remote SWT-bench images and fix dataset cwd
simonrosenberg Jan 15, 2026
0e1ddce
Only build/push when registry missing
simonrosenberg Jan 15, 2026
7a0f182
Always pull remote base images; drop local fallback
simonrosenberg Jan 15, 2026
a323729
Drop micromamba patch; fail if prebaked images missing
simonrosenberg Jan 15, 2026
8ac449d
Format with pre-commit
simonrosenberg Jan 15, 2026
1f7248a
Push eval images as they are built
simonrosenberg Jan 15, 2026
fd45583
Merge branch 'main' into swtbench-prebaked-eval
simonrosenberg Jan 16, 2026
d10418c
Fallback to micromamba when prebaked swtbench eval images missing
simonrosenberg Jan 16, 2026
ab15ea7
Revert "Fallback to micromamba when prebaked swtbench eval images missing"
simonrosenberg Jan 16, 2026
af6d559
Remove unused micromamba patching from swtbench eval
simonrosenberg Jan 16, 2026
1531ce5
Remove unused eval arch override path for swtbench prebaked images
simonrosenberg Jan 16, 2026
f1d114e
Strip top-level files from SWTBench patches
simonrosenberg Jan 16, 2026
1c5c599
Enable is_swt flag in swtbench eval
simonrosenberg Jan 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions .github/workflows/build-swtbench-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,26 @@ on:
description: 'Software Agent SDK commit/ref to use'
required: true
type: string
# Opt-in switch for the prebaked eval-env work. The "Build prebaked eval env
# images" and "Make prebaked eval image package public" steps both compare
# this value against the string 'true', so it is declared as a string.
build-eval-env:
description: 'Also build prebaked SWT-bench eval env images (default: false)'
required: false
default: 'false'
type: string
# Registry prefix forwarded to the build command as --image-prefix; the text
# after its last "/" is also used as the package name in the visibility step.
eval-image-prefix:
description: 'Registry prefix for prebaked eval images'
required: false
default: 'ghcr.io/openhands/swtbench-eval'
type: string
# Forwarded to the build command as --max-retries (string-typed number).
max-retries:
description: 'Retries per batch for eval env builds'
required: false
default: '2'
type: string
# Forwarded to the build command as --build-batch-size (string-typed number).
build-batch-size:
description: 'Env images per batch for eval env builds'
required: false
default: '10'
type: string

concurrency:
group: build-swt-bench-${{ github.ref }}
Expand Down Expand Up @@ -158,6 +178,64 @@ jobs:
DOCKER_BUILDKIT: 1
BUILDKIT_PROGRESS: plain

- name: Build prebaked eval env images
  if: ${{ inputs.build-eval-env == 'true' }}
  # Workflow inputs reach the script as environment variables instead of
  # being interpolated into the script text, so a value containing quotes or
  # shell metacharacters cannot change the commands that are executed.
  env:
    DATASET: ${{ inputs.dataset || 'princeton-nlp/SWE-bench_Verified' }}
    SPLIT: ${{ inputs.split || 'test' }}
    N_LIMIT: ${{ inputs.n-limit || '0' }}
    INSTANCE_IDS: ${{ inputs.instance-ids }}
    IMAGE_PREFIX: ${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}
    MAX_WORKERS: ${{ inputs.max-workers || '4' }}
    BUILD_MODE: ${{ inputs.build-mode || 'cli' }}
    MAX_RETRIES: ${{ inputs.max-retries || '2' }}
    BUILD_BATCH_SIZE: ${{ inputs.build-batch-size || '10' }}
  run: |
    set -euo pipefail

    # Runner diagnostics; the docker calls are allowed to fail.
    echo "Starting prebaked eval env image build at $(date -u)"
    echo "Runner: $(uname -a)"
    df -h
    docker system df || true
    docker info || true

    # Record the effective settings in GITHUB_ENV.
    echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV"
    echo "MAX_RETRIES=${MAX_RETRIES}" >> "$GITHUB_ENV"
    echo "BUILD_BATCH_SIZE=${BUILD_BATCH_SIZE}" >> "$GITHUB_ENV"

    # Basic BuildKit disk guard similar to SWE-bench (reporting only).
    if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then
      LINE=$(tail -n1 /tmp/buildkit_df)
      TOTAL=$(echo "$LINE" | awk '{print $2}')
      USED=$(echo "$LINE" | awk '{print $3}')
      FREE=$(echo "$LINE" | awk '{print $4}')
      # Do the percentage arithmetic only on integers with a non-zero total:
      # under `set -e` a division by zero or a non-numeric operand would
      # abort the step instead of falling through to the warning below.
      if [[ "$TOTAL" =~ ^[0-9]+$ && "$USED" =~ ^[0-9]+$ && -n "$FREE" && "$TOTAL" -gt 0 ]]; then
        PCT=$(( 100 * USED / TOTAL ))
        echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes"
      else
        echo "Warning: unable to parse df output for /var/lib/buildkit"
      fi
    else
      echo "Warning: /var/lib/buildkit not found; skipping disk check"
    fi

    # Explicit instance ids take precedence over the eval limit.
    ARGS=(--dataset "${DATASET}" --split "${SPLIT}" --image-prefix "${IMAGE_PREFIX}" --max-workers "${MAX_WORKERS}" --build-mode "${BUILD_MODE}" --max-retries "${MAX_RETRIES}" --build-batch-size "${BUILD_BATCH_SIZE}")
    if [ -n "${INSTANCE_IDS}" ]; then
      ARGS+=(--instance-ids "${INSTANCE_IDS}")
    else
      ARGS+=(--eval-limit "${N_LIMIT}")
    fi

    # pipefail (set above) makes a build failure fail the step even though
    # the output is piped through tee.
    echo "Running prebaked eval image build: uv run swtbench-build-eval-images ${ARGS[*]}"
    DOCKER_BUILDKIT=1 \
    BUILDKIT_PROGRESS=plain \
    BUILDKIT_RESET_ON_FAILURE=1 \
    PYTHONUNBUFFERED=1 uv run swtbench-build-eval-images "${ARGS[@]}" | tee build_eval_env.log

    echo "Completed prebaked eval env image build at $(date -u)"
    docker ps -a || true
    docker system df || true

- name: Archive build logs
if: always()
run: |
Expand Down Expand Up @@ -199,6 +277,23 @@ jobs:
echo "**Successful:** $SUCCESS_COUNT ✅" >> "$GITHUB_STEP_SUMMARY"
echo "**Failed:** $FAIL_COUNT ❌" >> "$GITHUB_STEP_SUMMARY"

- name: Make prebaked eval image package public (best-effort)
  if: ${{ inputs.build-eval-env == 'true' }}
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    IMAGE_PREFIX: ${{ inputs.eval-image-prefix || 'ghcr.io/openhands/swtbench-eval' }}
  run: |
    set -euo pipefail
    # The package name is whatever follows the last "/" of the prefix.
    NAME="${IMAGE_PREFIX##*/}"
    if [ -n "$NAME" ]; then
      # Best-effort: a failed API call only logs a warning and the step
      # still succeeds.
      if ! gh api -X PATCH \
        -H "Accept: application/vnd.github+json" \
        /orgs/OpenHands/packages/container/${NAME}/visibility \
        -f visibility=public; then
        echo "Warning: failed to set package visibility"
      fi
    else
      echo "No package name derived from IMAGE_PREFIX=${IMAGE_PREFIX}, skipping visibility update"
    fi

- name: Comment on tracker issue
if: success()
run: |
Expand Down
Loading