From 070311c72a26186274de7bfe00d9f07c2aaba1e3 Mon Sep 17 00:00:00 2001
From: Carlos <carlos@sanz.dev>
Date: Mon, 29 Dec 2025 22:14:36 +0100
Subject: [PATCH 1/2] fix: Remove blocking wait for GCP VM deletion in
 progress_reporter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The progress_reporter endpoint was timing out (502 errors) when tests
completed because it synchronously waited for GCP VM deletion, which
can take 60+ seconds. This exceeded nginx's default 60s proxy timeout.

The fix makes VM deletion fire-and-forget:
- Initiate deletion but don't block waiting for completion
- The VM will be deleted eventually - we don't need confirmation
- All critical work (test results, GitHub status) completes first
- Log the VM name and operation name when a deletion is initiated

Investigation on production server showed:
- 11% of webhook requests returned 502 errors (11 out of 100 today)
- nginx error logs showed "upstream timed out" for /progress-reporter/
- Pattern: application logs showed a ~60-70s gap between the
  "Test completed" and "Test <completed>" entries, which matches the
  time spent blocked in wait_for_operation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 mod_ci/controllers.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py
index 051c68ae..530e3844 100755
--- a/mod_ci/controllers.py
+++ b/mod_ci/controllers.py
@@ -1960,14 +1960,18 @@ def update_final_status():
         log.error(f"Test {test_id}: Failed to update final GitHub status after retries: {e}")
 
     if status in [TestStatus.completed, TestStatus.canceled]:
-        # Delete the current instance
+        # Delete the current instance (fire-and-forget)
+        # We intentionally don't wait for the deletion to complete because:
+        # 1. Waiting can take 60+ seconds, exceeding nginx/gunicorn timeouts (502 errors)
+        # 2. The deletion will complete eventually - we don't need confirmation
+        # 3. All important work (test results, GitHub status) is already done
         from run import config
         compute = get_compute_service_object()
         zone = config.get('ZONE', '')
         project = config.get('PROJECT_NAME', '')
         vm_name = f"{test.platform.value}-{test.id}"
         operation = delete_instance(compute, project, zone, vm_name)
-        wait_for_operation(compute, project, zone, operation['name'])
+        log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (operation: {operation.get('name', 'unknown')})")
 
     # If status is complete, remove the GCP Instance entry
     if status in [TestStatus.completed, TestStatus.canceled]:

From 1c281a5003ca7b0c2b50046986edcec092f32685 Mon Sep 17 00:00:00 2001
From: Carlos <carlos@sanz.dev>
Date: Mon, 29 Dec 2025 22:20:54 +0100
Subject: [PATCH 2/2] style: Fix line too long in log message
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extract the operation name into a local variable and shorten the log
label from "operation:" to "op:" so the statement complies with the
120 char line limit.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 mod_ci/controllers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py
index 530e3844..2066b87e 100755
--- a/mod_ci/controllers.py
+++ b/mod_ci/controllers.py
@@ -1971,7 +1971,8 @@ def update_final_status():
         project = config.get('PROJECT_NAME', '')
         vm_name = f"{test.platform.value}-{test.id}"
         operation = delete_instance(compute, project, zone, vm_name)
-        log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (operation: {operation.get('name', 'unknown')})")
+        op_name = operation.get('name', 'unknown')
+        log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (op: {op_name})")
 
     # If status is complete, remove the GCP Instance entry
     if status in [TestStatus.completed, TestStatus.canceled]: