From 070311c72a26186274de7bfe00d9f07c2aaba1e3 Mon Sep 17 00:00:00 2001 From: Carlos Date: Mon, 29 Dec 2025 22:14:36 +0100 Subject: [PATCH 1/2] fix: Remove blocking wait for GCP VM deletion in progress_reporter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The progress_reporter endpoint was timing out (502 errors) when tests completed because it synchronously waited for GCP VM deletion, which can take 60+ seconds. This exceeded nginx's default 60s proxy timeout. The fix makes VM deletion fire-and-forget: - Initiate deletion but don't block waiting for completion - The VM will be deleted eventually - we don't need confirmation - All critical work (test results, GitHub status) completes first - Added logging to track initiated deletions Investigation on production server showed: - 11% of webhook requests returned 502 errors (11 out of 100 today) - nginx error logs showed "upstream timed out" for /progress-reporter/ - Pattern: Test completion logs showed a ~60-70s gap between the "Test completed" entry and the next "Test ..." entry, caused by wait_for_operation blocking 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- mod_ci/controllers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py index 051c68ae..530e3844 100755 --- a/mod_ci/controllers.py +++ b/mod_ci/controllers.py @@ -1960,14 +1960,18 @@ def update_final_status(): log.error(f"Test {test_id}: Failed to update final GitHub status after retries: {e}") if status in [TestStatus.completed, TestStatus.canceled]: - # Delete the current instance + # Delete the current instance (fire-and-forget) + # We intentionally don't wait for the deletion to complete because: + # 1. Waiting can take 60+ seconds, exceeding nginx/gunicorn timeouts (502 errors) + # 2. The deletion will complete eventually - we don't need confirmation + # 3. 
All important work (test results, GitHub status) is already done from run import config compute = get_compute_service_object() zone = config.get('ZONE', '') project = config.get('PROJECT_NAME', '') vm_name = f"{test.platform.value}-{test.id}" operation = delete_instance(compute, project, zone, vm_name) - wait_for_operation(compute, project, zone, operation['name']) + log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (operation: {operation.get('name', 'unknown')})") # If status is complete, remove the GCP Instance entry if status in [TestStatus.completed, TestStatus.canceled]: From 1c281a5003ca7b0c2b50046986edcec092f32685 Mon Sep 17 00:00:00 2001 From: Carlos Date: Mon, 29 Dec 2025 22:20:54 +0100 Subject: [PATCH 2/2] style: Fix line too long in log message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Break long log line into two lines to comply with 120 char limit. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- mod_ci/controllers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py index 530e3844..2066b87e 100755 --- a/mod_ci/controllers.py +++ b/mod_ci/controllers.py @@ -1971,7 +1971,8 @@ def update_final_status(): project = config.get('PROJECT_NAME', '') vm_name = f"{test.platform.value}-{test.id}" operation = delete_instance(compute, project, zone, vm_name) - log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (operation: {operation.get('name', 'unknown')})") + op_name = operation.get('name', 'unknown') + log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (op: {op_name})") # If status is complete, remove the GCP Instance entry if status in [TestStatus.completed, TestStatus.canceled]: