From 0e2be0c3f9a6109aa4e3e997e4792d24cc2d6ef6 Mon Sep 17 00:00:00 2001 From: Carlos Date: Tue, 30 Dec 2025 09:59:58 +0100 Subject: [PATCH 1/2] fix: Remove remaining blocking wait_for_operation calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR removes the last two blocking wait_for_operation calls that were causing gunicorn workers to be blocked for extended periods: 1. delete_expired_instances() - VM deletion is now fire-and-forget 2. start_test() - VM creation is now optimistic (recorded immediately) These blocking calls were causing 504 timeouts on webhook deliveries because GitHub has a 10-second webhook timeout. When cron jobs were running wait_for_operation (which can take up to 30 minutes), all gunicorn workers could become blocked, causing webhook requests to queue and exceed the timeout. Changes: - delete_expired_instances: Remove blocking wait, log operation initiation - start_test: Check for immediate errors, then record instance optimistically - Tests: Remove unused wait_for_operation mocks from affected tests If VM creation ultimately fails, the test won't report progress and will be cleaned up by the expired instances cron job. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- mod_ci/controllers.py | 34 +++++++++++++++++--------- tests/test_ci/test_controllers.py | 40 +++++++++++-------------------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py index 2066b87e..24997472 100755 --- a/mod_ci/controllers.py +++ b/mod_ci/controllers.py @@ -463,9 +463,12 @@ def delete_expired_instances(compute, max_runtime, project, zone, db, repository if gh_commit is not None: update_status_on_github(gh_commit, Status.ERROR, message, f"CI - {platform_name}") - # Delete VM instance + # Delete VM instance (fire-and-forget to avoid blocking workers) + # The deletion will complete eventually - we don't need confirmation operation = delete_instance(compute, project, zone, vm_name) - wait_for_operation(compute, project, zone, operation['name']) + from run import log + op_name = operation.get('name', 'unknown') + log.info(f"Expired instance deletion initiated for {vm_name} (op: {op_name})") def gcp_instance(app, db, platform, repository, delay) -> None: @@ -902,16 +905,25 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to zone = config.get('ZONE', '') project_id = config.get('PROJECT_NAME', '') operation = create_instance(compute, project_id, zone, test, full_url) - result = wait_for_operation(compute, project_id, zone, operation['name']) - # Check if result indicates success (result is a dict with no 'error' key) - if isinstance(result, dict) and 'error' not in result: - db.add(status) - if not safe_db_commit(db, f"recording GCP instance for test {test.id}"): - log.error(f"Failed to record GCP instance for test {test.id}, but VM was created") - else: - error_msg = parse_gcp_error(result) - log.error(f"Error creating test instance for test {test.id}, result: {result}") + + # Check if the create_instance call itself returned an error (synchronous failure) + if 'error' in 
operation: + error_msg = parse_gcp_error(operation) + log.error(f"Error creating test instance for test {test.id}, result: {operation}") mark_test_failed(db, test, repository, error_msg) + return + + # VM creation request was accepted - record the instance optimistically + # We don't wait for the operation to complete because: + # 1. Waiting can take 60+ seconds, blocking gunicorn workers + # 2. If VM creation ultimately fails, the test won't report progress + # and will be cleaned up by the expired instances cron job + op_name = operation.get('name', 'unknown') + log.info(f"Test {test.id}: VM creation initiated (op: {op_name})") + + db.add(status) + if not safe_db_commit(db, f"recording GCP instance for test {test.id}"): + log.error(f"Failed to record GCP instance for test {test.id}, but VM creation was initiated") def create_instance(compute, project, zone, test, reportURL) -> Dict: diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py index 63a38332..ac48ed0a 100644 --- a/tests/test_ci/test_controllers.py +++ b/tests/test_ci/test_controllers.py @@ -215,14 +215,13 @@ def test_cron_job_empty_token(self, mock_log): cron() mock_log.error.assert_called_with('GITHUB_TOKEN not configured, cannot run CI cron') - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @mock.patch('mod_ci.controllers.TestProgress') @mock.patch('mod_ci.controllers.GcpInstance') def test_start_test(self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, - mock_create_instance, mock_wait_for_operation): + mock_create_instance): """Test start_test function.""" import zipfile @@ -268,21 +267,21 @@ def extractall(*args, **kwargs): mock_query = create_mock_db_query(mock_g) mock_query.c.got = MagicMock() - # Test when gcp create instance fails - mock_wait_for_operation.return_value = {'status': 'DONE', 'error': 
{'errors': [{'code': 'TEST_ERROR'}]}} + # Test when gcp create instance fails synchronously (error in operation response) + mock_create_instance.return_value = {'error': {'errors': [{'code': 'TEST_ERROR'}]}} start_test(mock.ANY, self.app, mock_g.db, repository, test, mock.ANY) # Commit IS called to record the test failure in the database mock_g.db.commit.assert_called_once() mock_g.db.commit.reset_mock() mock_create_instance.reset_mock() - mock_wait_for_operation.reset_mock() - # Test when gcp create instance is successful - mock_wait_for_operation.return_value = {'status': 'DONE'} + # Test when gcp create instance is successful (no error in operation response) + # Note: We no longer wait for the operation to complete - we record the instance + # optimistically and let the expired instances cron handle failures + mock_create_instance.return_value = {'name': 'test-operation-123', 'status': 'RUNNING'} start_test(mock.ANY, self.app, mock_g.db, repository, test, mock.ANY) mock_g.db.commit.assert_called_once() mock_create_instance.assert_called_once() - mock_wait_for_operation.assert_called_once() @mock.patch('github.Github.get_repo') @mock.patch('mod_ci.controllers.start_test') @@ -1479,14 +1478,13 @@ def test_add_test_entry_db_commit_failure( mock_safe_commit.assert_called_once() mock_log.error.assert_called() - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.delete_instance') @mock.patch('mod_ci.controllers.safe_db_commit') @mock.patch('mod_ci.controllers.is_instance_testing') @mock.patch('run.log') def test_delete_expired_instances_db_commit_failure( self, mock_log, mock_is_testing, mock_safe_commit, - mock_delete, mock_wait): + mock_delete): """Test delete_expired_instances handles db commit failure.""" from mod_ci.controllers import delete_expired_instances @@ -1769,13 +1767,12 @@ def test_equality_type_request_rto_none(self, mock_rto): mock_rto.query.filter.assert_called_once_with(mock_rto.id == 1) 
mock_log.info.assert_called_once() - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.delete_instance') @mock.patch('mod_ci.controllers.get_compute_service_object') @mock.patch('mod_ci.controllers.update_build_badge') @mock.patch('github.Github.get_repo') def test_progress_type_request(self, mock_repo, mock_update_build_badge, mock_get_compute_service_object, - mock_delete_instance, mock_wait_for_operation): + mock_delete_instance): """Test progress_type_request function.""" from mod_ci.models import GcpInstance from run import log @@ -2113,7 +2110,6 @@ def test_gcp_instance_unexpected_exception(self, mock_log, mock_g, mock_get_comp # Should log error and continue mock_log.error.assert_called() - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @@ -2122,8 +2118,7 @@ def test_gcp_instance_unexpected_exception(self, mock_log, mock_g, mock_get_comp @mock.patch('run.log') def test_start_test_duplicate_instance_check( self, mock_log, mock_gcp_instance, mock_test_progress, - mock_g, mock_open_file, mock_create_instance, - mock_wait_for_operation): + mock_g, mock_open_file, mock_create_instance): """Test start_test skips if GCP instance already exists for test.""" from mod_ci.controllers import start_test @@ -2140,7 +2135,6 @@ def test_start_test_duplicate_instance_check( mock_log.warning.assert_called() mock_create_instance.assert_not_called() - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @@ -2149,8 +2143,7 @@ def test_start_test_duplicate_instance_check( @mock.patch('run.log') def test_start_test_duplicate_progress_check( self, mock_log, mock_gcp_instance, mock_test_progress, - mock_g, mock_open_file, mock_create_instance, - 
mock_wait_for_operation): + mock_g, mock_open_file, mock_create_instance): """Test start_test skips if test already has progress entries.""" from mod_ci.controllers import start_test @@ -2170,7 +2163,6 @@ def test_start_test_duplicate_progress_check( mock_create_instance.assert_not_called() @mock.patch('mod_ci.controllers.mark_test_failed') - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @@ -2181,8 +2173,7 @@ def test_start_test_duplicate_progress_check( def test_start_test_artifact_timeout( self, mock_requests_get, mock_log, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, - mock_create_instance, mock_wait_for_operation, - mock_mark_failed): + mock_create_instance, mock_mark_failed): """Test start_test handles artifact download timeout.""" import requests from github.Artifact import Artifact @@ -2219,7 +2210,6 @@ def test_start_test_artifact_timeout( mock_create_instance.assert_not_called() @mock.patch('mod_ci.controllers.mark_test_failed') - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @@ -2230,8 +2220,7 @@ def test_start_test_artifact_timeout( def test_start_test_artifact_http_error( self, mock_requests_get, mock_log, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, - mock_create_instance, mock_wait_for_operation, - mock_mark_failed): + mock_create_instance, mock_mark_failed): """Test start_test handles artifact download HTTP errors.""" import requests from github.Artifact import Artifact @@ -2333,13 +2322,12 @@ def test_wait_for_operation_success(self, mock_time, mock_sleep, mock_log): mock_log.info.assert_called() @mock.patch('mod_ci.controllers.Github') - @mock.patch('mod_ci.controllers.wait_for_operation') 
@mock.patch('mod_ci.controllers.delete_instance') @mock.patch('mod_ci.controllers.get_compute_service_object') @mock.patch('mod_ci.controllers.update_build_badge') def test_progress_type_request_empty_token( self, mock_update_build_badge, mock_get_compute_service_object, - mock_delete_instance, mock_wait_for_operation, mock_github): + mock_delete_instance, mock_github): """Test progress_type_request returns True when GitHub token is empty.""" from mod_ci.models import GcpInstance from run import log From 6ad1f31293198c6886a5c64d2fc0a3ae2c131f2f Mon Sep 17 00:00:00 2001 From: Carlos Date: Tue, 30 Dec 2025 17:40:13 +0100 Subject: [PATCH 2/2] feat: Add VM deletion tracking and verification system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements a robust system to ensure GCP VMs are properly deleted: - Add PendingDeletion model to track deletion operations - Add delete_instance_with_tracking() to record pending deletions - Add verify_pending_deletions() to check operation status and retry failures - Add scan_for_orphaned_vms() as final safety net for missed deletions - Integrate verification into start_platforms() cron flow - Update both deletion points to use tracking - Add database migration for pending_deletion table - Add 11 tests for new functionality This prevents orphaned VMs from continuing to incur billing charges when deletion operations fail silently. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- migrations/versions/c8f3a2b1d4e5_.py | 34 +++ mod_ci/controllers.py | 303 +++++++++++++++++++++++++-- mod_ci/models.py | 37 ++++ tests/test_ci/test_controllers.py | 252 ++++++++++++++++++++++ 4 files changed, 614 insertions(+), 12 deletions(-) create mode 100644 migrations/versions/c8f3a2b1d4e5_.py diff --git a/migrations/versions/c8f3a2b1d4e5_.py b/migrations/versions/c8f3a2b1d4e5_.py new file mode 100644 index 00000000..436d06e0 --- /dev/null +++ b/migrations/versions/c8f3a2b1d4e5_.py @@ -0,0 +1,34 @@ +"""Add pending_deletion table for VM deletion tracking + +Revision ID: c8f3a2b1d4e5 +Revises: 7793881905c5 +Create Date: 2025-12-30 12:00:00.000000 + +""" +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = 'c8f3a2b1d4e5' +down_revision = '7793881905c5' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('pending_deletion', + sa.Column('vm_name', sa.String(length=64), nullable=False), + sa.Column('operation_name', sa.String(length=128), nullable=False), + sa.Column('created_at', sa.DateTime(), nullable=False), + sa.Column('retry_count', sa.Integer(), nullable=False), + sa.PrimaryKeyConstraint('vm_name'), + mysql_engine='InnoDB' + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('pending_deletion') + # ### end Alembic commands ### diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py index 24997472..81f6ddb0 100755 --- a/mod_ci/controllers.py +++ b/mod_ci/controllers.py @@ -34,7 +34,8 @@ from mod_auth.models import Role from mod_ci.forms import AddUsersToBlacklist, DeleteUserForm from mod_ci.models import (BlockedUsers, CategoryTestInfo, GcpInstance, - MaintenanceMode, PrCommentInfo, Status) + MaintenanceMode, PendingDeletion, PrCommentInfo, + Status) from mod_customized.models import CustomizedTest from mod_home.models import CCExtractorVersion, GeneralData from mod_regression.models import (Category, RegressionTest, @@ -386,6 +387,14 @@ def start_platforms(repository, delay=None, platform=None) -> None: db = create_session(config.get('DATABASE_URI', '')) compute = get_compute_service_object() + + # Step 1: Verify any pending deletions from previous runs + verify_pending_deletions(compute, project, zone, db) + + # Step 2: Scan for orphaned VMs that weren't properly deleted + scan_for_orphaned_vms(compute, project, zone, db) + + # Step 3: Delete expired instances (tests that ran too long) delete_expired_instances(compute, vm_max_runtime, project, zone, db, repository) if platform is None or platform == TestPlatform.linux: @@ -463,12 +472,15 @@ def delete_expired_instances(compute, max_runtime, project, zone, db, repository if gh_commit is not None: update_status_on_github(gh_commit, Status.ERROR, message, f"CI - {platform_name}") - # Delete VM instance (fire-and-forget to avoid blocking workers) - # The deletion will complete eventually - we don't need confirmation - operation = delete_instance(compute, project, zone, vm_name) + # Delete VM instance with tracking for verification from run import log - op_name = operation.get('name', 'unknown') - log.info(f"Expired instance deletion initiated for {vm_name} (op: {op_name})") + try: + operation = delete_instance_with_tracking( + compute, project, zone, vm_name, db) + 
op_name = operation.get('name', 'unknown') + log.info(f"Expired instance deletion initiated for {vm_name} (op: {op_name})") + except Exception as e: + log.error(f"Failed to delete expired instance {vm_name}: {e}") def gcp_instance(app, db, platform, repository, delay) -> None: @@ -1004,6 +1016,269 @@ def delete_instance(compute, project, zone, vm_name) -> Dict: instance=vm_name).execute() +def delete_instance_with_tracking(compute, project, zone, vm_name, db) -> Dict: + """ + Delete the GCP instance and track the operation for verification. + + This function wraps delete_instance() and records the pending deletion + in the database so it can be verified later. If the deletion fails, + the verify_pending_deletions() cron job will retry it. + + :param compute: The cloud compute engine service object + :type compute: googleapiclient.discovery.Resource + :param project: The GCP project name + :type project: str + :param zone: Zone for the new VM instance + :type zone: str + :param vm_name: Name of the instance to be deleted + :type vm_name: str + :param db: Database session for recording the pending deletion + :type db: sqlalchemy.orm.scoping.scoped_session + :return: Delete operation details after VM deletion + :rtype: Dict + """ + from run import log + + try: + operation = delete_instance(compute, project, zone, vm_name) + op_name = operation.get('name', 'unknown') + + # Record the pending deletion for verification + pending = PendingDeletion(vm_name, op_name) + db.add(pending) + if not safe_db_commit(db, f"recording pending deletion for {vm_name}"): + log.warning(f"Failed to record pending deletion for {vm_name}, " + "deletion may not be verified") + + return operation + + except Exception as e: + log.error(f"Failed to initiate deletion for {vm_name}: {e}") + # Still record it so we can retry later + pending = PendingDeletion(vm_name, f"failed-{vm_name}") + db.add(pending) + safe_db_commit(db, f"recording failed deletion attempt for {vm_name}") + raise + + +def 
check_operation_status(compute, project, zone, operation_name) -> Dict: + """ + Check the status of a GCP operation (non-blocking). + + :param compute: The cloud compute engine service object + :type compute: googleapiclient.discovery.Resource + :param project: The GCP project name + :type project: str + :param zone: Zone for the operation + :type zone: str + :param operation_name: The operation name to check + :type operation_name: str + :return: Operation status dict with 'status' key (RUNNING, DONE, or ERROR) + :rtype: Dict + """ + try: + result = compute.zoneOperations().get( + project=project, + zone=zone, + operation=operation_name).execute() + return result + except Exception as e: + return {'status': 'ERROR', 'error': {'message': str(e)}} + + +def verify_pending_deletions(compute, project, zone, db) -> None: + """ + Verify pending VM deletions and retry failed ones. + + This function checks the status of all pending deletion operations. + If an operation completed successfully, it removes the tracking entry. + If an operation failed, it retries the deletion (up to MAX_RETRIES). 
+ + :param compute: The cloud compute engine service object + :type compute: googleapiclient.discovery.Resource + :param project: The GCP project name + :type project: str + :param zone: Zone for the VM instances + :type zone: str + :param db: Database session + :type db: sqlalchemy.orm.scoping.scoped_session + """ + from run import log + + pending_deletions = PendingDeletion.query.all() + if not pending_deletions: + return + + log.info(f"Verifying {len(pending_deletions)} pending VM deletions") + + for pending in pending_deletions: + vm_name = pending.vm_name + op_name = pending.operation_name + + # Check if this was a failed initial deletion attempt + if op_name.startswith('failed-'): + # Retry the deletion + log.info(f"Retrying failed deletion for {vm_name}") + _retry_deletion(compute, project, zone, vm_name, pending, db, log) + continue + + # Check the operation status + result = check_operation_status(compute, project, zone, op_name) + status = result.get('status', 'UNKNOWN') + + if status == 'DONE': + # Check if operation had errors + if 'error' in result: + error_msg = result['error'].get('errors', [{}])[0].get('message', 'Unknown error') + log.warning(f"Deletion operation for {vm_name} completed with error: {error_msg}") + _retry_deletion(compute, project, zone, vm_name, pending, db, log) + else: + # Success! 
Remove the tracking entry + log.info(f"VM {vm_name} deletion confirmed (op: {op_name})") + db.delete(pending) + safe_db_commit(db, f"removing confirmed deletion for {vm_name}") + + elif status == 'RUNNING': + # Still in progress - check how long it's been + age_minutes = (datetime.datetime.now() - pending.created_at).total_seconds() / 60 + if age_minutes > 30: + # Operation running too long, might be stuck + log.warning(f"Deletion operation for {vm_name} running for {age_minutes:.0f} min, retrying") + _retry_deletion(compute, project, zone, vm_name, pending, db, log) + else: + log.debug(f"Deletion operation for {vm_name} still running ({age_minutes:.0f} min)") + + else: + # Unknown status or error checking - retry + log.warning(f"Unexpected status '{status}' for {vm_name} deletion, retrying") + _retry_deletion(compute, project, zone, vm_name, pending, db, log) + + +def _retry_deletion(compute, project, zone, vm_name, pending, db, log) -> None: + """ + Retry a failed VM deletion. + + :param compute: The cloud compute engine service object + :param project: The GCP project name + :param zone: Zone for the VM instance + :param vm_name: Name of the VM to delete + :param pending: The PendingDeletion record + :param db: Database session + :param log: Logger instance + """ + if pending.retry_count >= PendingDeletion.MAX_RETRIES: + log.error(f"Max retries ({PendingDeletion.MAX_RETRIES}) exceeded for {vm_name}, " + "VM may still be running and incurring charges!") + # Keep the record so scan_for_orphaned_vms can handle it + return + + pending.retry_count += 1 + log.info(f"Retry {pending.retry_count}/{PendingDeletion.MAX_RETRIES} for {vm_name}") + + try: + operation = delete_instance(compute, project, zone, vm_name) + pending.operation_name = operation.get('name', f'retry-{vm_name}') + pending.created_at = datetime.datetime.now() + safe_db_commit(db, f"updating retry for {vm_name}") + log.info(f"Retry deletion initiated for {vm_name} (op: {pending.operation_name})") + except 
Exception as e: + error_str = str(e) + if 'notFound' in error_str or '404' in error_str: + # VM doesn't exist anymore - success! + log.info(f"VM {vm_name} no longer exists, deletion confirmed") + db.delete(pending) + safe_db_commit(db, f"removing deletion for non-existent {vm_name}") + else: + log.error(f"Retry deletion failed for {vm_name}: {e}") + safe_db_commit(db, f"recording retry attempt for {vm_name}") + + +def scan_for_orphaned_vms(compute, project, zone, db) -> None: + """ + Scan for orphaned VMs that should have been deleted but weren't. + + This is the final safety net. It finds VMs that: + 1. Match our naming pattern (platform-testid) + 2. Don't have a corresponding GcpInstance record (test finished but VM wasn't deleted) + 3. Have a PendingDeletion record with max retries exceeded + + :param compute: The cloud compute engine service object + :type compute: googleapiclient.discovery.Resource + :param project: The GCP project name + :type project: str + :param zone: Zone for the VM instances + :type zone: str + :param db: Database session + :type db: sqlalchemy.orm.scoping.scoped_session + """ + from run import log + + try: + running_vms = get_running_instances(compute, project, zone) + except Exception as e: + log.error(f"Failed to list running instances: {e}") + return + + if not running_vms: + return + + orphaned_count = 0 + for vm in running_vms: + vm_name = vm['name'] + + # Only check VMs that match our naming pattern + if not is_instance_testing(vm_name): + continue + + # Check if there's a GcpInstance record (meaning test is still running) + gcp_instance = GcpInstance.query.filter(GcpInstance.name == vm_name).first() + if gcp_instance is not None: + # VM is legitimately running + continue + + # No GcpInstance record - this VM should have been deleted + # Check if we're already tracking it + pending = PendingDeletion.query.filter( + PendingDeletion.vm_name == vm_name).first() + + if pending is not None: + if pending.retry_count >= 
PendingDeletion.MAX_RETRIES: + # Max retries exceeded - force delete + log.warning(f"Orphan scan: Force deleting {vm_name} after max retries") + try: + operation = delete_instance(compute, project, zone, vm_name) + # Reset retry count and update operation + pending.retry_count = 0 + pending.operation_name = operation.get('name', f'force-{vm_name}') + pending.created_at = datetime.datetime.now() + safe_db_commit(db, f"force delete for orphan {vm_name}") + orphaned_count += 1 + except Exception as e: + if 'notFound' in str(e) or '404' in str(e): + db.delete(pending) + safe_db_commit(db, f"removing stale pending for {vm_name}") + else: + log.error(f"Force delete failed for {vm_name}: {e}") + # else: still being handled by verify_pending_deletions + else: + # Not tracked at all - orphaned VM found + log.warning(f"Orphan scan: Found untracked VM {vm_name}, initiating deletion") + try: + operation = delete_instance(compute, project, zone, vm_name) + op_name = operation.get('name', f'orphan-{vm_name}') + # Track it for verification + new_pending = PendingDeletion(vm_name, op_name) + db.add(new_pending) + safe_db_commit(db, f"tracking orphan deletion for {vm_name}") + orphaned_count += 1 + except Exception as e: + if 'notFound' not in str(e) and '404' not in str(e): + log.error(f"Failed to delete orphan VM {vm_name}: {e}") + + if orphaned_count > 0: + log.info(f"Orphan scan: Initiated deletion for {orphaned_count} orphaned VMs") + + def get_config_for_gcp_instance(vm_name, source_disk_image, metadata_items) -> Dict: """ Get VM config for new VM instance. 
@@ -1972,19 +2247,23 @@ def update_final_status(): log.error(f"Test {test_id}: Failed to update final GitHub status after retries: {e}") if status in [TestStatus.completed, TestStatus.canceled]: - # Delete the current instance (fire-and-forget) - # We intentionally don't wait for the deletion to complete because: + # Delete the current instance with tracking for verification + # We don't wait for the deletion to complete because: # 1. Waiting can take 60+ seconds, exceeding nginx/gunicorn timeouts (502 errors) - # 2. The deletion will complete eventually - we don't need confirmation + # 2. The verify_pending_deletions() cron job will confirm deletion succeeded # 3. All important work (test results, GitHub status) is already done from run import config compute = get_compute_service_object() zone = config.get('ZONE', '') project = config.get('PROJECT_NAME', '') vm_name = f"{test.platform.value}-{test.id}" - operation = delete_instance(compute, project, zone, vm_name) - op_name = operation.get('name', 'unknown') - log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (op: {op_name})") + try: + operation = delete_instance_with_tracking( + compute, project, zone, vm_name, g.db) + op_name = operation.get('name', 'unknown') + log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (op: {op_name})") + except Exception as e: + log.error(f"[Test: {test_id}] Failed to delete VM {vm_name}: {e}") # If status is complete, remove the GCP Instance entry if status in [TestStatus.completed, TestStatus.canceled]: diff --git a/mod_ci/models.py b/mod_ci/models.py index 02619a25..1b5c9ebf 100644 --- a/mod_ci/models.py +++ b/mod_ci/models.py @@ -4,6 +4,7 @@ List of models corresponding to mysql tables: [ 'Gcp Instance' => 'gcp_instance', + 'Pending Deletion' => 'pending_deletion', 'Maintenance mode' => 'maintenance_mode' ] """ @@ -79,6 +80,42 @@ def __repr__(self) -> str: return f'' +class PendingDeletion(Base): + """Model to track pending VM deletion operations for 
verification.""" + + __tablename__ = 'pending_deletion' + __table_args__ = {'mysql_engine': 'InnoDB'} + vm_name = Column(String(64), primary_key=True) + operation_name = Column(String(128), nullable=False) + created_at = Column(DateTime(), nullable=False) + retry_count = Column(Integer, nullable=False, default=0) + + # Max retries before we give up and just try to force delete + MAX_RETRIES = 5 + + def __init__(self, vm_name, operation_name, created_at=None) -> None: + """ + Parametrized constructor for the PendingDeletion model. + + :param vm_name: The name of the VM being deleted + :type vm_name: str + :param operation_name: The GCP operation name/ID for tracking + :type operation_name: str + :param created_at: When the deletion was initiated (None for now) + :type created_at: datetime + """ + self.vm_name = vm_name + self.operation_name = operation_name + if created_at is None: + created_at = datetime.datetime.now() + self.created_at = created_at + self.retry_count = 0 + + def __repr__(self) -> str: + """Represent a PendingDeletion by its vm_name.""" + return f'<PendingDeletion {self.vm_name}>' + + class MaintenanceMode(Base): """Model to maintain maintenance status of platforms.""" diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py index ac48ed0a..f1c4ba74 100644 --- a/tests/test_ci/test_controllers.py +++ b/tests/test_ci/test_controllers.py @@ -3546,3 +3546,255 @@ def test_permanent_failure_marks_test_failed( mock_mark_failed.assert_called_once() # Should log critical mock_log.critical.assert_called() + + +class TestPendingDeletionTracking(BaseTestCase): + """Tests for VM deletion tracking and verification functionality.""" + + @mock.patch('run.log') + def test_check_operation_status_success(self, mock_log): + """Test check_operation_status returns status correctly.""" + from mod_ci.controllers import check_operation_status + + compute = MagicMock() + zone_ops = compute.zoneOperations.return_value.get.return_value + zone_ops.execute.return_value = {'status': 
'DONE'} + + result = check_operation_status(compute, "project", "zone", "op-123") + + self.assertEqual(result['status'], 'DONE') + + @mock.patch('run.log') + def test_check_operation_status_error(self, mock_log): + """Test check_operation_status handles API errors.""" + from mod_ci.controllers import check_operation_status + + compute = MagicMock() + zone_ops = compute.zoneOperations.return_value.get.return_value + zone_ops.execute.side_effect = Exception("API Error") + + result = check_operation_status(compute, "project", "zone", "op-123") + + self.assertEqual(result['status'], 'ERROR') + self.assertIn('error', result) + + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('run.log') + def test_delete_instance_with_tracking_success( + self, mock_log, mock_delete, mock_commit): + """Test delete_instance_with_tracking records pending deletion.""" + from mod_ci.controllers import delete_instance_with_tracking + from mod_ci.models import PendingDeletion + + mock_delete.return_value = {'name': 'op-123'} + mock_commit.return_value = True + + db = MagicMock() + result = delete_instance_with_tracking( + MagicMock(), "project", "zone", "linux-42", db) + + self.assertEqual(result['name'], 'op-123') + # Verify PendingDeletion was added + db.add.assert_called_once() + added_obj = db.add.call_args[0][0] + self.assertIsInstance(added_obj, PendingDeletion) + self.assertEqual(added_obj.vm_name, 'linux-42') + self.assertEqual(added_obj.operation_name, 'op-123') + + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('run.log') + def test_delete_instance_with_tracking_failure( + self, mock_log, mock_delete, mock_commit): + """Test delete_instance_with_tracking handles failures.""" + from mod_ci.controllers import delete_instance_with_tracking + from mod_ci.models import PendingDeletion + + mock_delete.side_effect = Exception("GCP Error") + 
mock_commit.return_value = True + + db = MagicMock() + with self.assertRaises(Exception): + delete_instance_with_tracking( + MagicMock(), "project", "zone", "linux-42", db) + + # Should still record the failed attempt + db.add.assert_called_once() + added_obj = db.add.call_args[0][0] + self.assertIsInstance(added_obj, PendingDeletion) + self.assertTrue(added_obj.operation_name.startswith('failed-')) + + @mock.patch('mod_ci.controllers.PendingDeletion') + @mock.patch('mod_ci.controllers.check_operation_status') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('run.log') + def test_verify_pending_deletions_success( + self, mock_log, mock_commit, mock_check, mock_pending_class): + """Test verify_pending_deletions removes successful deletions.""" + from mod_ci.controllers import verify_pending_deletions + + # Create mock pending deletion + mock_pending = MagicMock() + mock_pending.vm_name = 'linux-42' + mock_pending.operation_name = 'op-123' + mock_pending.retry_count = 0 + mock_pending_class.query.all.return_value = [mock_pending] + + # Operation completed successfully + mock_check.return_value = {'status': 'DONE'} + mock_commit.return_value = True + + db = MagicMock() + verify_pending_deletions(MagicMock(), "project", "zone", db) + + # Should delete the pending record + db.delete.assert_called_once_with(mock_pending) + mock_commit.assert_called() + + @mock.patch('mod_ci.controllers.PendingDeletion') + @mock.patch('mod_ci.controllers.check_operation_status') + @mock.patch('mod_ci.controllers._retry_deletion') + @mock.patch('run.log') + def test_verify_pending_deletions_retries_on_error( + self, mock_log, mock_retry, mock_check, mock_pending_class): + """Test verify_pending_deletions retries failed deletions.""" + from mod_ci.controllers import verify_pending_deletions + + # Create mock pending deletion + mock_pending = MagicMock() + mock_pending.vm_name = 'linux-42' + mock_pending.operation_name = 'op-123' + mock_pending.retry_count = 0 + 
mock_pending_class.query.all.return_value = [mock_pending] + + # Operation failed + mock_check.return_value = { + 'status': 'DONE', + 'error': {'errors': [{'message': 'Permission denied'}]} + } + + db = MagicMock() + verify_pending_deletions(MagicMock(), "project", "zone", db) + + # Should retry + mock_retry.assert_called_once() + + @mock.patch('mod_ci.controllers.GcpInstance') + @mock.patch('mod_ci.controllers.get_running_instances') + @mock.patch('mod_ci.controllers.is_instance_testing') + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('run.log') + def test_scan_for_orphaned_vms_finds_orphan( + self, mock_log, mock_commit, mock_delete, mock_is_testing, + mock_get_running, mock_gcp_class): + """Test scan_for_orphaned_vms finds and deletes orphaned VMs.""" + from mod_ci.controllers import scan_for_orphaned_vms + from mod_ci.models import PendingDeletion + + # Mock running VM + mock_get_running.return_value = [{'name': 'linux-42'}] + mock_is_testing.return_value = True + + # No GcpInstance record (orphaned) + mock_gcp_class.query.filter.return_value.first.return_value = None + # Not tracked in PendingDeletion - use real query + with mock.patch.object(PendingDeletion, 'query') as mock_query: + mock_query.filter.return_value.first.return_value = None + + mock_delete.return_value = {'name': 'op-orphan-42'} + mock_commit.return_value = True + + db = MagicMock() + scan_for_orphaned_vms(MagicMock(), "project", "zone", db) + + # Should delete the orphan + mock_delete.assert_called_once() + # Should track it + db.add.assert_called_once() + added_obj = db.add.call_args[0][0] + self.assertIsInstance(added_obj, PendingDeletion) + self.assertEqual(added_obj.vm_name, 'linux-42') + + @mock.patch('mod_ci.controllers.PendingDeletion') + @mock.patch('mod_ci.controllers.GcpInstance') + @mock.patch('mod_ci.controllers.get_running_instances') + @mock.patch('mod_ci.controllers.is_instance_testing') + 
@mock.patch('run.log') + def test_scan_for_orphaned_vms_ignores_active( + self, mock_log, mock_is_testing, mock_get_running, + mock_gcp_class, mock_pending_class): + """Test scan_for_orphaned_vms ignores VMs with GcpInstance records.""" + from mod_ci.controllers import scan_for_orphaned_vms + + # Mock running VM + mock_get_running.return_value = [{'name': 'linux-42'}] + mock_is_testing.return_value = True + + # Has GcpInstance record (active test) + mock_gcp_class.query.filter.return_value.first.return_value = MagicMock() + + db = MagicMock() + scan_for_orphaned_vms(MagicMock(), "project", "zone", db) + + # Should NOT track or delete + db.add.assert_not_called() + + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('run.log') + def test_retry_deletion_success(self, mock_log, mock_commit, mock_delete): + """Test _retry_deletion successfully retries.""" + from mod_ci.controllers import _retry_deletion + from mod_ci.models import PendingDeletion + + mock_delete.return_value = {'name': 'op-retry-123'} + mock_commit.return_value = True + + pending = PendingDeletion('linux-42', 'op-failed') + pending.retry_count = 0 + + db = MagicMock() + _retry_deletion(MagicMock(), "project", "zone", "linux-42", pending, db, mock_log) + + mock_delete.assert_called_once() + self.assertEqual(pending.retry_count, 1) + self.assertEqual(pending.operation_name, 'op-retry-123') + + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('run.log') + def test_retry_deletion_max_retries(self, mock_log, mock_delete): + """Test _retry_deletion stops after max retries.""" + from mod_ci.controllers import _retry_deletion + from mod_ci.models import PendingDeletion + + pending = PendingDeletion('linux-42', 'op-failed') + pending.retry_count = PendingDeletion.MAX_RETRIES + + db = MagicMock() + _retry_deletion(MagicMock(), "project", "zone", "linux-42", pending, db, mock_log) + + # Should NOT attempt delete + 
mock_delete.assert_not_called() + mock_log.error.assert_called() + + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('run.log') + def test_retry_deletion_vm_not_found(self, mock_log, mock_commit, mock_delete): + """Test _retry_deletion handles 404 (VM already deleted).""" + from mod_ci.controllers import _retry_deletion + from mod_ci.models import PendingDeletion + + mock_delete.side_effect = Exception("notFound: VM not found") + mock_commit.return_value = True + + pending = PendingDeletion('linux-42', 'op-failed') + pending.retry_count = 0 + + db = MagicMock() + _retry_deletion(MagicMock(), "project", "zone", "linux-42", pending, db, mock_log) + + # Should remove the pending record (VM is gone) + db.delete.assert_called_once_with(pending)