From 0e2be0c3f9a6109aa4e3e997e4792d24cc2d6ef6 Mon Sep 17 00:00:00 2001 From: Carlos Date: Tue, 30 Dec 2025 09:59:58 +0100 Subject: [PATCH 1/2] fix: Remove remaining blocking wait_for_operation calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR removes the last two blocking wait_for_operation calls that were causing gunicorn workers to be blocked for extended periods: 1. delete_expired_instances() - VM deletion is now fire-and-forget 2. start_test() - VM creation is now optimistic (recorded immediately) These blocking calls were causing 504 timeouts on webhook deliveries because GitHub has a 10-second webhook timeout. When cron jobs were running wait_for_operation (which can take up to 30 minutes), all gunicorn workers could become blocked, causing webhook requests to queue and exceed the timeout. Changes: - delete_expired_instances: Remove blocking wait, log operation initiation - start_test: Check for immediate errors, then record instance optimistically - Tests: Remove unused wait_for_operation mocks from affected tests If VM creation ultimately fails, the test won't report progress and will be cleaned up by the expired instances cron job. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- mod_ci/controllers.py | 34 +++++++++++++++++--------- tests/test_ci/test_controllers.py | 40 +++++++++++-------------------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py index 2066b87e..24997472 100755 --- a/mod_ci/controllers.py +++ b/mod_ci/controllers.py @@ -463,9 +463,12 @@ def delete_expired_instances(compute, max_runtime, project, zone, db, repository if gh_commit is not None: update_status_on_github(gh_commit, Status.ERROR, message, f"CI - {platform_name}") - # Delete VM instance + # Delete VM instance (fire-and-forget to avoid blocking workers) + # The deletion will complete eventually - we don't need confirmation operation = delete_instance(compute, project, zone, vm_name) - wait_for_operation(compute, project, zone, operation['name']) + from run import log + op_name = operation.get('name', 'unknown') + log.info(f"Expired instance deletion initiated for {vm_name} (op: {op_name})") def gcp_instance(app, db, platform, repository, delay) -> None: @@ -902,16 +905,25 @@ def start_test(compute, app, db, repository: Repository.Repository, test, bot_to zone = config.get('ZONE', '') project_id = config.get('PROJECT_NAME', '') operation = create_instance(compute, project_id, zone, test, full_url) - result = wait_for_operation(compute, project_id, zone, operation['name']) - # Check if result indicates success (result is a dict with no 'error' key) - if isinstance(result, dict) and 'error' not in result: - db.add(status) - if not safe_db_commit(db, f"recording GCP instance for test {test.id}"): - log.error(f"Failed to record GCP instance for test {test.id}, but VM was created") - else: - error_msg = parse_gcp_error(result) - log.error(f"Error creating test instance for test {test.id}, result: {result}") + + # Check if the create_instance call itself returned an error (synchronous failure) + if 'error' in 
operation: + error_msg = parse_gcp_error(operation) + log.error(f"Error creating test instance for test {test.id}, result: {operation}") mark_test_failed(db, test, repository, error_msg) + return + + # VM creation request was accepted - record the instance optimistically + # We don't wait for the operation to complete because: + # 1. Waiting can take 60+ seconds, blocking gunicorn workers + # 2. If VM creation ultimately fails, the test won't report progress + # and will be cleaned up by the expired instances cron job + op_name = operation.get('name', 'unknown') + log.info(f"Test {test.id}: VM creation initiated (op: {op_name})") + + db.add(status) + if not safe_db_commit(db, f"recording GCP instance for test {test.id}"): + log.error(f"Failed to record GCP instance for test {test.id}, but VM creation was initiated") def create_instance(compute, project, zone, test, reportURL) -> Dict: diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py index 63a38332..ac48ed0a 100644 --- a/tests/test_ci/test_controllers.py +++ b/tests/test_ci/test_controllers.py @@ -215,14 +215,13 @@ def test_cron_job_empty_token(self, mock_log): cron() mock_log.error.assert_called_with('GITHUB_TOKEN not configured, cannot run CI cron') - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @mock.patch('mod_ci.controllers.TestProgress') @mock.patch('mod_ci.controllers.GcpInstance') def test_start_test(self, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, - mock_create_instance, mock_wait_for_operation): + mock_create_instance): """Test start_test function.""" import zipfile @@ -268,21 +267,21 @@ def extractall(*args, **kwargs): mock_query = create_mock_db_query(mock_g) mock_query.c.got = MagicMock() - # Test when gcp create instance fails - mock_wait_for_operation.return_value = {'status': 'DONE', 'error': 
{'errors': [{'code': 'TEST_ERROR'}]}} + # Test when gcp create instance fails synchronously (error in operation response) + mock_create_instance.return_value = {'error': {'errors': [{'code': 'TEST_ERROR'}]}} start_test(mock.ANY, self.app, mock_g.db, repository, test, mock.ANY) # Commit IS called to record the test failure in the database mock_g.db.commit.assert_called_once() mock_g.db.commit.reset_mock() mock_create_instance.reset_mock() - mock_wait_for_operation.reset_mock() - # Test when gcp create instance is successful - mock_wait_for_operation.return_value = {'status': 'DONE'} + # Test when gcp create instance is successful (no error in operation response) + # Note: We no longer wait for the operation to complete - we record the instance + # optimistically and let the expired instances cron handle failures + mock_create_instance.return_value = {'name': 'test-operation-123', 'status': 'RUNNING'} start_test(mock.ANY, self.app, mock_g.db, repository, test, mock.ANY) mock_g.db.commit.assert_called_once() mock_create_instance.assert_called_once() - mock_wait_for_operation.assert_called_once() @mock.patch('github.Github.get_repo') @mock.patch('mod_ci.controllers.start_test') @@ -1479,14 +1478,13 @@ def test_add_test_entry_db_commit_failure( mock_safe_commit.assert_called_once() mock_log.error.assert_called() - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.delete_instance') @mock.patch('mod_ci.controllers.safe_db_commit') @mock.patch('mod_ci.controllers.is_instance_testing') @mock.patch('run.log') def test_delete_expired_instances_db_commit_failure( self, mock_log, mock_is_testing, mock_safe_commit, - mock_delete, mock_wait): + mock_delete): """Test delete_expired_instances handles db commit failure.""" from mod_ci.controllers import delete_expired_instances @@ -1769,13 +1767,12 @@ def test_equality_type_request_rto_none(self, mock_rto): mock_rto.query.filter.assert_called_once_with(mock_rto.id == 1) 
mock_log.info.assert_called_once() - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.delete_instance') @mock.patch('mod_ci.controllers.get_compute_service_object') @mock.patch('mod_ci.controllers.update_build_badge') @mock.patch('github.Github.get_repo') def test_progress_type_request(self, mock_repo, mock_update_build_badge, mock_get_compute_service_object, - mock_delete_instance, mock_wait_for_operation): + mock_delete_instance): """Test progress_type_request function.""" from mod_ci.models import GcpInstance from run import log @@ -2113,7 +2110,6 @@ def test_gcp_instance_unexpected_exception(self, mock_log, mock_g, mock_get_comp # Should log error and continue mock_log.error.assert_called() - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @@ -2122,8 +2118,7 @@ def test_gcp_instance_unexpected_exception(self, mock_log, mock_g, mock_get_comp @mock.patch('run.log') def test_start_test_duplicate_instance_check( self, mock_log, mock_gcp_instance, mock_test_progress, - mock_g, mock_open_file, mock_create_instance, - mock_wait_for_operation): + mock_g, mock_open_file, mock_create_instance): """Test start_test skips if GCP instance already exists for test.""" from mod_ci.controllers import start_test @@ -2140,7 +2135,6 @@ def test_start_test_duplicate_instance_check( mock_log.warning.assert_called() mock_create_instance.assert_not_called() - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @@ -2149,8 +2143,7 @@ def test_start_test_duplicate_instance_check( @mock.patch('run.log') def test_start_test_duplicate_progress_check( self, mock_log, mock_gcp_instance, mock_test_progress, - mock_g, mock_open_file, mock_create_instance, - 
mock_wait_for_operation): + mock_g, mock_open_file, mock_create_instance): """Test start_test skips if test already has progress entries.""" from mod_ci.controllers import start_test @@ -2170,7 +2163,6 @@ def test_start_test_duplicate_progress_check( mock_create_instance.assert_not_called() @mock.patch('mod_ci.controllers.mark_test_failed') - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @@ -2181,8 +2173,7 @@ def test_start_test_duplicate_progress_check( def test_start_test_artifact_timeout( self, mock_requests_get, mock_log, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, - mock_create_instance, mock_wait_for_operation, - mock_mark_failed): + mock_create_instance, mock_mark_failed): """Test start_test handles artifact download timeout.""" import requests from github.Artifact import Artifact @@ -2219,7 +2210,6 @@ def test_start_test_artifact_timeout( mock_create_instance.assert_not_called() @mock.patch('mod_ci.controllers.mark_test_failed') - @mock.patch('mod_ci.controllers.wait_for_operation') @mock.patch('mod_ci.controllers.create_instance') @mock.patch('builtins.open', new_callable=mock.mock_open()) @mock.patch('mod_ci.controllers.g') @@ -2230,8 +2220,7 @@ def test_start_test_artifact_timeout( def test_start_test_artifact_http_error( self, mock_requests_get, mock_log, mock_gcp_instance, mock_test_progress, mock_g, mock_open_file, - mock_create_instance, mock_wait_for_operation, - mock_mark_failed): + mock_create_instance, mock_mark_failed): """Test start_test handles artifact download HTTP errors.""" import requests from github.Artifact import Artifact @@ -2333,13 +2322,12 @@ def test_wait_for_operation_success(self, mock_time, mock_sleep, mock_log): mock_log.info.assert_called() @mock.patch('mod_ci.controllers.Github') - @mock.patch('mod_ci.controllers.wait_for_operation') 
@mock.patch('mod_ci.controllers.delete_instance') @mock.patch('mod_ci.controllers.get_compute_service_object') @mock.patch('mod_ci.controllers.update_build_badge') def test_progress_type_request_empty_token( self, mock_update_build_badge, mock_get_compute_service_object, - mock_delete_instance, mock_wait_for_operation, mock_github): + mock_delete_instance, mock_github): """Test progress_type_request returns True when GitHub token is empty.""" from mod_ci.models import GcpInstance from run import log From 6ad1f31293198c6886a5c64d2fc0a3ae2c131f2f Mon Sep 17 00:00:00 2001 From: Carlos Date: Tue, 30 Dec 2025 17:40:13 +0100 Subject: [PATCH 2/2] feat: Add VM deletion tracking and verification system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements a robust system to ensure GCP VMs are properly deleted: - Add PendingDeletion model to track deletion operations - Add delete_instance_with_tracking() to record pending deletions - Add verify_pending_deletions() to check operation status and retry failures - Add scan_for_orphaned_vms() as final safety net for missed deletions - Integrate verification into start_platforms() cron flow - Update both deletion points to use tracking - Add database migration for pending_deletion table - Add 11 tests for new functionality This prevents orphaned VMs from continuing to incur billing charges when deletion operations fail silently. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- migrations/versions/c8f3a2b1d4e5_.py | 34 +++ mod_ci/controllers.py | 303 +++++++++++++++++++++++++-- mod_ci/models.py | 37 ++++ tests/test_ci/test_controllers.py | 252 ++++++++++++++++++++++ 4 files changed, 614 insertions(+), 12 deletions(-) create mode 100644 migrations/versions/c8f3a2b1d4e5_.py diff --git a/migrations/versions/c8f3a2b1d4e5_.py b/migrations/versions/c8f3a2b1d4e5_.py new file mode 100644 index 00000000..436d06e0 --- /dev/null +++ b/migrations/versions/c8f3a2b1d4e5_.py @@ -0,0 +1,34 @@ +"""Add pending_deletion table for VM deletion tracking + +Revision ID: c8f3a2b1d4e5 +Revises: 7793881905c5 +Create Date: 2025-12-30 12:00:00.000000 + +""" +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = 'c8f3a2b1d4e5' +down_revision = '7793881905c5' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('pending_deletion', + sa.Column('vm_name', sa.String(length=64), nullable=False), + sa.Column('operation_name', sa.String(length=128), nullable=False), + sa.Column('created_at', sa.DateTime(), nullable=False), + sa.Column('retry_count', sa.Integer(), nullable=False), + sa.PrimaryKeyConstraint('vm_name'), + mysql_engine='InnoDB' + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table('pending_deletion') + # ### end Alembic commands ### diff --git a/mod_ci/controllers.py b/mod_ci/controllers.py index 24997472..81f6ddb0 100755 --- a/mod_ci/controllers.py +++ b/mod_ci/controllers.py @@ -34,7 +34,8 @@ from mod_auth.models import Role from mod_ci.forms import AddUsersToBlacklist, DeleteUserForm from mod_ci.models import (BlockedUsers, CategoryTestInfo, GcpInstance, - MaintenanceMode, PrCommentInfo, Status) + MaintenanceMode, PendingDeletion, PrCommentInfo, + Status) from mod_customized.models import CustomizedTest from mod_home.models import CCExtractorVersion, GeneralData from mod_regression.models import (Category, RegressionTest, @@ -386,6 +387,14 @@ def start_platforms(repository, delay=None, platform=None) -> None: db = create_session(config.get('DATABASE_URI', '')) compute = get_compute_service_object() + + # Step 1: Verify any pending deletions from previous runs + verify_pending_deletions(compute, project, zone, db) + + # Step 2: Scan for orphaned VMs that weren't properly deleted + scan_for_orphaned_vms(compute, project, zone, db) + + # Step 3: Delete expired instances (tests that ran too long) delete_expired_instances(compute, vm_max_runtime, project, zone, db, repository) if platform is None or platform == TestPlatform.linux: @@ -463,12 +472,15 @@ def delete_expired_instances(compute, max_runtime, project, zone, db, repository if gh_commit is not None: update_status_on_github(gh_commit, Status.ERROR, message, f"CI - {platform_name}") - # Delete VM instance (fire-and-forget to avoid blocking workers) - # The deletion will complete eventually - we don't need confirmation - operation = delete_instance(compute, project, zone, vm_name) + # Delete VM instance with tracking for verification from run import log - op_name = operation.get('name', 'unknown') - log.info(f"Expired instance deletion initiated for {vm_name} (op: {op_name})") + try: + operation = delete_instance_with_tracking( + compute, project, zone, vm_name, db) + 
op_name = operation.get('name', 'unknown') + log.info(f"Expired instance deletion initiated for {vm_name} (op: {op_name})") + except Exception as e: + log.error(f"Failed to delete expired instance {vm_name}: {e}") def gcp_instance(app, db, platform, repository, delay) -> None: @@ -1004,6 +1016,269 @@ def delete_instance(compute, project, zone, vm_name) -> Dict: instance=vm_name).execute() +def delete_instance_with_tracking(compute, project, zone, vm_name, db) -> Dict: + """ + Delete the GCP instance and track the operation for verification. + + This function wraps delete_instance() and records the pending deletion + in the database so it can be verified later. If the deletion fails, + the verify_pending_deletions() cron job will retry it. + + :param compute: The cloud compute engine service object + :type compute: googleapiclient.discovery.Resource + :param project: The GCP project name + :type project: str + :param zone: Zone for the new VM instance + :type zone: str + :param vm_name: Name of the instance to be deleted + :type vm_name: str + :param db: Database session for recording the pending deletion + :type db: sqlalchemy.orm.scoping.scoped_session + :return: Delete operation details after VM deletion + :rtype: Dict + """ + from run import log + + try: + operation = delete_instance(compute, project, zone, vm_name) + op_name = operation.get('name', 'unknown') + + # Record the pending deletion for verification + pending = PendingDeletion(vm_name, op_name) + db.add(pending) + if not safe_db_commit(db, f"recording pending deletion for {vm_name}"): + log.warning(f"Failed to record pending deletion for {vm_name}, " + "deletion may not be verified") + + return operation + + except Exception as e: + log.error(f"Failed to initiate deletion for {vm_name}: {e}") + # Still record it so we can retry later + pending = PendingDeletion(vm_name, f"failed-{vm_name}") + db.add(pending) + safe_db_commit(db, f"recording failed deletion attempt for {vm_name}") + raise + + +def 
check_operation_status(compute, project, zone, operation_name) -> Dict: + """ + Check the status of a GCP operation (non-blocking). + + :param compute: The cloud compute engine service object + :type compute: googleapiclient.discovery.Resource + :param project: The GCP project name + :type project: str + :param zone: Zone for the operation + :type zone: str + :param operation_name: The operation name to check + :type operation_name: str + :return: Operation status dict with 'status' key (RUNNING, DONE, or ERROR) + :rtype: Dict + """ + try: + result = compute.zoneOperations().get( + project=project, + zone=zone, + operation=operation_name).execute() + return result + except Exception as e: + return {'status': 'ERROR', 'error': {'message': str(e)}} + + +def verify_pending_deletions(compute, project, zone, db) -> None: + """ + Verify pending VM deletions and retry failed ones. + + This function checks the status of all pending deletion operations. + If an operation completed successfully, it removes the tracking entry. + If an operation failed, it retries the deletion (up to MAX_RETRIES). 
+ + :param compute: The cloud compute engine service object + :type compute: googleapiclient.discovery.Resource + :param project: The GCP project name + :type project: str + :param zone: Zone for the VM instances + :type zone: str + :param db: Database session + :type db: sqlalchemy.orm.scoping.scoped_session + """ + from run import log + + pending_deletions = PendingDeletion.query.all() + if not pending_deletions: + return + + log.info(f"Verifying {len(pending_deletions)} pending VM deletions") + + for pending in pending_deletions: + vm_name = pending.vm_name + op_name = pending.operation_name + + # Check if this was a failed initial deletion attempt + if op_name.startswith('failed-'): + # Retry the deletion + log.info(f"Retrying failed deletion for {vm_name}") + _retry_deletion(compute, project, zone, vm_name, pending, db, log) + continue + + # Check the operation status + result = check_operation_status(compute, project, zone, op_name) + status = result.get('status', 'UNKNOWN') + + if status == 'DONE': + # Check if operation had errors + if 'error' in result: + error_msg = result['error'].get('errors', [{}])[0].get('message', 'Unknown error') + log.warning(f"Deletion operation for {vm_name} completed with error: {error_msg}") + _retry_deletion(compute, project, zone, vm_name, pending, db, log) + else: + # Success! 
Remove the tracking entry + log.info(f"VM {vm_name} deletion confirmed (op: {op_name})") + db.delete(pending) + safe_db_commit(db, f"removing confirmed deletion for {vm_name}") + + elif status == 'RUNNING': + # Still in progress - check how long it's been + age_minutes = (datetime.datetime.now() - pending.created_at).total_seconds() / 60 + if age_minutes > 30: + # Operation running too long, might be stuck + log.warning(f"Deletion operation for {vm_name} running for {age_minutes:.0f} min, retrying") + _retry_deletion(compute, project, zone, vm_name, pending, db, log) + else: + log.debug(f"Deletion operation for {vm_name} still running ({age_minutes:.0f} min)") + + else: + # Unknown status or error checking - retry + log.warning(f"Unexpected status '{status}' for {vm_name} deletion, retrying") + _retry_deletion(compute, project, zone, vm_name, pending, db, log) + + +def _retry_deletion(compute, project, zone, vm_name, pending, db, log) -> None: + """ + Retry a failed VM deletion. + + :param compute: The cloud compute engine service object + :param project: The GCP project name + :param zone: Zone for the VM instance + :param vm_name: Name of the VM to delete + :param pending: The PendingDeletion record + :param db: Database session + :param log: Logger instance + """ + if pending.retry_count >= PendingDeletion.MAX_RETRIES: + log.error(f"Max retries ({PendingDeletion.MAX_RETRIES}) exceeded for {vm_name}, " + "VM may still be running and incurring charges!") + # Keep the record so scan_for_orphaned_vms can handle it + return + + pending.retry_count += 1 + log.info(f"Retry {pending.retry_count}/{PendingDeletion.MAX_RETRIES} for {vm_name}") + + try: + operation = delete_instance(compute, project, zone, vm_name) + pending.operation_name = operation.get('name', f'retry-{vm_name}') + pending.created_at = datetime.datetime.now() + safe_db_commit(db, f"updating retry for {vm_name}") + log.info(f"Retry deletion initiated for {vm_name} (op: {pending.operation_name})") + except 
Exception as e: + error_str = str(e) + if 'notFound' in error_str or '404' in error_str: + # VM doesn't exist anymore - success! + log.info(f"VM {vm_name} no longer exists, deletion confirmed") + db.delete(pending) + safe_db_commit(db, f"removing deletion for non-existent {vm_name}") + else: + log.error(f"Retry deletion failed for {vm_name}: {e}") + safe_db_commit(db, f"recording retry attempt for {vm_name}") + + +def scan_for_orphaned_vms(compute, project, zone, db) -> None: + """ + Scan for orphaned VMs that should have been deleted but weren't. + + This is the final safety net. It finds VMs that: + 1. Match our naming pattern (platform-testid) + 2. Don't have a corresponding GcpInstance record (test finished but VM wasn't deleted) + 3. Have a PendingDeletion record with max retries exceeded + + :param compute: The cloud compute engine service object + :type compute: googleapiclient.discovery.Resource + :param project: The GCP project name + :type project: str + :param zone: Zone for the VM instances + :type zone: str + :param db: Database session + :type db: sqlalchemy.orm.scoping.scoped_session + """ + from run import log + + try: + running_vms = get_running_instances(compute, project, zone) + except Exception as e: + log.error(f"Failed to list running instances: {e}") + return + + if not running_vms: + return + + orphaned_count = 0 + for vm in running_vms: + vm_name = vm['name'] + + # Only check VMs that match our naming pattern + if not is_instance_testing(vm_name): + continue + + # Check if there's a GcpInstance record (meaning test is still running) + gcp_instance = GcpInstance.query.filter(GcpInstance.name == vm_name).first() + if gcp_instance is not None: + # VM is legitimately running + continue + + # No GcpInstance record - this VM should have been deleted + # Check if we're already tracking it + pending = PendingDeletion.query.filter( + PendingDeletion.vm_name == vm_name).first() + + if pending is not None: + if pending.retry_count >= 
PendingDeletion.MAX_RETRIES: + # Max retries exceeded - force delete + log.warning(f"Orphan scan: Force deleting {vm_name} after max retries") + try: + operation = delete_instance(compute, project, zone, vm_name) + # Reset retry count and update operation + pending.retry_count = 0 + pending.operation_name = operation.get('name', f'force-{vm_name}') + pending.created_at = datetime.datetime.now() + safe_db_commit(db, f"force delete for orphan {vm_name}") + orphaned_count += 1 + except Exception as e: + if 'notFound' in str(e) or '404' in str(e): + db.delete(pending) + safe_db_commit(db, f"removing stale pending for {vm_name}") + else: + log.error(f"Force delete failed for {vm_name}: {e}") + # else: still being handled by verify_pending_deletions + else: + # Not tracked at all - orphaned VM found + log.warning(f"Orphan scan: Found untracked VM {vm_name}, initiating deletion") + try: + operation = delete_instance(compute, project, zone, vm_name) + op_name = operation.get('name', f'orphan-{vm_name}') + # Track it for verification + new_pending = PendingDeletion(vm_name, op_name) + db.add(new_pending) + safe_db_commit(db, f"tracking orphan deletion for {vm_name}") + orphaned_count += 1 + except Exception as e: + if 'notFound' not in str(e) and '404' not in str(e): + log.error(f"Failed to delete orphan VM {vm_name}: {e}") + + if orphaned_count > 0: + log.info(f"Orphan scan: Initiated deletion for {orphaned_count} orphaned VMs") + + def get_config_for_gcp_instance(vm_name, source_disk_image, metadata_items) -> Dict: """ Get VM config for new VM instance. 
@@ -1972,19 +2247,23 @@ def update_final_status(): log.error(f"Test {test_id}: Failed to update final GitHub status after retries: {e}") if status in [TestStatus.completed, TestStatus.canceled]: - # Delete the current instance (fire-and-forget) - # We intentionally don't wait for the deletion to complete because: + # Delete the current instance with tracking for verification + # We don't wait for the deletion to complete because: # 1. Waiting can take 60+ seconds, exceeding nginx/gunicorn timeouts (502 errors) - # 2. The deletion will complete eventually - we don't need confirmation + # 2. The verify_pending_deletions() cron job will confirm deletion succeeded # 3. All important work (test results, GitHub status) is already done from run import config compute = get_compute_service_object() zone = config.get('ZONE', '') project = config.get('PROJECT_NAME', '') vm_name = f"{test.platform.value}-{test.id}" - operation = delete_instance(compute, project, zone, vm_name) - op_name = operation.get('name', 'unknown') - log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (op: {op_name})") + try: + operation = delete_instance_with_tracking( + compute, project, zone, vm_name, g.db) + op_name = operation.get('name', 'unknown') + log.info(f"[Test: {test_id}] VM deletion initiated for {vm_name} (op: {op_name})") + except Exception as e: + log.error(f"[Test: {test_id}] Failed to delete VM {vm_name}: {e}") # If status is complete, remove the GCP Instance entry if status in [TestStatus.completed, TestStatus.canceled]: diff --git a/mod_ci/models.py b/mod_ci/models.py index 02619a25..1b5c9ebf 100644 --- a/mod_ci/models.py +++ b/mod_ci/models.py @@ -4,6 +4,7 @@ List of models corresponding to mysql tables: [ 'Gcp Instance' => 'gcp_instance', + 'Pending Deletion' => 'pending_deletion', 'Maintenance mode' => 'maintenance_mode' ] """ @@ -79,6 +80,42 @@ def __repr__(self) -> str: return f'' +class PendingDeletion(Base): + """Model to track pending VM deletion operations for 
verification.""" + + __tablename__ = 'pending_deletion' + __table_args__ = {'mysql_engine': 'InnoDB'} + vm_name = Column(String(64), primary_key=True) + operation_name = Column(String(128), nullable=False) + created_at = Column(DateTime(), nullable=False) + retry_count = Column(Integer, nullable=False, default=0) + + # Max retries before we give up and just try to force delete + MAX_RETRIES = 5 + + def __init__(self, vm_name, operation_name, created_at=None) -> None: + """ + Parametrized constructor for the PendingDeletion model. + + :param vm_name: The name of the VM being deleted + :type vm_name: str + :param operation_name: The GCP operation name/ID for tracking + :type operation_name: str + :param created_at: When the deletion was initiated (None for now) + :type created_at: datetime + """ + self.vm_name = vm_name + self.operation_name = operation_name + if created_at is None: + created_at = datetime.datetime.now() + self.created_at = created_at + self.retry_count = 0 + + def __repr__(self) -> str: + """Represent a PendingDeletion by its vm_name.""" + return f'<PendingDeletion {self.vm_name}>' + + class MaintenanceMode(Base): """Model to maintain maintenance status of platforms.""" diff --git a/tests/test_ci/test_controllers.py b/tests/test_ci/test_controllers.py index ac48ed0a..f1c4ba74 100644 --- a/tests/test_ci/test_controllers.py +++ b/tests/test_ci/test_controllers.py @@ -3546,3 +3546,255 @@ def test_permanent_failure_marks_test_failed( mock_mark_failed.assert_called_once() # Should log critical mock_log.critical.assert_called() + + +class TestPendingDeletionTracking(BaseTestCase): + """Tests for VM deletion tracking and verification functionality.""" + + @mock.patch('run.log') + def test_check_operation_status_success(self, mock_log): + """Test check_operation_status returns status correctly.""" + from mod_ci.controllers import check_operation_status + + compute = MagicMock() + zone_ops = compute.zoneOperations.return_value.get.return_value + zone_ops.execute.return_value = {'status': 
'DONE'} + + result = check_operation_status(compute, "project", "zone", "op-123") + + self.assertEqual(result['status'], 'DONE') + + @mock.patch('run.log') + def test_check_operation_status_error(self, mock_log): + """Test check_operation_status handles API errors.""" + from mod_ci.controllers import check_operation_status + + compute = MagicMock() + zone_ops = compute.zoneOperations.return_value.get.return_value + zone_ops.execute.side_effect = Exception("API Error") + + result = check_operation_status(compute, "project", "zone", "op-123") + + self.assertEqual(result['status'], 'ERROR') + self.assertIn('error', result) + + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('run.log') + def test_delete_instance_with_tracking_success( + self, mock_log, mock_delete, mock_commit): + """Test delete_instance_with_tracking records pending deletion.""" + from mod_ci.controllers import delete_instance_with_tracking + from mod_ci.models import PendingDeletion + + mock_delete.return_value = {'name': 'op-123'} + mock_commit.return_value = True + + db = MagicMock() + result = delete_instance_with_tracking( + MagicMock(), "project", "zone", "linux-42", db) + + self.assertEqual(result['name'], 'op-123') + # Verify PendingDeletion was added + db.add.assert_called_once() + added_obj = db.add.call_args[0][0] + self.assertIsInstance(added_obj, PendingDeletion) + self.assertEqual(added_obj.vm_name, 'linux-42') + self.assertEqual(added_obj.operation_name, 'op-123') + + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('run.log') + def test_delete_instance_with_tracking_failure( + self, mock_log, mock_delete, mock_commit): + """Test delete_instance_with_tracking handles failures.""" + from mod_ci.controllers import delete_instance_with_tracking + from mod_ci.models import PendingDeletion + + mock_delete.side_effect = Exception("GCP Error") + 
mock_commit.return_value = True + + db = MagicMock() + with self.assertRaises(Exception): + delete_instance_with_tracking( + MagicMock(), "project", "zone", "linux-42", db) + + # Should still record the failed attempt + db.add.assert_called_once() + added_obj = db.add.call_args[0][0] + self.assertIsInstance(added_obj, PendingDeletion) + self.assertTrue(added_obj.operation_name.startswith('failed-')) + + @mock.patch('mod_ci.controllers.PendingDeletion') + @mock.patch('mod_ci.controllers.check_operation_status') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('run.log') + def test_verify_pending_deletions_success( + self, mock_log, mock_commit, mock_check, mock_pending_class): + """Test verify_pending_deletions removes successful deletions.""" + from mod_ci.controllers import verify_pending_deletions + + # Create mock pending deletion + mock_pending = MagicMock() + mock_pending.vm_name = 'linux-42' + mock_pending.operation_name = 'op-123' + mock_pending.retry_count = 0 + mock_pending_class.query.all.return_value = [mock_pending] + + # Operation completed successfully + mock_check.return_value = {'status': 'DONE'} + mock_commit.return_value = True + + db = MagicMock() + verify_pending_deletions(MagicMock(), "project", "zone", db) + + # Should delete the pending record + db.delete.assert_called_once_with(mock_pending) + mock_commit.assert_called() + + @mock.patch('mod_ci.controllers.PendingDeletion') + @mock.patch('mod_ci.controllers.check_operation_status') + @mock.patch('mod_ci.controllers._retry_deletion') + @mock.patch('run.log') + def test_verify_pending_deletions_retries_on_error( + self, mock_log, mock_retry, mock_check, mock_pending_class): + """Test verify_pending_deletions retries failed deletions.""" + from mod_ci.controllers import verify_pending_deletions + + # Create mock pending deletion + mock_pending = MagicMock() + mock_pending.vm_name = 'linux-42' + mock_pending.operation_name = 'op-123' + mock_pending.retry_count = 0 + 
mock_pending_class.query.all.return_value = [mock_pending] + + # Operation failed + mock_check.return_value = { + 'status': 'DONE', + 'error': {'errors': [{'message': 'Permission denied'}]} + } + + db = MagicMock() + verify_pending_deletions(MagicMock(), "project", "zone", db) + + # Should retry + mock_retry.assert_called_once() + + @mock.patch('mod_ci.controllers.GcpInstance') + @mock.patch('mod_ci.controllers.get_running_instances') + @mock.patch('mod_ci.controllers.is_instance_testing') + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('run.log') + def test_scan_for_orphaned_vms_finds_orphan( + self, mock_log, mock_commit, mock_delete, mock_is_testing, + mock_get_running, mock_gcp_class): + """Test scan_for_orphaned_vms finds and deletes orphaned VMs.""" + from mod_ci.controllers import scan_for_orphaned_vms + from mod_ci.models import PendingDeletion + + # Mock running VM + mock_get_running.return_value = [{'name': 'linux-42'}] + mock_is_testing.return_value = True + + # No GcpInstance record (orphaned) + mock_gcp_class.query.filter.return_value.first.return_value = None + # Not tracked in PendingDeletion - use real query + with mock.patch.object(PendingDeletion, 'query') as mock_query: + mock_query.filter.return_value.first.return_value = None + + mock_delete.return_value = {'name': 'op-orphan-42'} + mock_commit.return_value = True + + db = MagicMock() + scan_for_orphaned_vms(MagicMock(), "project", "zone", db) + + # Should delete the orphan + mock_delete.assert_called_once() + # Should track it + db.add.assert_called_once() + added_obj = db.add.call_args[0][0] + self.assertIsInstance(added_obj, PendingDeletion) + self.assertEqual(added_obj.vm_name, 'linux-42') + + @mock.patch('mod_ci.controllers.PendingDeletion') + @mock.patch('mod_ci.controllers.GcpInstance') + @mock.patch('mod_ci.controllers.get_running_instances') + @mock.patch('mod_ci.controllers.is_instance_testing') + 
@mock.patch('run.log') + def test_scan_for_orphaned_vms_ignores_active( + self, mock_log, mock_is_testing, mock_get_running, + mock_gcp_class, mock_pending_class): + """Test scan_for_orphaned_vms ignores VMs with GcpInstance records.""" + from mod_ci.controllers import scan_for_orphaned_vms + + # Mock running VM + mock_get_running.return_value = [{'name': 'linux-42'}] + mock_is_testing.return_value = True + + # Has GcpInstance record (active test) + mock_gcp_class.query.filter.return_value.first.return_value = MagicMock() + + db = MagicMock() + scan_for_orphaned_vms(MagicMock(), "project", "zone", db) + + # Should NOT track or delete + db.add.assert_not_called() + + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('run.log') + def test_retry_deletion_success(self, mock_log, mock_commit, mock_delete): + """Test _retry_deletion successfully retries.""" + from mod_ci.controllers import _retry_deletion + from mod_ci.models import PendingDeletion + + mock_delete.return_value = {'name': 'op-retry-123'} + mock_commit.return_value = True + + pending = PendingDeletion('linux-42', 'op-failed') + pending.retry_count = 0 + + db = MagicMock() + _retry_deletion(MagicMock(), "project", "zone", "linux-42", pending, db, mock_log) + + mock_delete.assert_called_once() + self.assertEqual(pending.retry_count, 1) + self.assertEqual(pending.operation_name, 'op-retry-123') + + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('run.log') + def test_retry_deletion_max_retries(self, mock_log, mock_delete): + """Test _retry_deletion stops after max retries.""" + from mod_ci.controllers import _retry_deletion + from mod_ci.models import PendingDeletion + + pending = PendingDeletion('linux-42', 'op-failed') + pending.retry_count = PendingDeletion.MAX_RETRIES + + db = MagicMock() + _retry_deletion(MagicMock(), "project", "zone", "linux-42", pending, db, mock_log) + + # Should NOT attempt delete + 
mock_delete.assert_not_called() + mock_log.error.assert_called() + + @mock.patch('mod_ci.controllers.delete_instance') + @mock.patch('mod_ci.controllers.safe_db_commit') + @mock.patch('run.log') + def test_retry_deletion_vm_not_found(self, mock_log, mock_commit, mock_delete): + """Test _retry_deletion handles 404 (VM already deleted).""" + from mod_ci.controllers import _retry_deletion + from mod_ci.models import PendingDeletion + + mock_delete.side_effect = Exception("notFound: VM not found") + mock_commit.return_value = True + + pending = PendingDeletion('linux-42', 'op-failed') + pending.retry_count = 0 + + db = MagicMock() + _retry_deletion(MagicMock(), "project", "zone", "linux-42", pending, db, mock_log) + + # Should remove the pending record (VM is gone) + db.delete.assert_called_once_with(pending)