From bc4c05e9f2351c5b3d356f72ecbce169447c38d6 Mon Sep 17 00:00:00 2001 From: Jonathan Gangi Date: Mon, 6 Oct 2025 17:03:12 -0300 Subject: [PATCH 1/3] Azure: rework how conflict detectino works This commit changes the existing code of `AzureService.ensure_can_publish` to behave as follows: 1. It no longer retries on error but simply raises whenever a conflict is detected 2. It no longer searches only for "live" or "preview" but for all existing submissions 3. It improves the logging mechanism to inform which state the submission was and its target. Rationale: 1. If we simply wait for the publishing to finish as the previous version (using retry) we could end up in a scenario where the changes made by the library are no longer the latest, causing the Graph API to return the error: `The submission cannot be pushed to as its not the latest` With that, we would need to retry the whole `publish` operations so it makes more sense to just raise `ConflictError` and let `pubtools` retry when this error is caught. 2. The previous code were not able to catch a lot of conflicts, resulting in many errors with `An In Progress submission already exists` By looking into each submission target we may caught conflicts early. 3. Logs will now show exactly what was the reason it was already in progress and for which submission target. 4. A retry can/will be implemented on `pubtools-marketplacesvm` on Azure `push` in case a `ConflictError` is detected. 5. Replaces !119 Refers to SPSTRAT-611 and SPSTRAT-549 Signed-off-by: Jonathan Gangi --- cloudpub/error.py | 4 + cloudpub/ms_azure/service.py | 25 ++--- tests/ms_azure/test_service.py | 183 +++++++++++++++++---------------- 3 files changed, 112 insertions(+), 100 deletions(-) diff --git a/cloudpub/error.py b/cloudpub/error.py index b3dd908..70d9aef 100644 --- a/cloudpub/error.py +++ b/cloudpub/error.py @@ -21,5 +21,9 @@ class NotFoundError(ValueError): """Represent a missing resource.""" +class ConflictError(RuntimeError): + """Report a submission conflict error.""" + + class Timeout(Exception): """Represent a missing resource.""" diff --git a/cloudpub/ms_azure/service.py b/cloudpub/ms_azure/service.py index 46d4434..6f5f114 100644 --- a/cloudpub/ms_azure/service.py +++ b/cloudpub/ms_azure/service.py @@ -12,7 +12,7 @@ from tenacity.wait import wait_chain, wait_fixed from cloudpub.common import BaseService -from cloudpub.error import InvalidStateError, NotFoundError +from cloudpub.error import ConflictError, InvalidStateError, NotFoundError from cloudpub.models.ms_azure import ( RESOURCE_MAPING, AzureResource, @@ -467,31 +467,28 @@ def submit_to_status( log.debug("Set the status \"%s\" to submission.", status) return self.configure(resources=cfg_res) - @retry( - wait=wait_fixed(300), - stop=stop_after_delay(max_delay=60 * 60 * 24 * 7), # Give up after retrying for 7 days, - reraise=True, - ) def ensure_can_publish(self, product_id: str) -> None: """ Ensure the offer is not already being published. - It will wait for up to 7 days retrying to make sure it's possible to publish before - giving up and raising. + It will raise ConflictError if a publish is already in progress in any submission target. Args: product_id (str) The product ID to check the offer's publishing status Raises: - RuntimeError: whenever a publishing is already in progress. + ConflictError: whenever a publishing is already in progress for any submission target. """ log.info("Ensuring no other publishing jobs are in progress for \"%s\"", product_id) - submission_targets = ["preview", "live"] - for target in submission_targets: - sub = self.get_submission_state(product_id, state=target) - if sub and sub.status and sub.status == "running": - raise RuntimeError(f"The offer {product_id} is already being published to {target}") + for sub in self.get_submissions(product_id): + if sub and sub.status and sub.status != "completed": + msg = ( + f"The offer {product_id} is already being published to " + f"{sub.target.targetType}: {sub.status}/{sub.result}" + ) + log.error(msg) + raise ConflictError(msg) def get_plan_tech_config(self, product: Product, plan: PlanSummary) -> VMIPlanTechConfig: """ diff --git a/tests/ms_azure/test_service.py b/tests/ms_azure/test_service.py index e578d19..b11069f 100644 --- a/tests/ms_azure/test_service.py +++ b/tests/ms_azure/test_service.py @@ -10,10 +10,9 @@ from httmock import response from requests import Response from requests.exceptions import HTTPError -from tenacity.stop import stop_after_attempt from cloudpub.common import BaseService -from cloudpub.error import InvalidStateError, NotFoundError +from cloudpub.error import ConflictError, InvalidStateError, NotFoundError from cloudpub.models.ms_azure import ( ConfigureStatus, CustomerLeads, @@ -750,85 +749,35 @@ def test_submit_to_status_not_found( mock_configure.assert_not_called() - @pytest.mark.parametrize("target", ["preview", "live"]) - @mock.patch("cloudpub.ms_azure.AzureService.get_submission_state") + @mock.patch("cloudpub.ms_azure.AzureService.get_submissions") def test_ensure_can_publish_success( self, - mock_getsubst: mock.MagicMock, - target: str, + mock_getsubs: mock.MagicMock, azure_service: AzureService, ) -> None: - submission = { - "$schema": "https://product-ingestion.azureedge.net/schema/submission/2022-03-01-preview2", # noqa: E501 - "id": "submission/ffffffff-ffff-ffff-ffff-ffffffffffff/0", - "product": "product/ffffffff-ffff-ffff-ffff-ffffffffffff", - "target": {"targetType": target}, - "lifecycleState": "generallyAvailable", - "status": "completed", - "result": "succeeded", - "created": "2024-07-04T22:06:16.2895521Z", - } - mock_getsubst.return_value = ProductSubmission.from_json(submission) - azure_service.ensure_can_publish.retry.sleep = mock.MagicMock() # type: ignore - azure_service.ensure_can_publish.retry.stop = stop_after_attempt(1) # type: ignore - - azure_service.ensure_can_publish("ffffffff-ffff-ffff-ffff-ffffffffffff") - - # All targets are called by the method, it should pass all - mock_getsubst.assert_has_calls( - [ - mock.call("ffffffff-ffff-ffff-ffff-ffffffffffff", state="preview"), - mock.call("ffffffff-ffff-ffff-ffff-ffffffffffff", state="live"), - ] - ) - - @pytest.mark.parametrize("target", ["preview", "live"]) - @mock.patch("cloudpub.ms_azure.AzureService.get_submission_state") - def test_ensure_can_publish_success_after_retry( - self, - mock_getsubst: mock.MagicMock, - target: str, - azure_service: AzureService, - ) -> None: - running = { - "$schema": "https://product-ingestion.azureedge.net/schema/submission/2022-03-01-preview2", # noqa: E501 - "id": "submission/ffffffff-ffff-ffff-ffff-ffffffffffff/0", - "product": "product/ffffffff-ffff-ffff-ffff-ffffffffffff", - "target": {"targetType": target}, - "lifecycleState": "generallyAvailable", - "status": "running", - "result": "pending", - "created": "2024-07-04T22:06:16.2895521Z", - } - complete = { - "$schema": "https://product-ingestion.azureedge.net/schema/submission/2022-03-01-preview2", # noqa: E501 - "id": "submission/ffffffff-ffff-ffff-ffff-ffffffffffff/0", - "product": "product/ffffffff-ffff-ffff-ffff-ffffffffffff", - "target": {"targetType": target}, - "lifecycleState": "generallyAvailable", - "status": "completed", - "result": "succeeded", - "created": "2024-07-04T22:06:16.2895521Z", - } - mock_getsubst.side_effect = [ - ProductSubmission.from_json(running), - ProductSubmission.from_json(running), - ProductSubmission.from_json(complete), - ProductSubmission.from_json(complete), + submissions = [ + { + "$schema": "https://product-ingestion.azureedge.net/schema/submission/2022-03-01-preview2", # noqa: E501 + "id": "submission/ffffffff-ffff-ffff-ffff-ffffffffffff/0", + "product": "product/ffffffff-ffff-ffff-ffff-ffffffffffff", + "target": {"targetType": tgt}, + "lifecycleState": "generallyAvailable", + "status": "completed", + "result": "succeeded", + "created": "2024-07-04T22:06:16.2895521Z", + } + for tgt in ["draft", "preview", "live"] ] - azure_service.ensure_can_publish.retry.sleep = mock.MagicMock() # type: ignore - azure_service.ensure_can_publish.retry.stop = stop_after_attempt(3) # type: ignore + mock_getsubs.return_value = [ProductSubmission.from_json(s) for s in submissions] azure_service.ensure_can_publish("ffffffff-ffff-ffff-ffff-ffffffffffff") - - # Calls for "live" and "preview" for 2 times before success == 4 - assert mock_getsubst.call_count == 4 + mock_getsubs.assert_called_once() @pytest.mark.parametrize("target", ["preview", "live"]) - @mock.patch("cloudpub.ms_azure.AzureService.get_submission_state") + @mock.patch("cloudpub.ms_azure.AzureService.get_submissions") def test_ensure_can_publish_raises( self, - mock_getsubst: mock.MagicMock, + mock_getsubs: mock.MagicMock, target: str, azure_service: AzureService, ) -> None: @@ -856,17 +805,13 @@ def test_ensure_can_publish_raises( "result": "pending", "created": "2024-07-04T22:06:16.2895521Z", } - if target == "preview": - subs = [ProductSubmission.from_json(sub2), ProductSubmission.from_json(sub1)] - else: - subs = [ProductSubmission.from_json(sub1), ProductSubmission.from_json(sub2)] - mock_getsubst.side_effect = subs + subs = [ProductSubmission.from_json(sub1), ProductSubmission.from_json(sub2)] + mock_getsubs.return_value = subs err = ( - f"The offer ffffffff-ffff-ffff-ffff-ffffffffffff is already being published to {target}" + "The offer ffffffff-ffff-ffff-ffff-ffffffffffff is already being published to " + f"{target}: running/pending" ) - azure_service.ensure_can_publish.retry.sleep = mock.MagicMock() # type: ignore - azure_service.ensure_can_publish.retry.stop = stop_after_attempt(1) # type: ignore with pytest.raises(RuntimeError, match=err): azure_service.ensure_can_publish("ffffffff-ffff-ffff-ffff-ffffffffffff") @@ -1002,6 +947,74 @@ def test_publish_live_fail_on_retry( with pytest.raises(RuntimeError, match=expected_err): azure_service._publish_live(product_obj, "test-product") + @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") + @mock.patch("cloudpub.ms_azure.AzureService.get_productid") + @mock.patch("cloudpub.ms_azure.AzureService.configure") + def test_publish_live_fail_conflict( + self, + mock_configure: mock.MagicMock, + mock_get_productid: mock.MagicMock, + mock_compute_targets: mock.MagicMock, + token: Dict[str, Any], + auth_dict: Dict[str, Any], + configure_success_response: Dict[str, Any], + product: Dict[str, Any], + products_list: Dict[str, Any], + product_summary: Dict[str, Any], + technical_config: Dict[str, Any], + submission: Dict[str, Any], + product_summary_obj: ProductSummary, + plan_summary_obj: PlanSummary, + metadata_azure_obj: mock.MagicMock, + gen2_image: Dict[str, Any], + caplog: pytest.LogCaptureFixture, + ) -> None: + """Ensure operation is aborted when a ConflictError occurs.""" + # Prepare testing data + metadata_azure_obj.keepdraft = False + metadata_azure_obj.destination = "example-product/plan-1" + metadata_azure_obj.modular_push = True + mock_get_productid.return_value = "fake-id" + targets = ["preview", "live", "draft"] + mock_compute_targets.return_value = targets + + # Set the submission states with conflict on preview + submission_preview = deepcopy(submission) + submission_preview.update( + {"target": {"targetType": "preview"}, "status": "running", "result": "pending"} + ) + submission_live = deepcopy(submission) + submission_live.update({"target": {"targetType": "live"}}) + mock_configure.return_value = ConfigureStatus.from_json(configure_success_response) + + # Expected error + err = ( + "The offer ffffffff-ffff-ffff-ffff-ffffffffffff is already being published" + " to preview: running/pending" + ) + + # Constants + login_url = "https://login.microsoftonline.com/foo/oauth2/token" + base_url = "https://graph.microsoft.com/rp/product-ingestion" + product_id = str(product_summary['id']).split("/")[-1] + + # Test + with caplog.at_level(logging.INFO): + with requests_mock.Mocker() as m: + m.post(login_url, json=token) + m.get(f"{base_url}/product", json=products_list) + m.get(f"{base_url}/resource-tree/product/{product_id}", json=product) + m.get( + f"{base_url}/submission/{product_id}", + [ + {"json": {"value": [submission, submission_preview, submission_live]}}, + ], + ) + azure_svc = AzureService(auth_dict) + + with pytest.raises(ConflictError, match=err): + azure_svc.publish(metadata=metadata_azure_obj) + @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.configure") @@ -1651,12 +1664,14 @@ def test_publish_live_arm64_only( mock_submit.assert_has_calls(submit_calls) mock_ensure_publish.assert_called_once_with(product_obj.id) + @mock.patch("cloudpub.ms_azure.AzureService.ensure_can_publish") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") def test_publish_live_when_state_is_preview( self, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_ensure_publish: mock.MagicMock, token: Dict[str, Any], auth_dict: Dict[str, Any], configure_running_response: Dict[str, Any], @@ -1714,8 +1729,6 @@ def test_publish_live_when_state_is_preview( m.get( f"{base_url}/submission/{product_id}", [ - {"json": submissions_inprog}, # ensure_can_publish call "preview" - {"json": submissions_inprog}, # ensure_can_publish call "live" {"json": submissions_inprog}, # _is_submission_in_preview call {"json": submissions_inprog}, # submit_to_status check prev_state call {"json": submissions_final}, # submit_to_status validation after configure @@ -1749,10 +1762,6 @@ def test_publish_live_when_state_is_preview( 'Requesting the product ID "ffffffff-ffff-ffff-ffff-ffffffffffff" with state "preview".' in caplog.text ) - assert ( - 'Ensuring no other publishing jobs are in progress for "ffffffff-ffff-ffff-ffff-ffffffffffff"' # noqa: E501 - in caplog.text - ) assert ( 'Looking up for submission in state "preview" for "ffffffff-ffff-ffff-ffff-ffffffffffff"' # noqa: E501 in caplog.text @@ -1782,7 +1791,9 @@ def test_publish_live_when_state_is_preview( 'Updating the technical configuration for "example-product/plan-1" on "preview".' not in caplog.text ) + mock_ensure_publish.assert_called_once() + @mock.patch("cloudpub.ms_azure.AzureService.ensure_can_publish") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.configure") @@ -1791,6 +1802,7 @@ def test_publish_live_modular_push( mock_configure: mock.MagicMock, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_ensure_publish: mock.MagicMock, token: Dict[str, Any], auth_dict: Dict[str, Any], configure_success_response: Dict[str, Any], @@ -1861,8 +1873,6 @@ def test_publish_live_modular_push( m.get( f"{base_url}/submission/{product_id}", [ - {"json": {"value": [submission]}}, # ensure_can_publish call "preview" - {"json": {"value": [submission]}}, # ensure_can_publish call "live" {"json": {"value": [submission]}}, # push_preview: call submit_status {"json": {"value": [submission_preview]}}, # push_preview: check result {"json": {"value": [submission_preview]}}, # push_live: call submit_status @@ -1886,6 +1896,7 @@ def test_publish_live_modular_push( 'Performing a modular push to "preview" for "ffffffff-ffff-ffff-ffff-ffffffffffff"' in caplog.text ) + mock_ensure_publish.assert_called_once() # Configure request mock_configure.assert_has_calls( From e09bd83267fec36863dd4cf6a85ff7a963698db5 Mon Sep 17 00:00:00 2001 From: Jonathan Gangi Date: Tue, 7 Oct 2025 15:18:58 -0300 Subject: [PATCH 2/3] Azure: wait for active publishing before executing THis commit creates a new method for `AzureService` named `wait_active_publishing` which relies on `ensure_can_publish` to verify whether changes are being made or not and wait until it's possible to proceed or timeout if necessary. The timeout and retry interval are now possible to be set during the class construction as optional parameters. With this the `publish` method will double check before doing any changes: 1. Before loading the product information it will wait for active changes so it can retrieve the "latest" content 2. During the publishing phase, if it's no longer the latest change it will raise from `ensure_can_publish` Refers to SPSTRAT-611 and SPSTRAT-549 Signed-off-by: Jonathan Gangi --- cloudpub/ms_azure/service.py | 42 ++++++++++++++++++++--- tests/ms_azure/conftest.py | 2 +- tests/ms_azure/test_service.py | 63 +++++++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 6 deletions(-) diff --git a/cloudpub/ms_azure/service.py b/cloudpub/ms_azure/service.py index 6f5f114..fb22d3a 100644 --- a/cloudpub/ms_azure/service.py +++ b/cloudpub/ms_azure/service.py @@ -6,13 +6,13 @@ from deepdiff import DeepDiff from requests import HTTPError -from tenacity import retry -from tenacity.retry import retry_if_result +from tenacity import RetryError, Retrying, retry +from tenacity.retry import retry_if_exception_type, retry_if_result from tenacity.stop import stop_after_attempt, stop_after_delay from tenacity.wait import wait_chain, wait_fixed from cloudpub.common import BaseService -from cloudpub.error import ConflictError, InvalidStateError, NotFoundError +from cloudpub.error import ConflictError, InvalidStateError, NotFoundError, Timeout from cloudpub.models.ms_azure import ( RESOURCE_MAPING, AzureResource, @@ -82,18 +82,31 @@ class AzureService(BaseService[AzurePublishingMetadata]): CONFIGURE_SCHEMA = "https://schema.mp.microsoft.com/schema/configure/{AZURE_API_VERSION}" DIFF_EXCLUDES = [r"root\['resources'\]\[[0-9]+\]\['url'\]"] - def __init__(self, credentials: Dict[str, str]): + def __init__( + self, + credentials: Dict[str, str], + retry_interval: Union[int, float] = 300, + retry_timeout: Union[int, float] = 3600 * 24 * 7, + ): """ Create a new AuzureService object. Args: credentials (dict) Dictionary with Azure credentials to authenticate on Product Ingestion API. + retry_interval (int, float) + The wait time interval in seconds for retrying jobs. + Defaults to 300 + retry_timeout (int, float) + The max time in seconds to attempt retries. + Defaults to 7 days. """ self.session = PartnerPortalSession.make_graph_api_session( auth_keys=credentials, schema_version=self.AZURE_SCHEMA_VERSION ) self._products: List[ProductSummary] = [] + self.retry_interval = retry_interval + self.retry_timeout = retry_timeout def _configure(self, data: Dict[str, Any]) -> ConfigureStatus: """ @@ -490,6 +503,26 @@ def ensure_can_publish(self, product_id: str) -> None: log.error(msg) raise ConflictError(msg) + def wait_active_publishing(self, product_id: str) -> None: + """ + Wait when there's an existing submission in progress. + + Args: + product_id (str) + The product ID of to verify the submissions state. + """ + r = Retrying( + retry=retry_if_exception_type(ConflictError), + wait=wait_fixed(self.retry_interval), + stop=stop_after_delay(max_delay=self.retry_timeout), + ) + log.info("Checking for active changes on %s.", product_id) + + try: + r(self.ensure_can_publish, product_id) + except RetryError: + self._raise_error(Timeout, f"Timed out waiting for {product_id} to be unlocked") + def get_plan_tech_config(self, product: Product, plan: PlanSummary) -> VMIPlanTechConfig: """ Return the VMIPlanTechConfig resource for the given product/plan. @@ -815,6 +848,7 @@ def publish(self, metadata: AzurePublishingMetadata) -> None: plan_name = metadata.destination.split("/")[-1] product_id = self.get_productid(product_name) disk_version = None + self.wait_active_publishing(product_id=product_id) log.info( "Preparing to associate the image \"%s\" with the plan \"%s\" from product \"%s\"", metadata.image_path, diff --git a/tests/ms_azure/conftest.py b/tests/ms_azure/conftest.py index f24f86a..8121028 100644 --- a/tests/ms_azure/conftest.py +++ b/tests/ms_azure/conftest.py @@ -56,7 +56,7 @@ def auth_dict() -> Dict[str, str]: @mock.patch("cloudpub.ms_azure.service.PartnerPortalSession") def azure_service(auth_dict: Dict[str, str]) -> AzureService: """Return an instance of AzureService with mocked PartnerPortalSession.""" - return AzureService(auth_dict) + return AzureService(auth_dict, retry_interval=0, retry_timeout=10) def job_details(status: str, result: str, errors: List[Dict[str, Any]]) -> Dict[str, Any]: diff --git a/tests/ms_azure/test_service.py b/tests/ms_azure/test_service.py index b11069f..252ca25 100644 --- a/tests/ms_azure/test_service.py +++ b/tests/ms_azure/test_service.py @@ -12,7 +12,7 @@ from requests.exceptions import HTTPError from cloudpub.common import BaseService -from cloudpub.error import ConflictError, InvalidStateError, NotFoundError +from cloudpub.error import ConflictError, InvalidStateError, NotFoundError, Timeout from cloudpub.models.ms_azure import ( ConfigureStatus, CustomerLeads, @@ -816,6 +816,37 @@ def test_ensure_can_publish_raises( with pytest.raises(RuntimeError, match=err): azure_service.ensure_can_publish("ffffffff-ffff-ffff-ffff-ffffffffffff") + @mock.patch("cloudpub.ms_azure.AzureService.ensure_can_publish") + def test_wait_active_publishing_success( + self, mock_ensure_publish: mock.MagicMock, azure_service: AzureService + ): + # The test will simlulate 3 submissoins in progress to wait for + mock_ensure_publish.side_effect = [ + ConflictError("Submission in progress"), + ConflictError("Submission in progress"), + ConflictError("Submission in progress"), + None, + ] + + # Test + azure_service.wait_active_publishing("fake-product") + mock_ensure_publish.assert_has_calls([mock.call("fake-product") for _ in range(4)]) + + @mock.patch("cloudpub.ms_azure.AzureService.ensure_can_publish") + def test_wait_active_publishing_timeout( + self, mock_ensure_publish: mock.MagicMock, azure_service: AzureService + ) -> None: + mock_ensure_publish.side_effect = [ + ConflictError("Submission in progress") for _ in range(15) + ] + err = "Timed out waiting for fake-product to be unlocked" + azure_service.retry_interval = 0.1 + azure_service.retry_timeout = 0.5 + + # Test + with pytest.raises(Timeout, match=err): + azure_service.wait_active_publishing("fake-product") + @mock.patch("cloudpub.ms_azure.AzureService.get_submission_state") @mock.patch("cloudpub.ms_azure.AzureService.submit_to_status") @mock.patch("cloudpub.ms_azure.AzureService._is_submission_in_preview") @@ -947,6 +978,7 @@ def test_publish_live_fail_on_retry( with pytest.raises(RuntimeError, match=expected_err): azure_service._publish_live(product_obj, "test-product") + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.configure") @@ -955,6 +987,7 @@ def test_publish_live_fail_conflict( mock_configure: mock.MagicMock, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_wait_publish: mock.MagicMock, token: Dict[str, Any], auth_dict: Dict[str, Any], configure_success_response: Dict[str, Any], @@ -1014,7 +1047,9 @@ def test_publish_live_fail_conflict( with pytest.raises(ConflictError, match=err): azure_svc.publish(metadata=metadata_azure_obj) + mock_wait_publish.assert_called_once() + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.configure") @@ -1037,6 +1072,7 @@ def test_publish_overwrite( mock_configure: mock.MagicMock, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_wait_publish: mock.MagicMock, product_obj: Product, plan_summary_obj: PlanSummary, metadata_azure_obj: AzurePublishingMetadata, @@ -1063,6 +1099,7 @@ def test_publish_overwrite( azure_service.publish(metadata_azure_obj) + mock_wait_publish.assert_called_once() mock_getprpl_name.assert_called_once_with("example-product", "plan-1", 'draft') mock_filter.assert_called_once_with( product=product_obj, resource="virtual-machine-plan-technical-configuration" @@ -1079,6 +1116,7 @@ def test_publish_overwrite( mock_configure.assert_called_once_with(resources=[technical_config_obj]) mock_submit.assert_not_called() + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.configure") @@ -1101,6 +1139,7 @@ def test_publish_nodiskversion( mock_configure: mock.MagicMock, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_wait_publish: mock.MagicMock, product_obj: Product, plan_summary_obj: PlanSummary, metadata_azure_obj: AzurePublishingMetadata, @@ -1135,6 +1174,7 @@ def test_publish_nodiskversion( azure_service.publish(metadata_azure_obj) + mock_wait_publish.assert_called_once() mock_getprpl_name.assert_has_calls( [mock.call("example-product", "plan-1", tgt) for tgt in targets] ) @@ -1164,6 +1204,7 @@ def test_publish_nodiskversion( mock_submit.assert_not_called() @pytest.mark.parametrize("keepdraft", [True, False], ids=["nochannel", "push"]) + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.configure") @@ -1188,6 +1229,7 @@ def test_publish_saspresent( mock_configure: mock.MagicMock, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_wait_publish: mock.MagicMock, keepdraft: bool, product_obj: Product, plan_summary_obj: PlanSummary, @@ -1211,6 +1253,7 @@ def test_publish_saspresent( azure_service.publish(metadata_azure_obj) + mock_wait_publish.assert_called_once() mock_getprpl_name.assert_called_once_with("example-product", "plan-1", "preview") mock_filter.assert_has_calls( [ @@ -1230,6 +1273,7 @@ def test_publish_saspresent( mock_configure.assert_not_called() mock_submit.assert_not_called() + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.configure") @@ -1250,6 +1294,7 @@ def test_publish_novmimages( mock_configure: mock.MagicMock, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_wait_publish: mock.MagicMock, product_obj: Product, plan_summary_obj: PlanSummary, metadata_azure_obj: AzurePublishingMetadata, @@ -1291,6 +1336,7 @@ def test_publish_novmimages( azure_service.publish(metadata_azure_obj) + mock_wait_publish.assert_called_once() mock_getprpl_name.assert_has_calls( [mock.call("example-product", "plan-1", tgt) for tgt in targets] ) @@ -1317,6 +1363,7 @@ def test_publish_novmimages( mock_configure.assert_called_once_with(resources=[expected_tech_config]) mock_submit.assert_not_called() + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.configure") @@ -1337,6 +1384,7 @@ def test_publish_disk_has_images( mock_configure: mock.MagicMock, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_wait_publish: mock.MagicMock, product_obj: Product, plan_summary_obj: PlanSummary, metadata_azure_obj: AzurePublishingMetadata, @@ -1377,6 +1425,7 @@ def test_publish_disk_has_images( azure_service.publish(metadata_azure_obj) + mock_wait_publish.assert_called_once() mock_getprpl_name.assert_has_calls( [mock.call("example-product", "plan-1", tgt) for tgt in targets] ) @@ -1447,6 +1496,7 @@ def test_is_submission_in_preview( assert res is True mock_substt.assert_called_once_with(current.product_id, "live") + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.ensure_can_publish") @@ -1473,6 +1523,7 @@ def test_publish_live_x64_only( mock_ensure_publish: mock.MagicMock, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_wait_publish: mock.MagicMock, product_obj: Product, plan_summary_obj: PlanSummary, metadata_azure_obj: AzurePublishingMetadata, @@ -1521,6 +1572,7 @@ def test_publish_live_x64_only( # Test azure_service.publish(metadata_azure_obj) + mock_wait_publish.assert_called_once() mock_getprpl_name.assert_has_calls( [mock.call("example-product", "plan-1", tgt) for tgt in targets] ) @@ -1555,6 +1607,7 @@ def test_publish_live_x64_only( mock_submit.assert_has_calls(submit_calls) mock_ensure_publish.assert_called_once_with(product_obj.id) + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @mock.patch("cloudpub.ms_azure.AzureService.ensure_can_publish") @@ -1581,6 +1634,7 @@ def test_publish_live_arm64_only( mock_ensure_publish: mock.MagicMock, mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, + mock_wait_publish: mock.MagicMock, product_obj: Product, plan_summary_obj: PlanSummary, metadata_azure_obj: AzurePublishingMetadata, @@ -1630,6 +1684,7 @@ def test_publish_live_arm64_only( # Test azure_service.publish(metadata_azure_obj) + mock_wait_publish.assert_called_once() mock_getprpl_name.assert_has_calls( [mock.call("example-product", "plan-1", tgt) for tgt in targets] ) @@ -1664,6 +1719,7 @@ def test_publish_live_arm64_only( mock_submit.assert_has_calls(submit_calls) mock_ensure_publish.assert_called_once_with(product_obj.id) + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.ensure_can_publish") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @@ -1672,6 +1728,7 @@ def test_publish_live_when_state_is_preview( mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, mock_ensure_publish: mock.MagicMock, + mock_wait_publish: mock.MagicMock, token: Dict[str, Any], auth_dict: Dict[str, Any], configure_running_response: Dict[str, Any], @@ -1791,8 +1848,10 @@ def test_publish_live_when_state_is_preview( 'Updating the technical configuration for "example-product/plan-1" on "preview".' not in caplog.text ) + mock_wait_publish.assert_called_once() mock_ensure_publish.assert_called_once() + @mock.patch("cloudpub.ms_azure.AzureService.wait_active_publishing") @mock.patch("cloudpub.ms_azure.AzureService.ensure_can_publish") @mock.patch("cloudpub.ms_azure.AzureService.compute_targets") @mock.patch("cloudpub.ms_azure.AzureService.get_productid") @@ -1803,6 +1862,7 @@ def test_publish_live_modular_push( mock_get_productid: mock.MagicMock, mock_compute_targets: mock.MagicMock, mock_ensure_publish: mock.MagicMock, + mock_wait_publish: mock.MagicMock, token: Dict[str, Any], auth_dict: Dict[str, Any], configure_success_response: Dict[str, Any], @@ -1896,6 +1956,7 @@ def test_publish_live_modular_push( 'Performing a modular push to "preview" for "ffffffff-ffff-ffff-ffff-ffffffffffff"' in caplog.text ) + mock_wait_publish.assert_called_once() mock_ensure_publish.assert_called_once() # Configure request From da1a7505f17a2a6a015dc3743f776f1b7d002b3e Mon Sep 17 00:00:00 2001 From: Jonathan Gangi Date: Tue, 7 Oct 2025 16:02:55 -0300 Subject: [PATCH 3/3] Azure: make wait_for_job_completion retry uniform Use the same retry values declared in the `AzureService` constructor for `_wait_for_job_completion` Signed-off-by: Jonathan Gangi --- cloudpub/ms_azure/service.py | 48 ++++++++++++++++++++++------------ tests/ms_azure/test_service.py | 25 +++++++++++++----- 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/cloudpub/ms_azure/service.py b/cloudpub/ms_azure/service.py index fb22d3a..c76221c 100644 --- a/cloudpub/ms_azure/service.py +++ b/cloudpub/ms_azure/service.py @@ -9,7 +9,7 @@ from tenacity import RetryError, Retrying, retry from tenacity.retry import retry_if_exception_type, retry_if_result from tenacity.stop import stop_after_attempt, stop_after_delay -from tenacity.wait import wait_chain, wait_fixed +from tenacity.wait import wait_fixed from cloudpub.common import BaseService from cloudpub.error import ConflictError, InvalidStateError, NotFoundError, Timeout @@ -165,15 +165,27 @@ def _query_job_details(self, job_id: str) -> ConfigureStatus: log.debug("Query Job details response: %s", parsed_resp) return parsed_resp - @retry( - retry=retry_if_result(predicate=is_azure_job_not_complete), - wait=wait_chain( - *[wait_fixed(wait=60)] # First wait for 1 minute # noqa: W503 - + [wait_fixed(wait=10 * 60)] # Then wait for 10 minutes # noqa: W503 - + [wait_fixed(wait=30 * 60)] # Finally wait each 30 minutes # noqa: W503 - ), - stop=stop_after_delay(max_delay=60 * 60 * 24 * 7), # Give up after retrying for 7 days - ) + def query_job_status(self, job_id: str) -> ConfigureStatus: + """Query the job status for a given Job ID. + + It will raise error if any invalid state is detected. + + Args: + job_id (str): The job ID to query details from. + + Returns: + ConfigureStatus: The ConfigureStatus from JobID + Raises: + InvalidStateError: If the job has failed. + """ + job_details = self._query_job_details(job_id=job_id) + if job_details.job_result == "failed": + error_message = f"Job {job_id} failed: \n{job_details.errors}" + self._raise_error(InvalidStateError, error_message) + elif job_details.job_result == "succeeded": + log.debug("Job %s succeeded", job_id) + return job_details + def _wait_for_job_completion(self, job_id: str) -> ConfigureStatus: """ Wait until the specified job ID is complete. @@ -192,13 +204,15 @@ def _wait_for_job_completion(self, job_id: str) -> ConfigureStatus: Raises: InvalidStateError if the job failed """ - job_details = self._query_job_details(job_id=job_id) - if job_details.job_result == "failed": - error_message = f"Job {job_id} failed: \n{job_details.errors}" - self._raise_error(InvalidStateError, error_message) - elif job_details.job_result == "succeeded": - log.debug("Job %s succeeded", job_id) - return job_details + r = Retrying( + retry=retry_if_result(predicate=is_azure_job_not_complete), + wait=wait_fixed(self.retry_interval), + stop=stop_after_delay(max_delay=self.retry_timeout), + ) + try: + return r(self.query_job_status, job_id) + except RetryError: + self._raise_error(Timeout, f"Time out waiting for job {job_id}") def configure(self, resources: List[AzureResource]) -> ConfigureStatus: """ diff --git a/tests/ms_azure/test_service.py b/tests/ms_azure/test_service.py index 252ca25..25567c7 100644 --- a/tests/ms_azure/test_service.py +++ b/tests/ms_azure/test_service.py @@ -258,12 +258,10 @@ def test_query_job_details_server_error( assert "Got HTTP 502 from server when querying job job-id status." in caplog.text assert "Considering the job_status as \"pending\"." in caplog.text - @mock.patch("cloudpub.ms_azure.utils.is_azure_job_not_complete") @mock.patch("cloudpub.ms_azure.AzureService._query_job_details") def test_wait_for_job_completion_successful_completion( self, mock_job_details: mock.MagicMock, - mock_is_job_not_complete: mock.MagicMock, azure_service: AzureService, caplog: LogCaptureFixture, job_details_running_obj: ConfigureStatus, @@ -277,7 +275,6 @@ def test_wait_for_job_completion_successful_completion( job_details_running_obj, ] - azure_service._wait_for_job_completion.retry.sleep = mock.Mock() # type: ignore job_id = "job_id_111" with caplog.at_level(logging.DEBUG): res = azure_service._wait_for_job_completion(job_id=job_id) @@ -286,12 +283,29 @@ def test_wait_for_job_completion_successful_completion( assert f"Job {job_id} failed" not in caplog.text assert f"Job {job_id} succeeded" in caplog.text - @mock.patch("cloudpub.ms_azure.utils.is_azure_job_not_complete") + @mock.patch("cloudpub.ms_azure.AzureService._query_job_details") + def test_wait_for_job_completion_successful_timeout( + self, + mock_job_details: mock.MagicMock, + azure_service: AzureService, + caplog: LogCaptureFixture, + job_details_running_obj: ConfigureStatus, + job_details_completed_successfully_obj: ConfigureStatus, + ) -> None: + mock_job_details.side_effect = [job_details_running_obj for _ in range(15)] + azure_service.retry_interval = 0.1 + azure_service.retry_timeout = 0.5 + + job_id = "job_id_111" + err = f"Time out waiting for job {job_id}" + + with pytest.raises(Timeout, match=err): + azure_service._wait_for_job_completion(job_id=job_id) + @mock.patch("cloudpub.ms_azure.AzureService._query_job_details") def test_get_job_details_after_failed_completion( self, mock_job_details: mock.MagicMock, - mock_is_job_not_completed: mock.MagicMock, azure_service: AzureService, caplog: LogCaptureFixture, job_details_running_obj: ConfigureStatus, @@ -306,7 +320,6 @@ def test_get_job_details_after_failed_completion( job_details_running_obj, ] - azure_service._wait_for_job_completion.retry.sleep = mock.Mock() # type: ignore job_id = "job_id_111" with caplog.at_level(logging.ERROR): with pytest.raises(InvalidStateError) as e_info: