From 340d42b67fb31569aabb8413466b89e916374792 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 2 Sep 2025 12:17:37 +0200 Subject: [PATCH 1/4] Add tests for Python versions 3.11, 3.12, 3.13 --- .github/workflows/ci.yml | 25 ++++++++++++++++++++----- tox.ini | 3 ++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8ed7f3a41..138ae4e4a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,10 @@ jobs: name: - Python 3.9 Tests - Python 3.10 Tests - - Python 3.9 Tests Coverage + - Python 3.11 Tests + - Python 3.12 Tests + - Python 3.13 Tests + - Python 3.12 Tests Coverage - Code Checks include: - name: Python 3.9 Tests @@ -39,12 +42,24 @@ jobs: python: '3.10' toxdir: cli toxenv: py310-nocov - - name: Python 3.9 Tests Coverage - python: 3.9 + - name: Python 3.11 Tests + python: '3.11' + toxdir: cli + toxenv: py311-nocov + - name: Python 3.12 Tests + python: '3.12' + toxdir: cli + toxenv: py312-nocov + - name: Python 3.13 Tests + python: '3.13' toxdir: cli - toxenv: py39-cov + toxenv: py313-nocov + - name: Python 3.12 Tests Coverage + python: 3.12 + toxdir: cli + toxenv: py312-cov - name: Code Checks - python: 3.9 + python: 3.12 toxdir: cli toxenv: code-linters diff --git a/tox.ini b/tox.ini index 2fc514e6f..6e19dc848 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] envlist = - py{39,310}-cov + py{39,310,311,312,313}-cov code-linters # Default testenv. Used to run tests on all python versions. @@ -14,6 +14,7 @@ usedevelop = allowlist_externals = bash deps = + setuptools -r tests/requirements.txt commands = nocov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --ignore=src tests/ From e45dfd87e92e3521a7c62890fc728b8853ad431b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 17 Sep 2025 14:54:36 -0400 Subject: [PATCH 2/4] Bump version to 3.15.0 (#675) Co-authored-by: hgreebe <141743196+hgreebe@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8d1d7db04..118dfbc65 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def read(fname): "clustermgtd = slurm_plugin.clustermgtd:main", "computemgtd = slurm_plugin.computemgtd:main", ] -version = "3.14.0" +version = "3.15.0" requires = ["boto3>=1.7.55", "retrying>=1.3.3"] setup( From 9457ba4e7e26a858c6591718790ea517482e2ea0 Mon Sep 17 00:00:00 2001 From: hgreebe <141743196+hgreebe@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:03:14 -0400 Subject: [PATCH 3/4] Add error to point user to slurm resume log (#676) * Add error to point user to slurm resume log (cherry picked from commit 84ec039d06a21f4bcd817149eeaf07d70dec8a48) * Fix unit tests (cherry picked from commit c84aeb5bd19cce06bde6bd2bda87afe1bf47c8af) * Fix code linter (cherry picked from commit bdc8706239bf0af11031127a6fd3db969eec1676) * Update CHANGELOG * Fix linter errors * Add unit test for logs in clustermgtd --- .flake8 | 3 +++ CHANGELOG.md | 6 ++++++ src/slurm_plugin/clustermgtd.py | 3 ++- src/slurm_plugin/resume.py | 6 +++++- tests/slurm_plugin/test_clustermgtd.py | 7 +++++++ tests/slurm_plugin/test_resume.py | 6 +++++- 6 files changed, 28 insertions(+), 3 deletions(-) diff --git a/.flake8 b/.flake8 index f8b39af54..44031616a 100644 --- a/.flake8 +++ b/.flake8 @@ -18,6 +18,9 @@ ignore = W503, # N818: exception name should be named with an Error suffix N818 + # B042: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`. + # Affected by false positive, https://github.com/PyCQA/flake8-bugbear/issues/525 + B042 exclude = .tox, .git, diff --git a/CHANGELOG.md b/CHANGELOG.md index 92a401a4c..8dd9c0e3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG This file is used to list changes made in each version of the aws-parallelcluster-node package. +3.15.0 +------ + +**CHANGES** +- Direct users to slurm_resume log to see EC2 error codes if no instances are launched. + 3.14.0 ------ diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index ce22febdf..e9a217bb2 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -1262,7 +1262,8 @@ def _reset_timeout_expired_compute_resources( return log.info( "The following compute resources are in down state due to insufficient capacity: %s, " - "compute resources will be reset after insufficient capacity timeout (%s seconds) expired", + "compute resources will be reset after insufficient capacity timeout (%s seconds) expired. " + "Check the slurm_resume log for EC2 error codes.", self._insufficient_capacity_compute_resources, self._config.insufficient_capacity_timeout, ) diff --git a/src/slurm_plugin/resume.py b/src/slurm_plugin/resume.py index d4c24cc52..cb9b22e7c 100644 --- a/src/slurm_plugin/resume.py +++ b/src/slurm_plugin/resume.py @@ -227,7 +227,11 @@ def _resume(arg_nodes, resume_config, slurm_resume): print_with_count(failed_nodes), ) for error_code, node_list in instance_manager.failed_nodes.items(): - _handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes") + _handle_failed_nodes( + node_list, + reason=f"(Code:{error_code})Failure when resuming nodes - " + f"Check the slurm_resume log for EC2 error codes", + ) event_publisher = ClusterEventPublisher.create_with_default_publisher( event_logger, diff --git a/tests/slurm_plugin/test_clustermgtd.py b/tests/slurm_plugin/test_clustermgtd.py index 422c516cb..36d4783a7 100644 --- a/tests/slurm_plugin/test_clustermgtd.py +++ b/tests/slurm_plugin/test_clustermgtd.py @@ -3533,6 +3533,13 @@ def test_reset_timeout_expired_compute_resources( assert_that(cluster_manager._insufficient_capacity_compute_resources).is_equal_to( expected_insufficient_capacity_compute_resources ) + + if expected_insufficient_capacity_compute_resources: + assert ( + "compute resources will be reset after insufficient capacity timeout (20 seconds) expired. " + "Check the slurm_resume log for EC2 error codes." + ) in caplog.text + if expected_power_save_node_list: power_save_mock.assert_called_with( expected_power_save_node_list, reason="Enabling node since insufficient capacity timeout expired" diff --git a/tests/slurm_plugin/test_resume.py b/tests/slurm_plugin/test_resume.py index c33f9d50a..5601e864d 100644 --- a/tests/slurm_plugin/test_resume.py +++ b/tests/slurm_plugin/test_resume.py @@ -448,7 +448,11 @@ def test_resume_launch( if expected_failed_nodes: for error_code, nodeset in expected_failed_nodes.items(): mock_handle_failed_nodes_calls.append( - call(nodeset, reason=f"(Code:{error_code})Failure when resuming nodes") + call( + nodeset, + reason=f"(Code:{error_code})Failure when resuming nodes - " + f"Check the slurm_resume log for EC2 error codes", + ) ) mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls) mock_terminate_instances.assert_called_with(ANY, mock_resume_config.terminate_max_batch_size) From 06b5b48a4bd3442d449f81098b2af7d1b93d7ba7 Mon Sep 17 00:00:00 2001 From: Francesco De Martino Date: Tue, 2 Sep 2025 12:17:37 +0200 Subject: [PATCH 4/4] Add tests for Python versions 3.11, 3.12, 3.13 --- .github/workflows/ci.yml | 25 ++++++++++++++++++++----- tox.ini | 3 ++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8ed7f3a41..138ae4e4a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,7 +28,10 @@ jobs: name: - Python 3.9 Tests - Python 3.10 Tests - - Python 3.9 Tests Coverage + - Python 3.11 Tests + - Python 3.12 Tests + - Python 3.13 Tests + - Python 3.12 Tests Coverage - Code Checks include: - name: Python 3.9 Tests @@ -39,12 +42,24 @@ jobs: python: '3.10' toxdir: cli toxenv: py310-nocov - - name: Python 3.9 Tests Coverage - python: 3.9 + - name: Python 3.11 Tests + python: '3.11' + toxdir: cli + toxenv: py311-nocov + - name: Python 3.12 Tests + python: '3.12' + toxdir: cli + toxenv: py312-nocov + - name: Python 3.13 Tests + python: '3.13' toxdir: cli - toxenv: py39-cov + toxenv: py313-nocov + - name: Python 3.12 Tests Coverage + python: 3.12 + toxdir: cli + toxenv: py312-cov - name: Code Checks - python: 3.9 + python: 3.12 toxdir: cli toxenv: code-linters diff --git a/tox.ini b/tox.ini index 2fc514e6f..6e19dc848 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] envlist = - py{39,310}-cov + py{39,310,311,312,313}-cov code-linters # Default testenv. Used to run tests on all python versions. @@ -14,6 +14,7 @@ usedevelop = allowlist_externals = bash deps = + setuptools -r tests/requirements.txt commands = nocov: pytest -n auto -l -v --basetemp={envtmpdir} --html=report.html --ignore=src tests/