From 5fd0c474a9c65deb488227585916069b6292994a Mon Sep 17 00:00:00 2001
From: Helena Greebe <hgreebe@amazon.com>
Date: Thu, 30 Oct 2025 07:36:55 -0400
Subject: [PATCH 1/6] Add message pointing users to the slurm_resume log for EC2 error codes

(cherry picked from commit 84ec039d06a21f4bcd817149eeaf07d70dec8a48)
---
 src/slurm_plugin/clustermgtd.py | 3 ++-
 src/slurm_plugin/resume.py      | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py
index ce22febd..6b766483 100644
--- a/src/slurm_plugin/clustermgtd.py
+++ b/src/slurm_plugin/clustermgtd.py
@@ -1262,7 +1262,8 @@ def _reset_timeout_expired_compute_resources(
             return
         log.info(
             "The following compute resources are in down state due to insufficient capacity: %s, "
-            "compute resources will be reset after insufficient capacity timeout (%s seconds) expired",
+            "compute resources will be reset after insufficient capacity timeout (%s seconds) expired."
+            "Check the slurm_resume log for ec2 error codes.",
             self._insufficient_capacity_compute_resources,
             self._config.insufficient_capacity_timeout,
         )
diff --git a/src/slurm_plugin/resume.py b/src/slurm_plugin/resume.py
index d4c24cc5..f09e9aa4 100644
--- a/src/slurm_plugin/resume.py
+++ b/src/slurm_plugin/resume.py
@@ -227,7 +227,10 @@ def _resume(arg_nodes, resume_config, slurm_resume):
             print_with_count(failed_nodes),
         )
         for error_code, node_list in instance_manager.failed_nodes.items():
-            _handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes")
+            _handle_failed_nodes(
+                node_list,
+                reason=f"(Code:{error_code})Failure when resuming nodes - Check the slurm_resume log for ec2 error codes",
+            )
 
         event_publisher = ClusterEventPublisher.create_with_default_publisher(
             event_logger,

From e71824ab9b4f220ba978215c175d2c370ff89a21 Mon Sep 17 00:00:00 2001
From: Helena Greebe <hgreebe@amazon.com>
Date: Thu, 30 Oct 2025 08:32:32 -0400
Subject: [PATCH 2/6] Fix unit tests

(cherry picked from commit c84aeb5bd19cce06bde6bd2bda87afe1bf47c8af)
---
 tests/slurm_plugin/test_resume.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/slurm_plugin/test_resume.py b/tests/slurm_plugin/test_resume.py
index c33f9d50..37efbe8b 100644
--- a/tests/slurm_plugin/test_resume.py
+++ b/tests/slurm_plugin/test_resume.py
@@ -448,7 +448,10 @@ def test_resume_launch(
         if expected_failed_nodes:
             for error_code, nodeset in expected_failed_nodes.items():
                 mock_handle_failed_nodes_calls.append(
-                    call(nodeset, reason=f"(Code:{error_code})Failure when resuming nodes")
+                    call(
+                        nodeset,
+                        reason=f"(Code:{error_code})Failure when resuming nodes - Check the slurm_resume log for ec2 error codes",
+                    )
                 )
             mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls)
             mock_terminate_instances.assert_called_with(ANY, mock_resume_config.terminate_max_batch_size)

From 3cc569f0c1b154406ce03483ee500e7f5872e23f Mon Sep 17 00:00:00 2001
From: Helena Greebe <hgreebe@amazon.com>
Date: Wed, 29 Oct 2025 12:51:26 -0400
Subject: [PATCH 3/6] Fix code linter

(cherry picked from commit bdc8706239bf0af11031127a6fd3db969eec1676)
---
 .flake8 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.flake8 b/.flake8
index f8b39af5..44031616 100644
--- a/.flake8
+++ b/.flake8
@@ -18,6 +18,9 @@ ignore =
     W503,
     # N818: exception name should be named with an Error suffix
     N818
+    # B042: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`.
+    # Ignored because the check is affected by a false positive, see https://github.com/PyCQA/flake8-bugbear/issues/525
+    B042
 exclude =
     .tox,
     .git,

From dff72fa1cbdaf625c3d2e44f65838e1e5a0e6163 Mon Sep 17 00:00:00 2001
From: Helena Greebe <hgreebe@amazon.com>
Date: Thu, 30 Oct 2025 08:56:45 -0400
Subject: [PATCH 4/6] Update CHANGELOG

---
 CHANGELOG.md                      | 6 ++++++
 src/slurm_plugin/clustermgtd.py   | 4 ++--
 src/slurm_plugin/resume.py        | 3 ++-
 tests/slurm_plugin/test_resume.py | 3 ++-
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 92a401a4..8dd9c0e3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG
 
 This file is used to list changes made in each version of the aws-parallelcluster-node package.
 
+3.15.0
+------
+
+**CHANGES**
+- Direct users to the slurm_resume log to see EC2 error codes if no instances are launched.
+
 3.14.0
 ------
 
diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py
index 6b766483..e9a217bb 100644
--- a/src/slurm_plugin/clustermgtd.py
+++ b/src/slurm_plugin/clustermgtd.py
@@ -1262,8 +1262,8 @@ def _reset_timeout_expired_compute_resources(
             return
         log.info(
             "The following compute resources are in down state due to insufficient capacity: %s, "
-            "compute resources will be reset after insufficient capacity timeout (%s seconds) expired."
-            "Check the slurm_resume log for ec2 error codes.",
+            "compute resources will be reset after insufficient capacity timeout (%s seconds) expired. "
+            "Check the slurm_resume log for EC2 error codes.",
             self._insufficient_capacity_compute_resources,
             self._config.insufficient_capacity_timeout,
         )
diff --git a/src/slurm_plugin/resume.py b/src/slurm_plugin/resume.py
index f09e9aa4..9498b5d1 100644
--- a/src/slurm_plugin/resume.py
+++ b/src/slurm_plugin/resume.py
@@ -229,7 +229,8 @@ def _resume(arg_nodes, resume_config, slurm_resume):
         for error_code, node_list in instance_manager.failed_nodes.items():
             _handle_failed_nodes(
                 node_list,
-                reason=f"(Code:{error_code})Failure when resuming nodes - Check the slurm_resume log for ec2 error codes",
+                reason=f"(Code:{error_code})Failure when resuming nodes - "
+                       f"Check the slurm_resume log for EC2 error codes",
             )
 
         event_publisher = ClusterEventPublisher.create_with_default_publisher(
diff --git a/tests/slurm_plugin/test_resume.py b/tests/slurm_plugin/test_resume.py
index 37efbe8b..d1de1ea9 100644
--- a/tests/slurm_plugin/test_resume.py
+++ b/tests/slurm_plugin/test_resume.py
@@ -450,7 +450,8 @@ def test_resume_launch(
                 mock_handle_failed_nodes_calls.append(
                     call(
                         nodeset,
-                        reason=f"(Code:{error_code})Failure when resuming nodes - Check the slurm_resume log for ec2 error codes",
+                        reason=f"(Code:{error_code})Failure when resuming nodes - "
+                               f"Check the slurm_resume log for EC2 error codes",
                     )
                 )
             mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls)

From 73d764bf2e7200cca6d3fef924001d20769bee52 Mon Sep 17 00:00:00 2001
From: Helena Greebe <hgreebe@amazon.com>
Date: Thu, 30 Oct 2025 09:00:46 -0400
Subject: [PATCH 5/6] Fix linter errors

---
 src/slurm_plugin/resume.py        | 2 +-
 tests/slurm_plugin/test_resume.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/slurm_plugin/resume.py b/src/slurm_plugin/resume.py
index 9498b5d1..cb9b22e7 100644
--- a/src/slurm_plugin/resume.py
+++ b/src/slurm_plugin/resume.py
@@ -230,7 +230,7 @@ def _resume(arg_nodes, resume_config, slurm_resume):
             _handle_failed_nodes(
                 node_list,
                 reason=f"(Code:{error_code})Failure when resuming nodes - "
-                       f"Check the slurm_resume log for EC2 error codes",
+                f"Check the slurm_resume log for EC2 error codes",
             )
 
         event_publisher = ClusterEventPublisher.create_with_default_publisher(
diff --git a/tests/slurm_plugin/test_resume.py b/tests/slurm_plugin/test_resume.py
index d1de1ea9..5601e864 100644
--- a/tests/slurm_plugin/test_resume.py
+++ b/tests/slurm_plugin/test_resume.py
@@ -451,7 +451,7 @@ def test_resume_launch(
                     call(
                         nodeset,
                         reason=f"(Code:{error_code})Failure when resuming nodes - "
-                               f"Check the slurm_resume log for EC2 error codes",
+                        f"Check the slurm_resume log for EC2 error codes",
                     )
                 )
             mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls)

From f2f25567c14740a7738c5406a8d8b6aa3decf6cb Mon Sep 17 00:00:00 2001
From: Helena Greebe <hgreebe@amazon.com>
Date: Thu, 30 Oct 2025 13:26:00 -0400
Subject: [PATCH 6/6] Add unit test for logs in clustermgtd

---
 tests/slurm_plugin/test_clustermgtd.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/slurm_plugin/test_clustermgtd.py b/tests/slurm_plugin/test_clustermgtd.py
index 422c516c..36d4783a 100644
--- a/tests/slurm_plugin/test_clustermgtd.py
+++ b/tests/slurm_plugin/test_clustermgtd.py
@@ -3533,6 +3533,13 @@ def test_reset_timeout_expired_compute_resources(
     assert_that(cluster_manager._insufficient_capacity_compute_resources).is_equal_to(
         expected_insufficient_capacity_compute_resources
     )
+
+    if expected_insufficient_capacity_compute_resources:
+        assert (
+            "compute resources will be reset after insufficient capacity timeout (20 seconds) expired. "
+            "Check the slurm_resume log for EC2 error codes."
+        ) in caplog.text
+
     if expected_power_save_node_list:
         power_save_mock.assert_called_with(
             expected_power_save_node_list, reason="Enabling node since insufficient capacity timeout expired"