From 84ec039d06a21f4bcd817149eeaf07d70dec8a48 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Thu, 30 Oct 2025 07:36:55 -0400 Subject: [PATCH 1/4] Add error to point user to slurm resume log --- src/slurm_plugin/clustermgtd.py | 3 ++- src/slurm_plugin/resume.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/slurm_plugin/clustermgtd.py b/src/slurm_plugin/clustermgtd.py index ce22febd..6b766483 100644 --- a/src/slurm_plugin/clustermgtd.py +++ b/src/slurm_plugin/clustermgtd.py @@ -1262,7 +1262,8 @@ def _reset_timeout_expired_compute_resources( return log.info( "The following compute resources are in down state due to insufficient capacity: %s, " - "compute resources will be reset after insufficient capacity timeout (%s seconds) expired", + "compute resources will be reset after insufficient capacity timeout (%s seconds) expired. " + "Check the slurm_resume log for ec2 error codes.", self._insufficient_capacity_compute_resources, self._config.insufficient_capacity_timeout, ) diff --git a/src/slurm_plugin/resume.py b/src/slurm_plugin/resume.py index d4c24cc5..f09e9aa4 100644 --- a/src/slurm_plugin/resume.py +++ b/src/slurm_plugin/resume.py @@ -227,7 +227,10 @@ def _resume(arg_nodes, resume_config, slurm_resume): print_with_count(failed_nodes), ) for error_code, node_list in instance_manager.failed_nodes.items(): - _handle_failed_nodes(node_list, reason=f"(Code:{error_code})Failure when resuming nodes") + _handle_failed_nodes( + node_list, + reason=f"(Code:{error_code})Failure when resuming nodes - Check the slurm_resume log for ec2 error codes", + ) event_publisher = ClusterEventPublisher.create_with_default_publisher( event_logger, From c84aeb5bd19cce06bde6bd2bda87afe1bf47c8af Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Thu, 30 Oct 2025 08:32:32 -0400 Subject: [PATCH 2/4] Fix unit tests --- tests/slurm_plugin/test_resume.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/slurm_plugin/test_resume.py 
b/tests/slurm_plugin/test_resume.py index c33f9d50..37efbe8b 100644 --- a/tests/slurm_plugin/test_resume.py +++ b/tests/slurm_plugin/test_resume.py @@ -448,7 +448,10 @@ def test_resume_launch( if expected_failed_nodes: for error_code, nodeset in expected_failed_nodes.items(): mock_handle_failed_nodes_calls.append( - call(nodeset, reason=f"(Code:{error_code})Failure when resuming nodes") + call( + nodeset, + reason=f"(Code:{error_code})Failure when resuming nodes - Check the slurm_resume log for ec2 error codes", + ) ) mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls) mock_terminate_instances.assert_called_with(ANY, mock_resume_config.terminate_max_batch_size) From bdc8706239bf0af11031127a6fd3db969eec1676 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Wed, 29 Oct 2025 12:51:26 -0400 Subject: [PATCH 3/4] Fix code linter --- .flake8 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.flake8 b/.flake8 index f8b39af5..44031616 100644 --- a/.flake8 +++ b/.flake8 @@ -18,6 +18,9 @@ ignore = W503, # N818: exception name should be named with an Error suffix N818 + # B042: Exception class with `__init__` should pass all args to `super().__init__()` in order to work with `copy.copy()`. + # Affected by false positive, https://github.com/PyCQA/flake8-bugbear/issues/525 + B042 exclude = .tox, .git, From 49c6b6f93a06dd0a0a1b2e9ebd459e6f53686ce8 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Thu, 30 Oct 2025 08:51:36 -0400 Subject: [PATCH 4/4] Update CHANGELOG --- CHANGELOG.md | 6 ++++++ src/slurm_plugin/resume.py | 3 ++- tests/slurm_plugin/test_resume.py | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92a401a4..8dd9c0e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG This file is used to list changes made in each version of the aws-parallelcluster-node package. 
+3.15.0 +------ + +**CHANGES** +- Direct users to slurm_resume log to see EC2 error codes if no instances are launched. + 3.14.0 ------ diff --git a/src/slurm_plugin/resume.py b/src/slurm_plugin/resume.py index f09e9aa4..644ea160 100644 --- a/src/slurm_plugin/resume.py +++ b/src/slurm_plugin/resume.py @@ -229,7 +229,8 @@ def _resume(arg_nodes, resume_config, slurm_resume): for error_code, node_list in instance_manager.failed_nodes.items(): _handle_failed_nodes( node_list, - reason=f"(Code:{error_code})Failure when resuming nodes - Check the slurm_resume log for ec2 error codes", + reason=f"(Code:{error_code})Failure when resuming nodes - " + f"Check the slurm_resume log for ec2 error codes", ) event_publisher = ClusterEventPublisher.create_with_default_publisher( diff --git a/tests/slurm_plugin/test_resume.py b/tests/slurm_plugin/test_resume.py index 37efbe8b..20675f27 100644 --- a/tests/slurm_plugin/test_resume.py +++ b/tests/slurm_plugin/test_resume.py @@ -450,7 +450,8 @@ def test_resume_launch( mock_handle_failed_nodes_calls.append( call( nodeset, - reason=f"(Code:{error_code})Failure when resuming nodes - Check the slurm_resume log for ec2 error codes", + reason=f"(Code:{error_code})Failure when resuming nodes - " + f"Check the slurm_resume log for ec2 error codes", ) ) mock_handle_failed_nodes.assert_has_calls(mock_handle_failed_nodes_calls)