Skip to content

Commit fbdde2a

Browse files
authored
Upgrade Slurm to version 22.05.6 (#473)
Revert "Implement workaround for dynamic nodes stuck in bad Slurm state" (commit dedda89). The workaround is no longer needed because SchedMD merged a fix in Slurm 22.05.6. Signed-off-by: Jacopo De Amicis <jdamicis@amazon.it>
1 parent 9a2d828 commit fbdde2a

File tree

3 files changed

+4
-21
lines changed

3 files changed

+4
-21
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ This file is used to list changes made in each version of the aws-parallelcluste
99
**ENHANCEMENTS**
1010
- Add support for launching nodes across multiple availability zones to increase capacity availability.
1111

12+
**CHANGES**
13+
- Dynamic nodes in the IDLE+CLOUD+COMPLETING+POWER_DOWN+NOT_RESPONDING state are no longer considered unhealthy.
14+
- The root cause has been fixed in Slurm 22.05.6.
15+
1216
3.3.0
1317
------
1418

src/slurm_plugin/slurm_resources.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ class SlurmNode(metaclass=ABCMeta):
9898
SLURM_SCONTROL_POWER_STATES = [{"IDLE", "CLOUD", "POWERED_DOWN"}, {"IDLE", "CLOUD", "POWERED_DOWN", "POWER_DOWN"}]
9999
SLURM_SCONTROL_REBOOT_REQUESTED_STATE = "REBOOT_REQUESTED"
100100
SLURM_SCONTROL_REBOOT_ISSUED_STATE = "REBOOT_ISSUED"
101-
SLURM_SCONTROL_BUG_STUCK_DYNAMIC_NODES_STATE = {"IDLE", "CLOUD", "COMPLETING", "POWER_DOWN", "NOT_RESPONDING"}
102101

103102
EC2_ICE_ERROR_CODES = {
104103
"InsufficientInstanceCapacity",
@@ -421,9 +420,6 @@ def is_state_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn
421420
if log_warn_if_unhealthy:
422421
logger.warning("Node state check: node %s in DOWN, node state: %s", self, self.state_string)
423422
return False
424-
# Workaround for IDLE+CLOUD+COMPLETING+POWER_DOWN+NOT_RESPONDING bug
425-
if self.is_stuck_in_bug_state():
426-
return False
427423
return True
428424

429425
def is_healthy(self, terminate_drain_nodes, terminate_down_nodes, log_warn_if_unhealthy=True):
@@ -465,10 +461,6 @@ def needs_reset_when_inactive(self):
465461
"""Check if the node need to be reset if node is inactive."""
466462
return self.is_nodeaddr_set() or (not (self.is_power() or self.is_powering_down() or self.is_down()))
467463

468-
def is_stuck_in_bug_state(self):
469-
"""Check if the node is stuck in a bug state IDLE+CLOUD+COMPLETING+POWER_DOWN+NOT_RESPONDING."""
470-
return self.states == self.SLURM_SCONTROL_BUG_STUCK_DYNAMIC_NODES_STATE
471-
472464

473465
class EC2InstanceHealthState:
474466
def __init__(self, id, state, instance_status, system_status, scheduled_events):

tests/slurm_plugin/slurm_resources/test_slurm_resources.py

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -859,18 +859,6 @@ def test_slurm_node_is_bootstrap_failure(
859859
None,
860860
True,
861861
),
862-
# Workaround for nodes stuck in IDLE+CLOUD+COMPLETING+POWER_DOWN+NOT_RESPONDING
863-
(
864-
DynamicNode(
865-
"queue-dy-c5xlarge-1",
866-
"queue-dy-c5xlarge-1",
867-
"queue-dy-c5xlarge-1",
868-
"IDLE+CLOUD+COMPLETING+POWER_DOWN+NOT_RESPONDING",
869-
"queue",
870-
),
871-
None,
872-
False,
873-
),
874862
],
875863
ids=[
876864
"basic",
@@ -883,7 +871,6 @@ def test_slurm_node_is_bootstrap_failure(
883871
"power_unhealthy1",
884872
"power_unhealthy2",
885873
"power_healthy",
886-
"dynamic_node_stuck_in_bug_state",
887874
],
888875
)
889876
def test_slurm_node_is_healthy(node, instance, expected_result):

0 commit comments

Comments
 (0)