Skip to content

Commit 61ff8e4

Browse files
committed
Use systemd timers to trigger updates
1 parent 695ff9a commit 61ff8e4

File tree

10 files changed

+107
-9
lines changed

10 files changed

+107
-9
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Timer unit that periodically activates check-update.service.
# NOTE(review): the Description below mentions "modification time", but the
# check-update.service shown in this change compares the shared file's content
# against a local checkpoint rather than its mtime - confirm the wording.
[Unit]
2+
Description=Check file modification time every minute
3+
4+
[Timer]
5+
# Request 1s timer accuracy.
AccuracySec=1s
6+
# First run: 60 seconds after this timer unit is activated.
OnActiveSec=60sec
7+
# Subsequent runs: 60 seconds after the activated unit was last activated.
OnUnitActiveSec=60sec
8+
Unit=check-update.service
9+
10+
[Install]
11+
WantedBy=timers.target

cookbooks/aws-parallelcluster-computefleet/recipes/config.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,7 @@
1212
# limitations under the License.
1313

1414
include_recipe 'aws-parallelcluster-computefleet::fleet_status'
15+
16+
if ['ComputeFleet'].include?(node['cluster']['node_type'])
17+
include_recipe 'aws-parallelcluster-computefleet::config_check_update_systemd_service'
18+
end
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# frozen_string_literal: true
2+
3+
#
4+
# Cookbook:: aws-parallelcluster-computefleet
5+
# Recipe:: config_check_update_systemd_service
6+
#
7+
# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
8+
#
9+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
10+
# License. A copy of the License is located at
11+
#
12+
# http://aws.amazon.com/apache2.0/
13+
#
14+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
15+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
# Render the check-update service unit from its ERB template.
template '/etc/systemd/system/check-update.service' do
19+
source 'check_update/check-update.service.erb'
20+
owner 'root'
21+
group 'root'
22+
mode '0644'
23+
end
24+
25+
# Install the static timer unit file for check-update.
cookbook_file '/etc/systemd/system/check-update.timer' do
26+
source 'check_update/check-update.timer'
27+
owner 'root'
28+
group 'root'
29+
mode '0644'
30+
action :create
31+
end
32+
33+
# Create the shared update file as an empty file only if it does not exist yet;
# an existing file is left untouched.
file node['cluster']['shared_update_path'] do
34+
content ''
35+
owner 'root'
36+
group 'root'
37+
mode '0644'
38+
action :create_if_missing
39+
end
40+
41+
# Create the local update checkpoint as an empty file only if it does not exist yet.
file node['cluster']['update_checkpoint'] do
42+
content ''
43+
owner 'root'
44+
group 'root'
45+
mode '0644'
46+
action :create_if_missing
47+
end
48+
49+
# Enable and start the timer. Only the timer is enabled and started here, not
# check-update.service itself.
service 'check-update.timer' do
50+
action [:enable, :start]
51+
end
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Oneshot service that runs the update action when the shared update file's
# content differs from the local checkpoint.
[Unit]
2+
Description=Check for recent file modifications
3+
4+
[Service]
5+
Type=oneshot
6+
# NOTE(review): this limit presumably also covers cfn-hup-update-action.sh,
# which is run from ExecStart - confirm the script completes within 30 seconds.
TimeoutStartSec=30
7+
# ExecStart logic:
#  - exit 0 if the shared file does not exist or cannot be read;
#  - read the shared file and the local checkpoint (empty string if the
#    checkpoint file is missing);
#  - if they differ, write the new value to the checkpoint and, only if that
#    write succeeds, run cfn-hup-update-action.sh.
# NOTE(review): the checkpoint is written before the update script runs, so a
# failed update does not appear to be retried on the next run - confirm intended.
ExecStart=/bin/bash -c '\
8+
SHARED_FILE="<%= node['cluster']['shared_update_path'] %>"; \
9+
LOCAL_CHECKPOINT="<%= node['cluster']['update_checkpoint'] %>"; \
10+
\
11+
[ ! -f "$SHARED_FILE" ] && exit 0; \
12+
\
13+
CURRENT_UPDATE=$(cat "$SHARED_FILE") || exit 0; \
14+
LAST_APPLIED=$([ -f "$LOCAL_CHECKPOINT" ] && cat "$LOCAL_CHECKPOINT" || echo ""); \
15+
\
16+
if [ "$CURRENT_UPDATE" != "$LAST_APPLIED" ]; then \
17+
echo "$CURRENT_UPDATE" > "$LOCAL_CHECKPOINT" && <%= node['cluster']['scripts_dir'] %>/cfn-hup-update-action.sh; \
18+
fi'
19+
20+
[Install]
21+
WantedBy=multi-user.target

cookbooks/aws-parallelcluster-environment/resources/cfn_hup_configuration.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@
6868
cfn_init_role: instance_role_name,
6969
# ComputeFleet specific variables
7070
update_hook_script_dir: node['cluster']['scripts_dir'],
71-
node_bootstrap_timeout: node['cluster']['compute_node_bootstrap_timeout'] || node['cluster']['Timeout']
71+
node_bootstrap_timeout: node['cluster']['compute_node_bootstrap_timeout'] || node['cluster']['Timeout'],
72+
update_dir: node['cluster']['shared_update_path']
7273
)
7374
end
7475
end
@@ -94,7 +95,8 @@
9495
mode '0700'
9596
variables(
9697
monitor_shared_dir: monitor_shared_dir,
97-
launch_template_resource_id: node['cluster']['launch_template_id']
98+
launch_template_resource_id: node['cluster']['launch_template_id'],
99+
update_dir: node['cluster']['shared_update_path']
98100
)
99101
end
100102
end

cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,6 @@ triggers=post.update
33
<% case node['cluster']['node_type'] -%>
44
<% when 'HeadNode', 'LoginNode' -%>
55
path=Resources.<%= @launch_template_resource_id %>.Metadata.AWS::CloudFormation::Init
6-
action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; . /etc/parallelcluster/pcluster_cookbook_environment.sh; $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -v --stack <%= @stack_id %> --resource <%= @launch_template_resource_id %> --configsets update --region <%= @region %> --url <%= @cloudformation_url %> --role <%= @cfn_init_role %>
7-
<% when 'ComputeFleet' -%>
8-
path=Resources.<%= @launch_template_resource_id %>
9-
action=timeout <%= @node_bootstrap_timeout %> <%= @update_hook_script_dir %>/cfn-hup-update-action.sh
6+
action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin;. /etc/parallelcluster/pcluster_cookbook_environment.sh; $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -v --stack <%= @stack_id %> --resource <%= @launch_template_resource_id %> --configsets update --region <%= @region %> --url <%= @cloudformation_url %> --role <%= @cfn_init_role %>
107
<% end %>
118
runas=root

cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Generated by Chef for AWS ParallelCluster <%= node['cluster']['node_type'] -%>
22
# Local modifications could be overwritten.
3-
<%# HeadNode, ComputeFleet, LoginNode -%>
4-
<% if @cfnhup_enabled -%>
3+
<% case node['cluster']['node_type'] -%>
4+
<% when 'HeadNode', 'LoginNode' -%>
55
[program:cfn-hup]
66
command = <%= node['cluster']['scripts_dir']%>/cfn-hup-runner.sh
77
autorestart = true

cookbooks/aws-parallelcluster-shared/attributes/cluster.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
default['cluster']['log_base_dir'] = '/var/log/parallelcluster'
1010
default['cluster']['etc_dir'] = '/etc/parallelcluster'
1111

12+
# Shared update file and local checkpoint file used to manage in-place updates
13+
default['cluster']['shared_update_path'] = "#{node['cluster']['shared_dir']}/check_update"
14+
default['cluster']['update_checkpoint'] = "#{node['cluster']['scripts_dir']}/update_checkpoint"
15+
1216
# Slurm_plugin_dir is used by slurm cookbook and custom_actions recipe
1317
default['cluster']['slurm_plugin_dir'] = "#{node['cluster']['etc_dir']}/slurm_plugin"
1418

cookbooks/aws-parallelcluster-slurm/libraries/helpers.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def wait_cluster_ready
173173
" --config-version #{node['cluster']['cluster_config_version']}" \
174174
" --region #{node['cluster']['region']}"
175175
timeout 30
176-
retries 10
176+
retries 20
177177
retry_delay 90
178178
end
179179
end

cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@
2020
not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? }
2121
end
2222

23+
# Write the new cluster config version to the shared update file to signal compute nodes to update.
# The check-update service compares this file's content with its local checkpoint and
# runs the update action when the two differ.
24+
file node['cluster']['shared_update_path'] do
25+
content node['cluster']['cluster_config_version']
26+
owner 'root'
27+
group 'root'
28+
mode '0644'
29+
end
30+
2331
ruby_block "update_shared_storages" do
2432
block do
2533
run_context.include_recipe 'aws-parallelcluster-environment::update_shared_storages'

0 commit comments

Comments
 (0)