diff --git a/CHANGELOG.md b/CHANGELOG.md index c1e3217991..99a60e3b60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste 3.15.0 ------ +**CHANGES** +- Replace cfn-hup in compute nodes with systemd timers to support in place updates. + 3.14.1 ------ diff --git a/cookbooks/aws-parallelcluster-computefleet/files/check_update/check-update.timer b/cookbooks/aws-parallelcluster-computefleet/files/check_update/check-update.timer new file mode 100644 index 0000000000..22818a18af --- /dev/null +++ b/cookbooks/aws-parallelcluster-computefleet/files/check_update/check-update.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Check file modification time every minute + +[Timer] +AccuracySec=1s +OnActiveSec=60sec +OnUnitActiveSec=60sec +Unit=check-update.service + +[Install] +WantedBy=timers.target \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/config.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/config.rb index c29dbdffbc..c58abbb867 100644 --- a/cookbooks/aws-parallelcluster-computefleet/recipes/config.rb +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/config.rb @@ -12,3 +12,7 @@ # limitations under the License. include_recipe 'aws-parallelcluster-computefleet::fleet_status' + +if ['ComputeFleet'].include?(node['cluster']['node_type']) + include_recipe 'aws-parallelcluster-computefleet::config_check_update_systemd_service' +end diff --git a/cookbooks/aws-parallelcluster-computefleet/recipes/config/config_check_update_systemd_service.rb b/cookbooks/aws-parallelcluster-computefleet/recipes/config/config_check_update_systemd_service.rb new file mode 100644 index 0000000000..891dbf5a13 --- /dev/null +++ b/cookbooks/aws-parallelcluster-computefleet/recipes/config/config_check_update_systemd_service.rb @@ -0,0 +1,51 @@ +# frozen_string_literal: true + +# +# Cookbook:: aws-parallelcluster-slurm +# Recipe:: config_compute +# +# Copyright:: 2013-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. + +template '/etc/systemd/system/check-update.service' do + source 'check_update/check-update.service.erb' + owner 'root' + group 'root' + mode '0644' +end + +cookbook_file '/etc/systemd/system/check-update.timer' do + source 'check_update/check-update.timer' + owner 'root' + group 'root' + mode '0644' + action :create +end + +file node['cluster']['shared_update_path'] do + content '' + owner 'root' + group 'root' + mode '0644' + action :create_if_missing +end + +file node['cluster']['update_checkpoint'] do + content '' + owner 'root' + group 'root' + mode '0644' + action :create_if_missing +end + +service 'check-update.timer' do + action [:enable, :start] +end diff --git a/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/config_check_update_systemd_service_spec.rb b/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/config_check_update_systemd_service_spec.rb new file mode 100644 index 0000000000..8a17f54c76 --- /dev/null +++ b/cookbooks/aws-parallelcluster-computefleet/spec/unit/recipes/config_check_update_systemd_service_spec.rb @@ -0,0 +1,94 @@ +require 'spec_helper' + +describe 'aws-parallelcluster-computefleet::config_check_update_systemd_service' do + for_all_oses do |platform, version| + context "on #{platform}#{version}" do + cached(:chef_run) do + runner = runner(platform: platform, version: version) do |node| + node.override['cluster']['node_type'] = 'ComputeFleet' + end + runner.converge(described_recipe) + end + cached(:node) { chef_run.node } + + it 'creates the check-update.service template' do + is_expected.to create_template('/etc/systemd/system/check-update.service') + .with(source: 'check_update/check-update.service.erb') + .with(owner: 'root') + .with(group: 'root') + .with(mode: '0644') + end + + it 'creates the check-update.timer file' do + is_expected.to create_cookbook_file('/etc/systemd/system/check-update.timer') + .with(source: 'check_update/check-update.timer') + .with(owner: 'root') + .with(group: 'root') + .with(mode: '0644') + end + + it 'creates the shared update path file if missing' do + is_expected.to create_file_if_missing(node['cluster']['shared_update_path']) + .with(content: '') + .with(owner: 'root') + .with(group: 'root') + .with(mode: '0644') + end + + it 'creates the local update checkpoint file if missing' do + is_expected.to create_file_if_missing(node['cluster']['update_checkpoint']) + .with(content: '') + .with(owner: 'root') + .with(group: 'root') + .with(mode: '0644') + end + + it 'enables and starts the check-update.timer service' do + is_expected.to enable_service('check-update.timer') + is_expected.to start_service('check-update.timer') + end + + describe 'check-update.service template content' do + it 'has Type=oneshot to prevent concurrent executions' do + is_expected.to render_file('/etc/systemd/system/check-update.service') + .with_content('Type=oneshot') + end + + it 'has TimeoutStartSec=30 to handle NFS hangs' do + is_expected.to render_file('/etc/systemd/system/check-update.service') + .with_content('TimeoutStartSec=30') + end + + it 'exits gracefully if shared file does not exist' do + is_expected.to render_file('/etc/systemd/system/check-update.service') + .with_content('[ ! -f "$SHARED_FILE" ] && exit 0') + end + + it 'exits gracefully if cat fails' do + is_expected.to render_file('/etc/systemd/system/check-update.service') + .with_content('CURRENT_UPDATE=$(cat "$SHARED_FILE") || exit 0') + end + + it 'writes checkpoint before running update action' do + is_expected.to render_file('/etc/systemd/system/check-update.service') + .with_content('echo "$CURRENT_UPDATE" > "$LOCAL_CHECKPOINT" && ') + end + + it 'references the correct shared update path' do + is_expected.to render_file('/etc/systemd/system/check-update.service') + .with_content("SHARED_FILE=\"#{node['cluster']['shared_update_path']}\"") + end + + it 'references the correct local checkpoint path' do + is_expected.to render_file('/etc/systemd/system/check-update.service') + .with_content("LOCAL_CHECKPOINT=\"#{node['cluster']['update_checkpoint']}\"") + end + + it 'calls cfn-hup-update-action.sh when update is needed' do + is_expected.to render_file('/etc/systemd/system/check-update.service') + .with_content("#{node['cluster']['scripts_dir']}/cfn-hup-update-action.sh") + end + end + end + end +end diff --git a/cookbooks/aws-parallelcluster-computefleet/templates/check_update/check-update.service.erb b/cookbooks/aws-parallelcluster-computefleet/templates/check_update/check-update.service.erb new file mode 100644 index 0000000000..afcba5e98a --- /dev/null +++ b/cookbooks/aws-parallelcluster-computefleet/templates/check_update/check-update.service.erb @@ -0,0 +1,21 @@ +[Unit] +Description=Check for recent file modifications + +[Service] +Type=oneshot +TimeoutStartSec=30 +ExecStart=/bin/bash -c '\ +SHARED_FILE="<%= node['cluster']['shared_update_path'] %>"; \ +LOCAL_CHECKPOINT="<%= node['cluster']['update_checkpoint'] %>"; \ +\ +[ ! -f "$SHARED_FILE" ] && exit 0; \ +\ +CURRENT_UPDATE=$(cat "$SHARED_FILE") || exit 0; \ +LAST_APPLIED=$([ -f "$LOCAL_CHECKPOINT" ] && cat "$LOCAL_CHECKPOINT" || echo ""); \ +\ +if [ "$CURRENT_UPDATE" != "$LAST_APPLIED" ]; then \ +echo "$CURRENT_UPDATE" > "$LOCAL_CHECKPOINT" && <%= node['cluster']['scripts_dir'] %>/cfn-hup-update-action.sh; \ +fi' + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb index d918ab4334..15cd418e94 100644 --- a/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb +++ b/cookbooks/aws-parallelcluster-environment/templates/cfn_hup_configuration/cfn-hook-update.conf.erb @@ -3,9 +3,6 @@ triggers=post.update <% case node['cluster']['node_type'] -%> <% when 'HeadNode', 'LoginNode' -%> path=Resources.<%= @launch_template_resource_id %>.Metadata.AWS::CloudFormation::Init -action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin; . /etc/parallelcluster/pcluster_cookbook_environment.sh; $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -v --stack <%= @stack_id %> --resource <%= @launch_template_resource_id %> --configsets update --region <%= @region %> --url <%= @cloudformation_url %> --role <%= @cfn_init_role %> -<% when 'ComputeFleet' -%> -path=Resources.<%= @launch_template_resource_id %> -action=timeout <%= @node_bootstrap_timeout %> <%= @update_hook_script_dir %>/cfn-hup-update-action.sh +action=PATH=/usr/local/bin:/bin:/usr/bin:/opt/aws/bin;. /etc/parallelcluster/pcluster_cookbook_environment.sh; $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -v --stack <%= @stack_id %> --resource <%= @launch_template_resource_id %> --configsets update --region <%= @region %> --url <%= @cloudformation_url %> --role <%= @cfn_init_role %> <% end %> runas=root diff --git a/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb b/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb index 8ef48ad12d..0f0ad87850 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/config/supervisord_config.rb @@ -32,7 +32,6 @@ dcv_port: node['cluster']['dcv_port'], dcv_auth_certificate: node['cluster']['dcv']['authenticator']['certificate'], dcv_auth_private_key: node['cluster']['dcv']['authenticator']['private_key'], - dcv_auth_user: node['cluster']['dcv']['authenticator']['user'], - cfnhup_enabled: cfnhup_enabled? + dcv_auth_user: node['cluster']['dcv']['authenticator']['user'] ) end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb index 7d49664e10..f0ef8bcf57 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/supervisord_config_spec.rb @@ -57,28 +57,6 @@ end end - context "when head node and cfn-hup disabled on fleet" do - cached(:chef_run) do - runner = runner(platform: platform, version: version) do |node| - node.override['cluster']['node_type'] = 'HeadNode' - node.override['cluster']['dcv_enabled'] = 'head_node' - node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' - allow_any_instance_of(Object).to receive(:dcv_installed?).and_return(true) - end - runner.converge(described_recipe) - end - cached(:node) { chef_run.node } - - it 'has the correct content' do - is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') - .with_content("[program:cfn-hup]") - .with_content("[program:clustermgtd]") - .with_content("[program:clusterstatusmgtd]") - .with_content("[program:pcluster_dcv_authenticator]") - .with_content("--port 8444") - end - end - context "when compute fleet" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| @@ -92,32 +70,16 @@ it 'has the correct content' do is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') - .with_content("[program:cfn-hup]") .with_content("[program:computemgtd]") is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') .with_content("[program:pcluster_dcv_authenticator]") - end - end - - context "when compute fleet with cfn-hup disabled on fleet" do - cached(:chef_run) do - runner = runner(platform: platform, version: version) do |node| - node.override['cluster']['node_type'] = 'ComputeFleet' - node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' - end - runner.converge(described_recipe) - end - cached(:node) { chef_run.node } - - it 'has the correct content' do - is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') - .with_content("[program:computemgtd]") is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') .with_content("[program:cfn-hup]") end end + context "when login node and dcv configured" do cached(:chef_run) do runner = runner(platform: platform, version: version) do |node| @@ -157,25 +119,6 @@ .with_content("[program:pcluster_dcv_authenticator]") end end - - context "when login node with cfn-hup disabled on fleet" do - cached(:chef_run) do - runner = runner(platform: platform, version: version) do |node| - node.override['cluster']['node_type'] = 'LoginNode' - node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' - end - runner.converge(described_recipe) - end - cached(:node) { chef_run.node } - - it 'has the correct content' do - is_expected.to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') - .with_content("[program:loginmgtd]") - - is_expected.not_to render_file('/etc/parallelcluster/parallelcluster_supervisord.conf') - .with_content("[program:cfn-hup]") - end - end end end end diff --git a/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb b/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb index e98a755235..bb7fb1f1a5 100644 --- a/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb +++ b/cookbooks/aws-parallelcluster-platform/templates/supervisord/parallelcluster_supervisord.conf.erb @@ -1,7 +1,7 @@ # Generated by Chef for AWS ParallelCluster <%= node['cluster']['node_type'] -%> # Local modifications could be overwritten. -<%# HeadNode, ComputeFleet, LoginNode -%> -<% if @cfnhup_enabled -%> +<% case node['cluster']['node_type'] -%> +<% when 'HeadNode', 'LoginNode' -%> [program:cfn-hup] command = <%= node['cluster']['scripts_dir']%>/cfn-hup-runner.sh autorestart = true diff --git a/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb b/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb index 4bbfffd820..b63573f48c 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/cluster.rb @@ -9,6 +9,10 @@ default['cluster']['log_base_dir'] = '/var/log/parallelcluster' default['cluster']['etc_dir'] = '/etc/parallelcluster' +# Shared file used to manage inplace updates +default['cluster']['shared_update_path'] = "#{node['cluster']['shared_dir']}/check_update" +default['cluster']['update_checkpoint'] = "#{node['cluster']['scripts_dir']}/update_checkpoint" + # Slurm_plugin_dir is used by slurm cookbook and custom_actions recipe default['cluster']['slurm_plugin_dir'] = "#{node['cluster']['etc_dir']}/slurm_plugin" @@ -34,6 +38,3 @@ # Default NFS mount options default['cluster']['nfs']['hard_mount_options'] = 'hard,_netdev,noatime' - -# Cluster Updates -default['cluster']['in_place_update_on_fleet_enabled'] = 'true' diff --git a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb index 37e0114051..ce3a27532f 100644 --- a/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb +++ b/cookbooks/aws-parallelcluster-shared/libraries/helpers.rb @@ -106,14 +106,3 @@ def wait_sync_file(path) timeout 5 end end - -def cfnhup_enabled? - # cfn-hup is always enabled on the head node, as it is required to perform cluster updates. - # cfn-hup can be disabled on compute nodes and login nodes, limiting the cluster update in the sense that - # live updates on compute and login nodes are not possible. - node['cluster']['node_type'] == 'HeadNode' || node['cluster']['in_place_update_on_fleet_enabled'] == 'true' -end - -def cluster_readiness_check_on_update_enabled? - node['cluster']['in_place_update_on_fleet_enabled'] == 'true' -end diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb deleted file mode 100644 index e9d180d5e0..0000000000 --- a/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/helpers_spec.rb +++ /dev/null @@ -1,42 +0,0 @@ -require_relative '../../../libraries/helpers' -require 'spec_helper' - -describe 'cfnhup_enabled?' do - let(:node) { Chef::Node.new } - - context 'when node type is HeadNode' do - before { node.override['cluster']['node_type'] = 'HeadNode' } - - it 'returns true regardless of in_place_update_on_fleet_enabled setting' do - node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' - expect(cfnhup_enabled?).to be true - end - end - - %w(ComputeFleet LoginNode).each do |node_type| - context "when node type is #{node_type}" do - before { node.override['cluster']['node_type'] = node_type } - - it 'returns true when in_place_update_on_fleet_enabled is true' do - node.override['cluster']['in_place_update_on_fleet_enabled'] = 'true' - expect(cfnhup_enabled?).to be true - end - - it 'returns false when in_place_update_on_fleet_enabled is false' do - node.override['cluster']['in_place_update_on_fleet_enabled'] = 'false' - expect(cfnhup_enabled?).to be false - end - end - end -end - -describe 'cluster_readiness_check_on_update_enabled?' do - let(:node) { Chef::Node.new } - - [true, false].each do |in_place_update_on_fleet_enabled| - it "returns #{in_place_update_on_fleet_enabled} when in_place_update_on_fleet_enabled is #{in_place_update_on_fleet_enabled}" do - node.override['cluster']['in_place_update_on_fleet_enabled'] = in_place_update_on_fleet_enabled.to_s - expect(cluster_readiness_check_on_update_enabled?).to be in_place_update_on_fleet_enabled - end - end -end diff --git a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb index 4aabeaa21c..ca3b789970 100644 --- a/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb +++ b/cookbooks/aws-parallelcluster-slurm/recipes/update/update_head_node.rb @@ -20,6 +20,14 @@ not_if { ::File.exist?(node['cluster']['previous_cluster_config_path']) && !are_queues_updated? && !are_bulk_custom_slurm_settings_updated? } end +# Write the new config version to shared storage to signal compute nodes to update +file node['cluster']['shared_update_path'] do + content node['cluster']['cluster_config_version'] + owner 'root' + group 'root' + mode '0644' +end + ruby_block "update_shared_storages" do block do run_context.include_recipe 'aws-parallelcluster-environment::update_shared_storages' @@ -272,7 +280,7 @@ def update_nodes_in_queue(strategy, queues) chef_sleep '15' -wait_cluster_ready if cluster_readiness_check_on_update_enabled? +wait_cluster_ready execute 'start clustermgtd' do command "#{cookbook_virtualenv_path}/bin/supervisorctl start clustermgtd" diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb index c609db63cd..1c20519482 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/recipes/update_head_node_spec.rb @@ -15,7 +15,6 @@ allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(are_mount_or_unmount_required) allow_any_instance_of(Object).to receive(:dig).and_return(true) allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path) - allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(true) RSpec::Mocks.configuration.allow_message_expectations_on_nil = true node.override['cluster']['stack_name'] = cluster_name @@ -34,6 +33,15 @@ ) end + it 'writes the config version to shared update file' do + is_expected.to create_file(chef_run.node['cluster']['shared_update_path']).with( + content: cluster_config_version, + owner: 'root', + group: 'root', + mode: '0644' + ) + end + if are_mount_or_unmount_required it 'updates the shared storage' do is_expected.to run_ruby_block("update_shared_storages") @@ -65,27 +73,6 @@ end end end - - context 'when cluster readiness check is disabled' do - cached(:chef_run) do - runner = runner(platform: platform, version: version) do |node| - allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false) - allow_any_instance_of(Object).to receive(:dig).and_return(true) - allow_any_instance_of(Object).to receive(:cookbook_virtualenv_path).and_return(cookbook_venv_path) - allow_any_instance_of(Object).to receive(:cluster_readiness_check_on_update_enabled?).and_return(false) - RSpec::Mocks.configuration.allow_message_expectations_on_nil = true - - node.override['cluster']['stack_name'] = cluster_name - node.override['cluster']['region'] = region - node.override['cluster']['cluster_config_version'] = cluster_config_version - node.override['cluster']['scripts_dir'] = scripts_dir - end - runner.converge(described_recipe) - end - it 'does not check cluster readiness' do - is_expected.not_to run_execute("Check cluster readiness") - end - end end end end