From ffd203eeaf85abd291984867b0deb7727b3d655f Mon Sep 17 00:00:00 2001 From: Mark Gurevich Date: Wed, 12 Apr 2017 14:03:18 -0400 Subject: [PATCH 1/4] Add --retry option to rflash command --- xCAT-server/lib/xcat/plugins/ipmi.pm | 92 +++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 9 deletions(-) diff --git a/xCAT-server/lib/xcat/plugins/ipmi.pm b/xCAT-server/lib/xcat/plugins/ipmi.pm index ed0bc59ab..dbf87b666 100644 --- a/xCAT-server/lib/xcat/plugins/ipmi.pm +++ b/xCAT-server/lib/xcat/plugins/ipmi.pm @@ -1766,6 +1766,7 @@ sub do_firmware_update { my $ret; my $ipmitool_ver; my $verbose = 0; + my $retry = 2; my $verbose_opt; $ret = get_ipmitool_version(\$ipmitool_ver); exit $ret if $ret < 0; @@ -1834,8 +1835,6 @@ sub do_firmware_update { if ($bmc_password) { $pre_cmd = $pre_cmd . " -P $bmc_password"; } - xCAT::SvrUtils::sendmsg("rflash started, please wait.......", - $callback, $sessdata->{node}, %allerrornodes); # check for 8335-GTB Model Type to adjust buffer size my $buffer_size = "30000"; @@ -1849,7 +1848,7 @@ sub do_firmware_update { $buffer_size = "15000"; } - # check verbose and buffersize options + # check verbose, buffersize, and retry options for my $opt (@{$sessdata->{'extraargs'}}) { if ($opt =~ /-V{1,4}/) { $verbose_opt = lc($opt); @@ -1862,8 +1861,18 @@ sub do_firmware_update { $buffer_size = $buffer_value; } } + if ($opt =~ /retry=/) { + my ($attribute, $retry_value) = split(/=/, $opt); + if ($retry_value) { + # retry option was passed in, reset the default + $retry = $retry_value; + } + } } + xCAT::SvrUtils::sendmsg("rflash started, individual node upgrade failure will be retried upto $retry times. Please wait.......", + $callback, $sessdata->{node}, %allerrornodes); + # check for 8335-GTB Firmware above 1610A release. If below, exit if ($output =~ /8335-GTB/) { $cmd = $pre_cmd . " fru print 47"; @@ -1883,6 +1892,9 @@ sub do_firmware_update { } } +RETRY_UPGRADE: + my $failed_upgrade = 0; + # step 1 power off $cmd = $pre_cmd . 
" chassis power off"; xCAT::SvrUtils::sendmsg("Preparing to upgrade firmware, powering chassis off...", $callback, $sessdata->{node}, %allerrornodes); @@ -1925,9 +1937,21 @@ sub do_firmware_update { $callback, $sessdata->{node}); $output = xCAT::Utils->runcmd($cmd, -1); + # if upgrade command failed and we exausted number of retries + # report an error, exit to the caller and leave node in powered off state + # otherwise report an error, power on the node and try upgrade again if ($::RUNCMD_RC != 0) { - $exit_with_error_func->($sessdata->{node}, $callback, - "Running ipmitool command $cmd failed: $output"); + if ($retry == 0) { + # No more retries left, report and error and exit + $exit_with_error_func->($sessdata->{node}, $callback, + "Running ipmitool command $cmd failed: $output"); + } + else { + # Error upgrading, set a flag to attempt a retry + xCAT::SvrUtils::sendmsg("Running ipmitool command $cmd failed: $output", $callback, $sessdata->{node}, %allerrornodes); + $failed_upgrade = 1; + + } } # step 5 power on @@ -1937,15 +1961,65 @@ sub do_firmware_update { "Timeout to check the bmc status"); } - xCAT::SvrUtils::sendmsg("Firmware updated, powering chassis on to populate FRU information...", $callback, $sessdata->{node}, %allerrornodes); + if ($failed_upgrade) { + xCAT::SvrUtils::sendmsg("Firmware update failed, powering chassis on for a retry. This can take several minutes. $retry retries left ...", $callback, $sessdata->{node}, %allerrornodes); + } + else { + xCAT::SvrUtils::sendmsg("Firmware updated, powering chassis on to populate FRU information...", $callback, $sessdata->{node}, %allerrornodes); + } + $cmd = $pre_cmd . 
" chassis power on"; $output = xCAT::Utils->runcmd($cmd, -1); if ($::RUNCMD_RC != 0) { $exit_with_error_func->($sessdata->{node}, $callback, "Running ipmitool command $cmd failed: $output"); } - $exit_with_success_func->($sessdata->{node}, $callback, - "Success updating firmware."); + + my $node_ready_for_retry = 0; + if ($failed_upgrade) { + # Update has failed in step 4. Wait for node to reboot and try again + if ($verbose) { + xCAT::SvrUtils::sendmsg("Sleeping for a few min waiting for node to power on before attempting a retry", $callback, $sessdata->{node}, %allerrornodes); + } + sleep(300); # sleep for 5 min for node to reboot + # Start testing every 10 sec for node to be booted. Give up after 5 min. + foreach (1..30) { + # Test node is booted in to OS + $cmd = "nodestat $sessdata->{node} | grep sshd"; + $output = xCAT::Utils->runcmd($cmd, -1); + if ($::RUNCMD_RC == 0) { + # Node is ready to retry an upgrage + if ($verbose) { + xCAT::SvrUtils::sendmsg("Detected node booted. Will retry upgrade", $callback, $sessdata->{node}, %allerrornodes); + } + $node_ready_for_retry = 1; + last; + } + else { + # Still not booted, wait for 10 sec and try again + if ($verbose) { + xCAT::SvrUtils::sendmsg("Node still not ready, Test again in 10 sec.", $callback, $sessdata->{node}, %allerrornodes); + } + sleep(10); + } + } + if ($node_ready_for_retry) { + $retry--; # decrement number of retries left + # Yes, it is a goto statement here. Ugly, but removes the need to restructure + # the above block of code. + goto RETRY_UPGRADE; + } + else { + # After 10 min of waiting node has not rebooted. Give up retrying. + $exit_with_error_func->($sessdata->{node}, $callback, + "Giving up waiting for the node to reboot. 
No firther upgrade retries will be attempted."); + } + + } + else { + $exit_with_success_func->($sessdata->{node}, $callback, + "Success updating firmware."); + } } sub rflash { @@ -1958,7 +2032,7 @@ sub rflash { if ($opt =~ /^(-c|--check)$/i) { $sessdata->{subcommand} = "check"; # support verbose options for ipmitool command - } elsif ($opt !~ /.*\.hpm$/i && $opt !~ /^-V{1,4}$/ && $opt !~ /^--buffersize=/) { + } elsif ($opt !~ /.*\.hpm$/i && $opt !~ /^-V{1,4}$/ && $opt !~ /^--buffersize=/ && $opt !~ /^--retry=/) { $callback->({ error => "The option $opt is not supported", errorcode => 1 }); return; From 5ac200a363445fc2e9745b6e81dbb5c8751cd743 Mon Sep 17 00:00:00 2001 From: Mark Gurevich Date: Wed, 12 Apr 2017 14:44:20 -0400 Subject: [PATCH 2/4] Add --retry flag to rflash man page --- .../guides/admin-guides/references/man1/rflash.1.rst | 8 +++++++- xCAT-client/pods/man1/rflash.1.pod | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/source/guides/admin-guides/references/man1/rflash.1.rst b/docs/source/guides/admin-guides/references/man1/rflash.1.rst index 0945ec619..47b35ae06 100644 --- a/docs/source/guides/admin-guides/references/man1/rflash.1.rst +++ b/docs/source/guides/admin-guides/references/man1/rflash.1.rst @@ -50,7 +50,7 @@ OpenPOWER BMC specific: ======================= -\ **rflash**\ \ *noderange*\ \ *hpm_file_path*\ [\ **-c | -**\ **-check**\ ] [\ **-V**\ ] +\ **rflash**\ \ *noderange*\ \ *hpm_file_path*\ [\ **-c | -**\ **-check**\ ] [\ **-**\ **-retry=**\ \ *count*\ ] [\ **-V**\ ] @@ -174,6 +174,12 @@ The command will update firmware for OpenPOWER BMC when given an OpenPOWER node +\ **-**\ **-retry=**\ \ *count*\ + + Specify number of times to retry the update if failure is detected. Default value is 2. Value of 0 can be used to indicate no retries. + + + \ **-v|-**\ **-version**\ Displays the command's version. 
diff --git a/xCAT-client/pods/man1/rflash.1.pod b/xCAT-client/pods/man1/rflash.1.pod index 2a02fc315..de1f11e60 100644 --- a/xCAT-client/pods/man1/rflash.1.pod +++ b/xCAT-client/pods/man1/rflash.1.pod @@ -24,7 +24,7 @@ B I I =head2 OpenPOWER BMC specific: -B I I [B<-c>|B<--check>] [B<-V>] +B I I [B<-c>|B<--check>] [B<--retry=>I] [B<-V>] =head1 B @@ -114,6 +114,10 @@ Used to commit the flash image in the temporary side of the chip to the permanen Used to recover the flash image in the permanent side of the chip to the temporary side for both managed systems and power subsystems. +=item B<--retry=>I + +Specify number of times to retry the update if failure is detected. Default value is 2. Value of 0 can be used to indicate no retries. + =item B<-v|--version> Displays the command's version. From 998a9e82e2453783c0fee96e5f6bb998533620a8 Mon Sep 17 00:00:00 2001 From: Mark Gurevich Date: Wed, 12 Apr 2017 15:16:47 -0400 Subject: [PATCH 3/4] Message fixes suggested in review comments --- xCAT-server/lib/xcat/plugins/ipmi.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xCAT-server/lib/xcat/plugins/ipmi.pm b/xCAT-server/lib/xcat/plugins/ipmi.pm index dbf87b666..16fa35f48 100644 --- a/xCAT-server/lib/xcat/plugins/ipmi.pm +++ b/xCAT-server/lib/xcat/plugins/ipmi.pm @@ -1870,7 +1870,7 @@ sub do_firmware_update { } } - xCAT::SvrUtils::sendmsg("rflash started, individual node upgrade failure will be retried upto $retry times. Please wait.......", + xCAT::SvrUtils::sendmsg("rflash started, upgrade failure will be retried up to $retry times. Please wait...", $callback, $sessdata->{node}, %allerrornodes); # check for 8335-GTB Firmware above 1610A release. If below, exit @@ -2012,7 +2012,7 @@ RETRY_UPGRADE: else { # After 10 min of waiting node has not rebooted. Give up retrying. $exit_with_error_func->($sessdata->{node}, $callback, - "Giving up waiting for the node to reboot. 
No firther upgrade retries will be attempted."); + "Giving up waiting for the node to reboot. No further retries will be attempted."); } } From 21a9d1499c15a4478aa2b8b96e79302df452a255 Mon Sep 17 00:00:00 2001 From: hu-weihua Date: Wed, 12 Apr 2017 21:41:41 -0400 Subject: [PATCH 4/4] Add timestamp for every cmd log in xcattest log --- xCAT-test/xcattest | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xCAT-test/xcattest b/xCAT-test/xcattest index 541fad827..adb91d428 100755 --- a/xCAT-test/xcattest +++ b/xCAT-test/xcattest @@ -907,7 +907,8 @@ sub runcase #by my $runstart = timelocal(localtime()); - log_this("\nRUN:$cmd"); + my $runstartstr = scalar(localtime()); + log_this("\nRUN:$cmd [$runstartstr]"); push(@record, "\nRUN:$cmd"); @output = &runcmd($cmd); $rc = $::RUNCMD_RC;