2
0
mirror of https://github.com/xcat2/xcat-core.git synced 2025-06-18 20:30:56 +00:00

Three step hpm update for Firestone machines

This commit is contained in:
Mark Gurevich
2017-04-28 12:20:23 -04:00
parent f7a2a96d46
commit 6ba15adab1

View File

@ -1762,6 +1762,56 @@ sub check_bmc_status_with_ipmitool {
return 0;
}
#----------------------------------------------------------------#
# Wait for OS to reboot by checking "nodestat" to be "sshd" :
# Arguments:
# initial_sleep: seconds to sleep before checking loop start
# inteval: inteval time to check
# retry: max retry time
# sessdata: session data for display
# verbose: verbose output
# Returns:
# 1 when OS is up
# 0 when no response of "sshd" from OS
#----------------------------------------------------------------#
sub wait_for_os_to_reboot {
my $initial_sleep = shift;
my $interval = shift;
my $retry = shift;
my $sessdata = shift;
my $verbose = shift;
my $cmd;
my $output;
if ($verbose) {
xCAT::SvrUtils::sendmsg("Sleeping for a few min waiting for node to power on before attempting to continue", $callback, $sessdata->{node}, %allerrornodes);
}
sleep($initial_sleep); # sleep for this many min for node to reboot
# Start testing every 10 sec for node to be booted. Give up after 5 min.
foreach (1..$retry) {
# Test node is booted in to OS
$cmd = "nodestat $sessdata->{node} | /usr/bin/grep sshd";
$output = xCAT::Utils->runcmd($cmd, -1);
if ($::RUNCMD_RC == 0) {
# Node is ready to retry an upgrage
if ($verbose) {
xCAT::SvrUtils::sendmsg("Detected node booted. Will retry upgrade", $callback, $sessdata->{node}, %allerrornodes);
}
return 1; #Node booted
}
else {
# Still not booted, wait for 10 sec and try again
if ($verbose) {
$cmd = "nodestat $sessdata->{node}";
$output = xCAT::Utils->runcmd($cmd, -1);
my ($nodename, $state) = split(/:/, $output);
xCAT::SvrUtils::sendmsg("($_) Node still not ready. Current state - $state, test again in 10 sec.", $callback, $sessdata->{node}, %allerrornodes);
}
sleep($interval);
}
}
return 0; #Node did not boot after requested delay
}
sub do_firmware_update {
my $sessdata = shift;
my $ret;
@ -1769,6 +1819,9 @@ sub do_firmware_update {
my $verbose = 0;
my $retry = 2;
my $verbose_opt;
my $is_firestone = 0;
my $firestone_update_version;
my $htm_update_version;
$ret = get_ipmitool_version(\$ipmitool_ver);
exit $ret if $ret < 0;
@ -1789,7 +1842,7 @@ sub do_firmware_update {
my $exit_with_success_func = sub {
my ($node, $callback, $message) = @_;
my $status = "success to update firmware";
my $status = "success updating firmware";
my $nodelist_table = xCAT::Table->new('nodelist');
if (!$nodelist_table) {
xCAT::MsgUtils->message("S", "Unable to open nodelist table, denying");
@ -1883,8 +1936,6 @@ sub do_firmware_update {
}
}
xCAT::SvrUtils::sendmsg("rflash started, upgrade failure will be retried up to $retry times. Please wait...",
$callback, $sessdata->{node}, %allerrornodes);
# check for 8335-GTB Firmware above 1610A release. If below, exit
if ($output =~ /8335-GTB/) {
@ -1903,6 +1954,69 @@ sub do_firmware_update {
"Error: Current firmware level OP8v_$grs_version requires one-time manual update to at least version OP8v_1.7_2.55");
}
}
}
# For Firestone the update from 810 to 820 or from 820 to 810 needs to be done in 3 steps
# instead of usual one.
if ($output =~ /8335-GCA|8335-GTA/) {
$is_firestone = 1;
$cmd = $pre_cmd . " fru print 47";
$output = xCAT::Utils->runcmd($cmd, -1);
if ($::RUNCMD_RC != 0) {
$exit_with_error_func->($sessdata->{node}, $callback,
"Running ipmitool command $cmd failed: $output");
}
# Check what firmware version is currently running on the machine
if ($output =~ /OP8_v\d\.\d+_(\d+)\.\d+/) {
my $frs_version = $1;
if ($frs_version == 1) {
$firestone_update_version = "810";
}
if ($frs_version == 2) {
$firestone_update_version = "820";
}
}
else {
$exit_with_error_func->($sessdata->{node}, $callback,
"Unable to determine firmware version currently installed.");
}
# Check what firmware version is specified in htm file
$cmd = "/usr/bin/grep -a FW_DESC $hpm_file";
$output = xCAT::Utils->runcmd($cmd, -1);
if ($::RUNCMD_RC != 0) {
$exit_with_error_func->($sessdata->{node}, $callback,
"Running ipmitool command $cmd failed: $output");
}
# Parse out build date from the description string
if ($output =~ /FW_DESC=8335 \w+ \w+ \w+ (\w+)/) {
my $htm_date= $1;
# Parse out the year from "mmddyyyy" (skip 4 digits, grab last 4)
if ($htm_date =~ /\d{4}(\d+)/) {
my $htm_year = $1;
if ($htm_year == 2016) {
$htm_update_version = "810";
}
if ($htm_year == 2017) {
$htm_update_version = "820";
}
}
}
else {
$exit_with_error_func->($sessdata->{node}, $callback,
"Unable to determine firmware version of $hpm_file.");
}
}
if ($is_firestone and
(($firestone_update_version eq "820" and $htm_update_version eq "810") or
($firestone_update_version eq "810" and $htm_update_version eq "820"))
) {
xCAT::SvrUtils::sendmsg("rflash started, Please wait...", $callback, $sessdata->{node}, %allerrornodes);
$retry = 0; # No retry support for 3 step update process
}
else {
xCAT::SvrUtils::sendmsg("rflash started, upgrade failure will be retried up to $retry times. Please wait...", $callback, $sessdata->{node}, %allerrornodes);
}
RETRY_UPGRADE:
@ -1942,38 +2056,123 @@ RETRY_UPGRADE:
}
# step 4 upgrade firmware
$cmd = $pre_cmd . " -z " . $buffer_size . " hpm upgrade $hpm_file force ";
$cmd .= $verbose_opt;
my $rflash_log_file = xCAT::Utils->full_path($sessdata->{node}.".log", RFLASH_LOG_DIR);
$cmd .= " >".$rflash_log_file." 2>&1";
if ($verbose) {
xCAT::SvrUtils::sendmsg([ 0,
"rflashing ... See the detail progress :\"tail -f $rflash_log_file\"" ],
$callback, $sessdata->{node});
}
# For firestone machines if updating from 810 to 820 version or from 820 to 810,
# extra steps are needed. Hanled in "if" block, "else" block is normal update in a single step.
if ($is_firestone and
(($firestone_update_version eq "820" and $htm_update_version eq "810") or
($firestone_update_version eq "810" and $htm_update_version eq "820"))
) {
$output = xCAT::Utils->runcmd($cmd, -1);
# if upgrade command failed and we exausted number of retries
# report an error, exit to the caller and leave node in powered off state
# otherwise report an error, power on the node and try upgrade again
if ($::RUNCMD_RC != 0) {
# Since "hpm update" command in step 4 above redirects standard out and error to a log file,
# nothing gets returned from execution of the command. Here if RC is not zero, we
# extract all lines containing "Error" from that log file and display them in the sendmsg below
my $get_error_cmd = "/usr/bin/grep Error $rflash_log_file";
$output = xCAT::Utils->runcmd($get_error_cmd, 0);
if ($retry == 0) {
# No more retries left, report an error and exit
# Step 4.1
$cmd = $pre_cmd . " -z " . $buffer_size . " hpm upgrade $hpm_file component 0 force ";
$cmd .= $verbose_opt;
my $rflash_log_file = xCAT::Utils->full_path($sessdata->{node}.".log", RFLASH_LOG_DIR);
$cmd .= " >".$rflash_log_file." 2>&1";
if ($verbose) {
xCAT::SvrUtils::sendmsg([ 0,
"rflashing component 0, see the detail progress :\"tail -f $rflash_log_file\"" ],
$callback, $sessdata->{node});
}
$output = xCAT::Utils->runcmd($cmd, -1);
if ($::RUNCMD_RC != 0) {
# Since "hpm update" command in step 4.1 above redirects standard out and error to a log file,
# nothing gets returned from execution of the command. Here if RC is not zero, we
# extract all lines containing "Error" from that log file and display them in the sendmsg below
my $get_error_cmd = "/usr/bin/grep Error $rflash_log_file";
$output = xCAT::Utils->runcmd($get_error_cmd, 0);
$exit_with_error_func->($sessdata->{node}, $callback,
"Running ipmitool command $cmd failed with rc=$::RUNCMD_RC and the following error messages:\n$output\nSee the $rflash_log_file for details.");
}
else {
# Error upgrading, set a flag to attempt a retry
xCAT::SvrUtils::sendmsg("Running attempt $retry of ipmitool command $cmd failed with rc=$::RUNCMD_RC and the following error messages:\n$output\nSee the $rflash_log_file for details.", $callback, $sessdata->{node}, %allerrornodes);
$failed_upgrade = 1;
sleep(1); # Sleep for a second before next step
# Step 4.2
$cmd = $pre_cmd . " -z " . $buffer_size . " hpm upgrade $hpm_file component 1 force ";
$cmd .= $verbose_opt;
my $rflash_log_file = xCAT::Utils->full_path($sessdata->{node}.".log", RFLASH_LOG_DIR);
$cmd .= " >>".$rflash_log_file." 2>&1";
if ($verbose) {
xCAT::SvrUtils::sendmsg([ 0,
"rflashing component 1, see the detail progress :\"tail -f $rflash_log_file\"" ],
$callback, $sessdata->{node});
}
$output = xCAT::Utils->runcmd($cmd, -1);
if ($::RUNCMD_RC != 0) {
# Since "hpm update" command in step 4.2 above redirects standard out and error to a log file,
# nothing gets returned from execution of the command. Here if RC is not zero, we
# extract all lines containing "Error" from that log file and display them in the sendmsg below
my $get_error_cmd = "/usr/bin/grep Error $rflash_log_file";
$output = xCAT::Utils->runcmd($get_error_cmd, 0);
$exit_with_error_func->($sessdata->{node}, $callback,
"Running ipmitool command $cmd failed with rc=$::RUNCMD_RC and the following error messages:\n$output\nSee the $rflash_log_file for details.");
}
# Wait for BMC to reboot before continuing to next step
unless (check_bmc_status_with_ipmitool($pre_cmd, 5, 60, 10, $sessdata, $verbose)) {
$exit_with_error_func->($sessdata->{node}, $callback,
"Timeout waiting for the bmc ready status. Firmware update suspended");
}
# Step 4.3
$cmd = $pre_cmd . " -z " . $buffer_size . " hpm upgrade $hpm_file component 2 force ";
$cmd .= $verbose_opt;
my $rflash_log_file = xCAT::Utils->full_path($sessdata->{node}.".log", RFLASH_LOG_DIR);
$cmd .= " >>".$rflash_log_file." 2>&1";
if ($verbose) {
xCAT::SvrUtils::sendmsg([ 0,
"rflashing component 2, see the detail progress :\"tail -f $rflash_log_file\"" ],
$callback, $sessdata->{node});
}
$output = xCAT::Utils->runcmd($cmd, -1);
if ($::RUNCMD_RC != 0) {
# Since "hpm update" command in step 4.3 above redirects standard out and error to a log file,
# nothing gets returned from execution of the command. Here if RC is not zero, we
# extract all lines containing "Error" from that log file and display them in the sendmsg below
my $get_error_cmd = "/usr/bin/grep Error $rflash_log_file";
$output = xCAT::Utils->runcmd($get_error_cmd, 0);
$exit_with_error_func->($sessdata->{node}, $callback,
"Running ipmitool command $cmd failed with rc=$::RUNCMD_RC and the following error messages:\n$output\nSee the $rflash_log_file for details.");
}
}
else {
$cmd = $pre_cmd . " -z " . $buffer_size . " hpm upgrade $hpm_file force ";
$cmd .= $verbose_opt;
my $rflash_log_file = xCAT::Utils->full_path($sessdata->{node}.".log", RFLASH_LOG_DIR);
$cmd .= " >".$rflash_log_file." 2>&1";
if ($verbose) {
xCAT::SvrUtils::sendmsg([ 0,
"rflashing ... See the detail progress :\"tail -f $rflash_log_file\"" ],
$callback, $sessdata->{node});
}
$output = xCAT::Utils->runcmd($cmd, -1);
# if upgrade command failed and we exausted number of retries
# report an error, exit to the caller and leave node in powered off state
# otherwise report an error, power on the node and try upgrade again
if ($::RUNCMD_RC != 0) {
# Since "hpm update" command in step 4 above redirects standard out and error to a log file,
# nothing gets returned from execution of the command. Here if RC is not zero, we
# extract all lines containing "Error" from that log file and display them in the sendmsg below
my $get_error_cmd = "/usr/bin/grep Error $rflash_log_file";
$output = xCAT::Utils->runcmd($get_error_cmd, 0);
if ($retry == 0) {
# No more retries left, report an error and exit
$exit_with_error_func->($sessdata->{node}, $callback,
"Running ipmitool command $cmd failed with rc=$::RUNCMD_RC and the following error messages:\n$output\nSee the $rflash_log_file for details.");
}
else {
# Error upgrading, set a flag to attempt a retry
xCAT::SvrUtils::sendmsg("Running attempt $retry of ipmitool command $cmd failed with rc=$::RUNCMD_RC and the following error messages:\n$output\nSee the $rflash_log_file for details.", $callback, $sessdata->{node}, %allerrornodes);
$failed_upgrade = 1;
}
}
}
}
# step 5 power on
@ -1999,34 +2198,9 @@ RETRY_UPGRADE:
"Running ipmitool command $cmd failed: $output");
}
my $node_ready_for_retry = 0;
if ($failed_upgrade) {
# Update has failed in step 4. Wait for node to reboot and try again
if ($verbose) {
xCAT::SvrUtils::sendmsg("Sleeping for a few min waiting for node to power on before attempting a retry", $callback, $sessdata->{node}, %allerrornodes);
}
sleep(300); # sleep for 5 min for node to reboot
# Start testing every 10 sec for node to be booted. Give up after 5 min.
foreach (1..30) {
# Test node is booted in to OS
$cmd = "nodestat $sessdata->{node} | grep sshd";
$output = xCAT::Utils->runcmd($cmd, -1);
if ($::RUNCMD_RC == 0) {
# Node is ready to retry an upgrage
if ($verbose) {
xCAT::SvrUtils::sendmsg("Detected node booted. Will retry upgrade", $callback, $sessdata->{node}, %allerrornodes);
}
$node_ready_for_retry = 1;
last;
}
else {
# Still not booted, wait for 10 sec and try again
if ($verbose) {
xCAT::SvrUtils::sendmsg("Node still not ready, Test again in 10 sec.", $callback, $sessdata->{node}, %allerrornodes);
}
sleep(10);
}
}
my $node_ready_for_retry = wait_for_os_to_reboot(300,10,30,$sessdata,$verbose);
if ($node_ready_for_retry) {
$retry--; # decrement number of retries left
# Yes, it is a goto statement here. Ugly, but removes the need to restructure