#!/usr/bin/perl
# IBM(c) 2016 EPL license http://www.eclipse.org/legal/epl-v10.html
|
|
|
|
BEGIN { $::XCATROOT = $ENV{'XCATROOT'} ? $ENV{'XCATROOT'} : -d '/opt/xcat' ? '/opt/xcat' : '/usr'; }
|
|
|
|
use lib "$::XCATROOT/probe/lib/perl";
|
|
use probe_utils;
|
|
use LogParse;
|
|
use probe_global_constant;
|
|
use xCAT::NetworkUtils;
|
|
use File::Basename;
|
|
use IO::Select;
|
|
use Time::Local;
|
|
use Data::Dumper;
|
|
use Getopt::Long qw(:config no_ignore_case);
|
|
|
|
#---------------------------------------------
# Global attributes
#---------------------------------------------

#-------MAC to IP/node map list-------
# $macmap{mac_addr}{"ip"}="x.x.x.x"
# $macmap{mac_addr}{"node"}="nodename"
# Filled by check_noderange, read by the DHCP log handler.
#-------------------------------------
my %macmap;

#------IP to node map list-------
# $ipnodemap{ip_addr}="nodename"
# Filled by check_noderange and extended by the DHCP log handler,
# read by the TFTP and HTTP log handlers.
#--------------------------------
my %ipnodemap;

# The NIC of the provision network (set by do_pre_check)
my $installnic;

# Set by option -V: output more information
my $verbose = 0;

# If no specific instruction is given, do "monitor" by default.
# Cleared when the replay mode (-r) is selected.
my $monitor = 1;

# Used by developers: dump detailed information while functions are running
my $debug = 0;

#---------------------------------------------
# Command Usage
#---------------------------------------------
my $program_name = basename("$0");
$::USAGE = "Usage:
    $program_name -h
    $program_name -n <node_range> [-t <max_waiting_time>] [-V]
    $program_name -n <node_range> -r <roll_back_duration> [-V]

Description:
    Probe operating system provision process. Support two modes, they are 'Realtime monitor' and 'Replay history'.
    Realtime monitor: Used during doing provision, trigger 'Realtime monitor' before triggering target node reboot to do provision. This tool will monitor the provision state of node. If without specific instruction, do 'Realtime monitor' by default.
    Replay history: Used after finish provision, to probe the provision happened some while ago.

    [NOTE] Currently, hierarchical structure is not supported.

Options:
    -h : Get usage information of $program_name
    -V : Output more information
    -n : The range of nodes to be monitored or replayed.
    -t : The maximum time to wait when doing monitor, unit is minute. default is 60.
    -r : Trigger 'Replay history' mode. follow the duration of rolling back. Unit are 'h' (hour) or 'm' (minute)
         Supported formats look like 3h30m (3 hours and 30 minutes ago), 2h (2 hours ago), 40m (40 minutes ago) and 3 (3 hours ago).
         If unit is not specified, hour will be used by default.
";
|
|
|
|
|
|
#----------------------------------------------
|
|
# Main process
|
|
#----------------------------------------------
|
|
|
|
# parse command line arguments
my $help        = 0;
my $test        = 0;
my $maxwaittime = 60;              #unit is minute, the max wait time of monitor
my $rollforward_time_of_replay;    #used by feature replay provision log
my $noderange;
if (
    !GetOptions("--help|h|?" => \$help,
        "T"   => \$test,
        "V"   => \$verbose,
        "t=s" => \$maxwaittime,
        "r=s" => \$rollforward_time_of_replay,
        "n=s" => \$noderange))
{
    probe_utils->send_msg("stdout", "f", "Invalid parameter for $program_name");
    probe_utils->send_msg("stdout", "",  "$::USAGE");
    exit 1;
}

if ($help) {
    probe_utils->send_msg("stdout", "", "$::USAGE");
    exit 0;
}

if ($test) {
    probe_utils->send_msg("stdout", "o", "Probe for OS provision process, realtime monitor of OS provision process.");
    exit 0;
}

unless ($noderange) {
    probe_utils->send_msg("stdout", "f", "A noderange is required");
    probe_utils->send_msg("stdout", "",  "$::USAGE");
    exit 1;
}

# -t is multiplied by 60 later on; a non-numeric value would silently become
# 0 and make the monitor expire immediately, so reject it up front.
unless (defined($maxwaittime) and $maxwaittime =~ /^\d+(?:\.\d+)?$/) {
    probe_utils->send_msg("stdout", "f", "Invalid value for option '-t', a number of minutes is expected");
    probe_utils->send_msg("stdout", "",  "$::USAGE");
    exit 1;
}

# Validate and convert the roll back duration in one place, so that the
# accepted formats and the computed duration can never disagree.
# Every pattern is anchored and case-insensitive.
my $replay_seconds;
if ($rollforward_time_of_replay) {
    if ($rollforward_time_of_replay =~ /^(\d+)h(\d+)m$/i) {
        $replay_seconds = $1 * 3600 + $2 * 60;
    } elsif ($rollforward_time_of_replay =~ /^(\d+)h?$/i) {

        #if unit is not specified, hour is used
        $replay_seconds = $1 * 3600;
    } elsif ($rollforward_time_of_replay =~ /^(\d+)m$/i) {
        $replay_seconds = $1 * 60;
    } else {
        probe_utils->send_msg("stdout", "f", "Unsupported time format for option '-r'");
        probe_utils->send_msg("stdout", "",  "$::USAGE");
        exit 1;
    }
}

my $rst = do_pre_check($noderange);
if ($debug) {
    print "Dumper macmap--------\n";
    print Dumper \%macmap;
    print "Dumper ipnodemap--------\n";
    print Dumper \%ipnodemap;
    print "installnic = $installnic\n";
}

#if failed to pass pre-check, exit directly
exit $rst if ($rst);

if ($rollforward_time_of_replay) {
    $monitor = 0;

    #replay the window [now - duration, now)
    my $end_time_of_replay   = time();
    my $start_time_of_replay = $end_time_of_replay - $replay_seconds;

    $rst = do_replay($noderange, $start_time_of_replay, $end_time_of_replay);
    exit $rst;
}

#if no specific instruction, do monitor by default
$rst = do_monitor($noderange, $maxwaittime);

exit $rst;
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
    Description:
        Pre-check some static configuration, such as the provision network interface and the definition of the nodes
    Arguments:
        noderange: The range of nodes
    Returns:
        0: pass
        1: failed
=cut
|
|
|
|
#------------------------------------------
|
|
sub do_pre_check {

    # Runs the two static checks (install NIC lookup, node definition check),
    # reports each outcome, and returns the OR of both results:
    # 0 when both passed, non-zero otherwise.
    my ($noderange) = @_;

    my $overall = 0;

    # check 1: find the NIC of the provision network on this server
    my @nic_errors;
    my $nic_rc = obtain_install_nic(\$installnic, \@nic_errors);
    if (!$nic_rc) {
        probe_utils->send_msg("stdout", "i", "The install NIC in current server is $installnic");
    } else {
        probe_utils->send_msg("stdout", "f", "Obtain install NIC in current server failed");
        foreach my $detail (@nic_errors) {
            probe_utils->send_msg("stdout", "d", "$detail");
        }
    }
    $overall |= $nic_rc;

    # check 2: every node in the range must have a usable definition
    my @node_errors;
    my $node_rc = check_noderange($noderange, \@node_errors);
    if (!$node_rc) {
        probe_utils->send_msg("stdout", "o", "All nodes which will be deployed are valid");
    } else {
        probe_utils->send_msg("stdout", "f", "There is something wrong in node definition");
        foreach my $detail (@node_errors) {
            probe_utils->send_msg("stdout", "d", "$detail");
        }
    }
    $overall |= $node_rc;

    return $overall;
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Obtain the network interface of provision
|
|
Arguments:
|
|
installnic_ref: (output attribute) the reference of scalar which save the provision network interface.
|
|
return_error_ref: (output attribute) the reference of array which save the error information.
|
|
Returns:
|
|
0: success
|
|
1: failed
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub obtain_install_nic {

    # Looks up the 'master' attribute of the 'site' table and finds the local
    # network interface that carries exactly that address.
    #   installnic_ref:   (output) reference of the scalar receiving the NIC name
    #   return_error_ref: (output) reference of the array receiving error text
    # Returns 0 on success, 1 on failure.
    my $installnic_ref   = shift;
    my $return_error_ref = shift;

    my $master_ip_in_site = `lsdef -t site -i master -c |awk -F"=" '{print \$2}'`;
    $master_ip_in_site = "" unless (defined $master_ip_in_site);
    chomp($master_ip_in_site);
    $master_ip_in_site =~ s/^\s+|\s+$//g;
    unless ($master_ip_in_site) {
        push @$return_error_ref, "'master' isn't configured in 'site' table";
        return 1;
    }

    # One address per line ("2: eth0    inet 10.0.0.1/24 ..."), so the
    # interface name is on the same line as the address. The address is
    # matched literally and must be followed by '/' or a blank, so that
    # 10.0.0.1 neither matches 10.0.0.10 nor treats '.' as a wildcard.
    my $nic = "";
    foreach my $addr_line (`ip -o addr show 2>/dev/null`) {
        if ($addr_line =~ /^\d+:\s+(\S+)\s+inet6?\s+\Q$master_ip_in_site\E(?:\/|\s)/) {
            $nic = $1;
            last;
        }
    }

    $$installnic_ref = $nic;
    if (!$$installnic_ref) {
        push @$return_error_ref, "The value of 'master' in 'site' table is $master_ip_in_site, can't get corresponding network interface in current server";
        return 1;
    }
    return 0;
}
|
|
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Check if the definition of all nodes are valid
|
|
If all nodes definition are valid, initialize below global attributes which will be used as a map in "Monitor" or "Replay"
|
|
%macmap, %ipnodemap
|
|
|
|
Arguments:
|
|
node_range: node range
|
|
Returns:
|
|
0 : pass
|
|
1 : failed
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub check_noderange {

    # Checks that every node of the range is defined and has 'mac' and an ip
    # (the 'ip' attribute, or an address resolved from the node name).
    # When all nodes are valid, fills the file-level maps %macmap and
    # %ipnodemap which are used by "Monitor" and "Replay".
    #   node_range: node range
    #   error_ref:  (output) reference of the array receiving error text
    # Returns 0 when all nodes are valid, 1 otherwise.
    my $node_range = shift;
    my $error_ref  = shift;

    # The range goes through the shell: single-quote it so that characters
    # which are legal in a noderange ('[', ']', '*', '|', ...) are not
    # interpreted by the shell.
    (my $quoted_range = $node_range) =~ s/'/'\\''/g;
    my @cmdoutput = `lsdef '$quoted_range' -i ip,mac -c 2>&1`;
    my $rst       = 0;
    my %nodecheckrst;

    foreach (@cmdoutput) {
        chomp($_);
        $_ =~ s/^\s+|\s+$//g;

        # node names may contain '-' and '.', so capture up to the closing quote
        if ($_ =~ /^Error: Could not find an object named '([^']+)' .+/i) {
            $rst = 1;
            push @$error_ref, "Node $1 without node definition";
        } elsif ($_ =~ /(\S+):\s+mac=(.*)/i) {
            my $node = $1;
            my $mac  = $2;
            if ($mac) {
                $nodecheckrst{$node}{"mac"} = $mac;
            } else {
                $rst = 1;
                push @$error_ref, "Node $node without 'mac' attribute definition";
            }
        } elsif ($_ =~ /(\S+):\s+ip=(.*)/i) {
            my $node = $1;
            my $ip   = $2;
            if ($ip) {
                $nodecheckrst{$node}{"ip"} = $ip;
            } else {

                #no 'ip' attribute, fall back to name resolution
                my $nodeip = xCAT::NetworkUtils->getipaddr($node);
                if ($nodeip) {
                    $nodecheckrst{$node}{"ip"} = $nodeip;
                } else {
                    $rst = 1;
                    push @$error_ref, "Node $node without ip definition";
                }
            }
        }
    }

    # Nothing parsed and nothing reported means the command itself failed;
    # do not report that as "all nodes are valid".
    if (!$rst and !%nodecheckrst) {
        $rst = 1;
        push @$error_ref, "Can't get the definition of $node_range";
    }

    if (!$rst) {
        foreach my $node (keys %nodecheckrst) {
            next unless (defined $nodecheckrst{$node}{"mac"});
            my @macs = split(/\|/, $nodecheckrst{$node}{"mac"});
            foreach my $mac (@macs) {

                #[NOTE] don't support 2 adapters in the same network now. TODO

                if ($mac =~ /\!\*NOIP\*/) {
                    $mac =~ s/\!\*NOIP\*//g;
                    $macmap{$mac}{"ip"}   = "NOIP";
                    $macmap{$mac}{"node"} = $node;
                } else {
                    $macmap{$mac}{"node"} = $node;
                    $macmap{$mac}{"ip"}   = $nodecheckrst{$node}{"ip"};
                    $ipnodemap{ $nodecheckrst{$node}{"ip"} } = $node
                      if (defined $nodecheckrst{$node}{"ip"});
                }
            }
        }
    }

    return $rst;
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
    Description:
        Initialize a very important hash "%node_state" which will save the state information of every node
    Arguments:
        noderange: (input attribute) The range of nodes
        node_state_ref: (output attribute) the reference of hash "%node_state"
                        The structure of hash "%node_state" is:
                        $node_state{<node>}{statehistory}    Array. save the latest loop provision states
                        $node_state{<node>}{done}            Scalar. the flag of whether the node has finished the provision
                        $node_state{<node>}{allstatehistory} Array. save the history states before the latest loop provision. Used in debug mode.
                        $node_state{<node>}{log}             Array. save all related logs of node. Used in debug mode.
    Returns:
        NULL
=cut
|
|
|
|
#------------------------------------------
|
|
sub init_node_state {

    # Gives every node of the range an empty 'statehistory' and a cleared
    # 'done' flag in the hash referenced by node_state_ref.
    my ($noderange, $node_state_ref) = @_;

    foreach my $name (probe_utils->parse_node_range($noderange)) {
        $node_state_ref->{$name}{done} = 0;
        @{ $node_state_ref->{$name}{statehistory} } = ();
    }
}
|
|
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Implement the replay feature.
|
|
Arguments:
|
|
noderange : the range of node
|
|
start_time_of_replay: the start time point of scaning log
|
|
end_time_of_replay: the end time point of scaning log
|
|
|
|
Returns:
|
|
0: success
|
|
1: failed
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub do_replay {

    # Replays the logs between start_time_of_replay and end_time_of_replay,
    # one second of log at a time, feeds each entry to the log handlers and
    # finally prints the conclusion report.
    # Returns 0 on success, 1 when logs could not be obtained.
    my ($noderange, $start_time_of_replay, $end_time_of_replay) = @_;

    my $rc = 0;

    my $timestr = scalar(localtime($start_time_of_replay));
    probe_utils->send_msg("stdout", "d", "Start to scan logs which are later than '$timestr', please waiting for a while.............");

    my %node_state;
    init_node_state($noderange, \%node_state);
    if ($debug) {
        print "Dumper node_state-------\n";
        print Dumper \%node_state;
    }

    my $log_parse                    = LogParse->new($verbose);
    my @candidate_mn_hostname_in_log = $log_parse->obtain_candidate_mn_hostname_in_log();

    # $cursor is the second currently being replayed
    my $cursor = $start_time_of_replay;
    until ($cursor >= $end_time_of_replay) {
        my @logs_of_second;
        if ($log_parse->obtain_one_second_logs($cursor, \@logs_of_second)) {
            probe_utils->send_msg("stdout", "d", "Obtain logs failed from log files");
            $rc = 1;
            last;
        }

        foreach my $entry (@logs_of_second) {
            dispatch_log_to_handler($entry, \@candidate_mn_hostname_in_log, \%node_state);
        }

        $cursor = $log_parse->obtain_next_second();
    }
    $log_parse->destory();

    conclusion_report(\%node_state);
    return $rc;
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Calculate the provision of every node. offer a report to customer
|
|
Arguments:
|
|
node_state_ref: The reference of hash "%node_state". refer to function "init_node_state" for the structure of "%node_state"
|
|
Returns:
|
|
0: success
|
|
1: failed
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub conclusion_report {

    # Prints the final report: optionally the state history of every node
    # (verbose mode), then for every node which did not reach the completed
    # state either a guess of the non-provision cause or the stage it
    # stopped at. Always returns 0.
    my ($node_state_ref) = @_;

    probe_utils->send_msg("stdout", "", "==================conclusion_report=================");

    if ($debug) {
        print "---->the result of %node_state<------\n";
        print Dumper $node_state_ref;
    }

    if ($verbose) {
        probe_utils->send_msg("stdout", "d", "----------node history state----------");
        foreach my $name (keys %$node_state_ref) {
            my @earlier = @{ $node_state_ref->{$name}{allstatehistory} || [] };
            if (@earlier) {
                my $earlier_text = join("", map { "$::STATE_DESC{$_} " } @earlier);
                probe_utils->send_msg("stdout", "d", "[$name] the history before last loop provision: $earlier_text");
            }

            my $latest_text = join("", map { "$::STATE_DESC{$_} " } @{ $node_state_ref->{$name}{statehistory} || [] });
            probe_utils->send_msg("stdout", "d", "[$name] last loop provision history: $latest_text");
        }
        probe_utils->send_msg("stdout", "d", "--------------------------------------");
    }

    my %failed_node;
    foreach my $name (keys %$node_state_ref) {

        # the max value of all states is the final stop stage
        my $furthest_stage = 0;
        my ($saw_reboot_cmd, $saw_powering_on, $saw_installing, $saw_postbootscript) = (0, 0, 0, 0);

        foreach my $state (@{ $node_state_ref->{$name}{statehistory} }) {
            $furthest_stage     = $state if ($furthest_stage < $state);
            $saw_reboot_cmd     = 1      if ($state == $::STATE_POWER_ON);
            $saw_powering_on    = 1      if ($state == $::STATE_POWERINGON);
            $saw_installing     = 1      if ($state == $::STATE_INSTALLING);
            $saw_postbootscript = 1      if ($state == $::STATE_POSTBOOTSCRIPT);
        }

        # Cover a limited set of non-provision errors:
        # 1 a reboot command was seen but the node status never changed to
        #   "powering-on": rebooting the target node failed.
        # 2 the node powered on and ran postbootscripts without ever being
        #   "installing": most likely it was just rebooted.
        if ($saw_reboot_cmd and not $saw_powering_on) {
            $failed_node{$name}{non_provision_prediction} = "Trigger target node reboot failed";
        } elsif ($saw_reboot_cmd and $saw_powering_on and not $saw_installing and $saw_postbootscript) {
            $failed_node{$name}{non_provision_prediction} = "Target node just reboot from disk";
        } elsif ($furthest_stage != $::STATE_COMPLETED) {
            $failed_node{$name}{provision_stop_point} = $furthest_stage;
        }
    }

    unless (%failed_node) {
        probe_utils->send_msg("stdout", "o", "All nodes provision successfully");
        return 0;
    }

    my $failed_node_num = keys %failed_node;
    if ($failed_node_num == 1) {
        probe_utils->send_msg("stdout", "d", "There is $failed_node_num node provision failed");
    } elsif ($failed_node_num > 1) {
        probe_utils->send_msg("stdout", "d", "There are $failed_node_num node provision failed");
    }

    foreach my $name (keys %failed_node) {
        my $prediction = $failed_node{$name}{non_provision_prediction};
        if ($prediction) {
            probe_utils->send_msg("stdout", "f", "$name : $prediction");
        } else {
            probe_utils->send_msg("stdout", "f", "$name : stop at stage '$::STATE_DESC{$failed_node{$name}{provision_stop_point}}'");
        }
    }
    return 0;
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Implement the monitor feature
|
|
Arguments:
|
|
noderange : the range of node
|
|
maxwaittime: the max waiting time for monitor.
|
|
Returns:
|
|
0: success
|
|
1: failed
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub do_monitor {

    # Follows the candidate log files with "tail -f", hands every new log
    # line to the log handlers, and stops when a INT/TERM signal arrives,
    # when all nodes are done, or when maxwaittime (minutes) has expired.
    # Afterwards prints the conclusion report and stops the tail processes.
    #   noderange:   the range of nodes
    #   maxwaittime: the max waiting time for monitor, in minutes
    # Returns 0 on success, 1 on failure.
    my $noderange   = shift;
    my $maxwaittime = shift;

    require POSIX;    # WNOHANG, for the non-blocking waitpid below

    my $rst      = 0;
    my $terminal = 0;

    # 'local' restores the previous handlers when this function returns
    my $on_stop_signal = sub {
        $terminal = 1;
    };
    local $SIG{TERM} = $on_stop_signal;
    local $SIG{INT}  = $on_stop_signal;

    my $startline =
      "-------------------------------------------------------------
Start capturing every message during OS provision process....
-------------------------------------------------------------
";
    probe_utils->send_msg("stdout", "", "$startline");

    my @openfilepids;
    my @openfilefds;
    my %fd_filetype_map;

    {    #a very important brace to hold a code block ('last' leaves it)
        my $log_parse         = LogParse->new($verbose);
        my $candidate_log_ref = $log_parse->obtain_log_file_list();

        #open candidate log file to obtain realtime log
        if (%$candidate_log_ref) {
            foreach my $logfile (keys %$candidate_log_ref) {
                my $pid;
                my $fd;
                if (!($pid = open($fd, "tail -f -n 0 $candidate_log_ref->{$logfile}{file} 2>&1 |"))) {
                    probe_utils->send_msg("stdout", "f", "Can't open $candidate_log_ref->{$logfile}{file} to get logs");
                    $rst = 1;
                    last;
                } else {
                    push @openfilepids, $pid;
                    push @openfilefds,  $fd;
                    $fd_filetype_map{$fd} = $candidate_log_ref->{$logfile}{type};
                }
            }
        } else {
            probe_utils->send_msg("stdout", "f", "There isn't any vaild log file to be scaned");
            $rst = 1;
        }

        last if ($rst);

        my %node_state;
        init_node_state($noderange, \%node_state);

        my $select = new IO::Select;
        $select->add(\*$_) foreach (@openfilefds);
        $| = 1;

        my @hdls;
        my %pending;    #per handle: data read so far which isn't a whole line yet
        my $starttime                    = time();
        my @candidate_mn_hostname_in_log = $log_parse->obtain_candidate_mn_hostname_in_log();

        #read log realtimely, then handle each log
        for (; ;) {
            if (@hdls = $select->can_read(0)) {
                foreach my $hdl (@hdls) {

                    # Unbuffered read: buffered <$hdl> must not be mixed
                    # with select, lines left in the buffer would stay
                    # unseen until more data arrives.
                    my $chunk;
                    my $nread = sysread($hdl, $chunk, 65536);
                    if (!defined($nread)) {
                        next if ($!{EINTR});    #interrupted by a signal, retry later
                        $select->remove($hdl);
                        next;
                    } elsif ($nread == 0) {

                        #EOF, the tail process has exited
                        $select->remove($hdl);
                        next;
                    }

                    $pending{$hdl} = "" unless (defined $pending{$hdl});
                    $pending{$hdl} .= $chunk;
                    while ($pending{$hdl} =~ s/^([^\n]*)\n//) {
                        my $line            = $1;
                        my $log_content_ref = $log_parse->obtain_log_content($fd_filetype_map{$hdl}, $line);
                        dispatch_log_to_handler($log_content_ref, \@candidate_mn_hostname_in_log, \%node_state);
                    }
                }
            }

            # stop reading log at below scenarios
            # 1 receive terminal signal from customer
            if ($terminal) {
                probe_utils->send_msg("stdout", "d", "Get INT or TERM signal from STDIN");
                last;

                # 2 all node have finished the provision
            } elsif (all_monitor_node_done(\%node_state)) {
                probe_utils->send_msg("stdout", "o", "All nodes specified to monitor, have finished OS provision process");
                last;

                # 3 exceed the max waiting time
            } elsif (time() - $starttime > ($maxwaittime * 60)) {
                probe_utils->send_msg("stdout", "i", "$maxwaittime minutes have expired, stop monitoring");
                last;

                # 4 no log source is left to read from
            } elsif (!$select->count()) {
                probe_utils->send_msg("stdout", "f", "All processes which follow the log files have exited unexpectedly");
                $rst = 1;
                last;
            } else {

                # The built-in sleep only takes whole seconds (0.01 would
                # be 0, a busy loop); 4-argument select sleeps fractions.
                select(undef, undef, undef, 0.01);
            }
        }

        conclusion_report(\%node_state);
        $log_parse->destory();
    }

    # close all running sub process
    my $existrunningpid = 0;
    $existrunningpid = 1 if (@openfilepids);
    my $trytime = 0;
    while ($existrunningpid) {

        #send terminal signal to all running process at same time
        my $signal;
        if ($trytime < 5) {    #try INT up to 5 times
            $signal = 'INT';
        } elsif ($trytime < 10) {    #then try TERM up to 5 times
            $signal = 'TERM';
        } else {                     #finally try KILL 1 time
            $signal = 'KILL';
        }
        foreach my $pid (@openfilepids) {
            kill $signal, $pid if ($pid);
        }
        ++$trytime;
        sleep 1;

        #To check how many process exit, set the flag of exited process to 0.
        #Already reaped entries (0) are skipped: waitpid(0, ...) would wait
        #for any child of the process group.
        for (my $i = 0 ; $i <= $#openfilepids ; $i++) {
            next unless ($openfilepids[$i]);
            $openfilepids[$i] = 0 if (waitpid($openfilepids[$i], POSIX::WNOHANG()));
        }

        #To check if there are processes still running, if there are, try kill again in next loop
        my @leftpids = grep { $_ } @openfilepids;
        $existrunningpid = @leftpids ? 1 : 0;

        #just try 10 times, if still can't kill some process, give up
        if ($trytime > 10) {
            my $leftpid = join("", map { "$_ " } @leftpids);
            probe_utils->send_msg("stdout", "d", "Can't stop process $leftpid, please handle manually.");
            last;
        }
    }

    # close all openning file descriptors
    close($_) foreach (@openfilefds);

    return $rst;
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
    Description:
        Check if all nodes have finished the provision process
    Arguments:
        node_state_ref: The reference of hash "%node_state". refer to function "init_node_state" for the structure of "%node_state"
    Returns:
        1: all nodes have finished the provision process
        0: at least one node has not finished yet
=cut
|
|
|
|
#------------------------------------------
|
|
sub all_monitor_node_done {

    # Returns 1 when the 'done' flag of every node in the hash is non-zero
    # (also for an empty hash), 0 as soon as one node with done == 0 is found.
    my ($node_state_ref) = @_;

    foreach my $name (keys %{$node_state_ref}) {
        return 0 if ($node_state_ref->{$name}{done} == 0);
    }

    return 1;
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Handle one line DHCP log
|
|
Arguments:
|
|
log_ref: (input attribute) the reference of hash which save one line dhcp log.
|
|
node_state_ref: (output attribute), the reference of hash "%node_state". refer to function "init_node_state" for the structure of "%node_state"
|
|
Returns:
|
|
NULL
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub handle_dhcp_msg {

    # Handles one line of DHCP log. Only messages whose MAC address is a key
    # of %macmap are considered. DHCPACK and BOOTREPLY additionally record
    # the ip -> node mapping and move the node to the DHCP state.
    #   log_ref:        reference of the hash holding one line of dhcp log
    #   node_state_ref: reference of hash "%node_state"
    my $log_ref        = shift;
    my $node_state_ref = shift;
    my $msg            = $log_ref->{msg};

    if ($msg =~ /DHCPDISCOVER\s+from\s+(.+)\s+via\s+([^:]+)(.*)/i) {
        my ($mac, $nic) = ($1, $2);
        _note_dhcp_event($node_state_ref, $mac, "Receive DHCPDISCOVER via $nic", $msg);
    } elsif ($msg =~ /DHCPOFFER\s+on\s+(.+)\s+to\s+(.+)\s+via\s+(.+)/i) {
        my ($ip, $mac, $nic) = ($1, $2, $3);
        _note_dhcp_event($node_state_ref, $mac, "Send DHCPOFFER on $ip back to $mac via $nic", $msg);
    } elsif ($msg =~ /DHCPREQUEST\s+for\s+(.+)\s+[\(\)0-9\.]*\s*from\s+(.+)\s+via\s+(.+)/) {
        my $mac = $2;
        _note_dhcp_event($node_state_ref, $mac, $msg, $msg);
    } elsif ($msg =~ /DHCPACK\s+on\s+(.+)\s+to\s+(.+)\s+via\s+(.+)/) {
        my ($ip, $mac, $nic) = ($1, $2, $3);
        my $node = _note_dhcp_event($node_state_ref, $mac, "Send DHCPACK on $ip back to $mac via $nic", $msg);
        _confirm_dhcp_lease($node_state_ref, $node, $mac, $ip) if (defined $node);
    } elsif ($msg =~ /BOOTREQUEST\s+from\s+(.+)\s+via\s+([^:]+)(.*)/) {
        my ($mac, $nic) = ($1, $2);
        _note_dhcp_event($node_state_ref, $mac, "Receive BOOTREQUEST from $mac via $nic", $msg);
    } elsif ($msg =~ /BOOTREPLY\s+for\s+(.+)\s+to\s+.+(\w\w:\w\w:\w\w:\w\w:\w\w:\w\w).+via\s+(.+)/) {
        my ($ip, $mac, $nic) = ($1, $2, $3);
        my $node = _note_dhcp_event($node_state_ref, $mac, "Send BOOTREPLY on $ip back to $mac via $nic", $msg);
        _confirm_dhcp_lease($node_state_ref, $node, $mac, $ip) if (defined $node);
    }
}

# Reports one DHCP event of a known MAC address (monitor mode) and keeps the
# raw log line (debug mode). Returns the node name, or undef for unknown MACs.
sub _note_dhcp_event {
    my ($node_state_ref, $mac, $record, $raw_msg) = @_;

    return undef unless (exists $macmap{$mac});

    my $node = $macmap{$mac}{"node"};
    probe_utils->send_msg("stdout", "d", "[$node] $record") if ($monitor);
    push(@{ $node_state_ref->{$node}{log} }, $raw_msg) if ($debug);
    return $node;
}

# Called once an address has been handed out: warns when it differs from the
# node definition, remembers ip -> node and sets the DHCP state.
sub _confirm_dhcp_lease {
    my ($node_state_ref, $node, $mac, $ip) = @_;

    # IP addresses are strings: they must be compared with 'ne'. A numeric
    # '!=' only compares the leading number ("10.0.0.1" == "10.0.0.2").
    my $defined_ip = $macmap{$mac}{"ip"};
    if (defined($defined_ip) and $defined_ip ne "NOIP" and $defined_ip ne $ip) {
        my $warn_msg = "The ip($ip) assigned to $mac via DHCP is different from the ip($macmap{$mac}{'ip'}) in node definition.";
        probe_utils->send_msg("stdout", "w", "$warn_msg");
    }

    $ipnodemap{$ip} = $node;
    set_node_state($node_state_ref, $node, $::STATE_DHCP);
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Handle one line TFTP log
|
|
Arguments:
|
|
log_ref: (input attribute) the reference of hash which save one line TFTP log.
|
|
node_state_ref: (output attribute), the reference of hash "%node_state". refer to function "init_node_state" for the structure of "%node_state"
|
|
Returns:
|
|
NULL
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub handle_tftp_msg {

    # Handles one line of TFTP log: for a read request coming from an ip
    # which maps (via %ipnodemap) to a monitored node, reports the download
    # and derives the node state from the requested file name.
    #   log_ref:        reference of the hash holding one line of TFTP log
    #   node_state_ref: reference of hash "%node_state"
    my $log_ref        = shift;
    my $node_state_ref = shift;

    return unless ($log_ref->{msg} =~ /RRQ\s+from\s+(.+)\s+filename\s+(.+)/i);
    my ($ip, $file) = ($1, $2);

    my $node = $ipnodemap{$ip};
    return unless (defined($node) and exists($node_state_ref->{$node}));

    my $record = "Via TFTP download $file";
    probe_utils->send_msg("stdout", "d", "[$node] $record") if ($monitor);
    push(@{ $node_state_ref->{$node}{log} }, $log_ref->{msg}) if ($debug);

    # The yaboot pattern used to start with the stray escape '\y', which
    # Perl passes through as a plain 'y'; it is written as such now.
    # NOTE(review): a leading '/' ("/yb/node/yaboot-") was possibly intended.
    if ($file =~ /xcat\/xnba.*/i or $file =~ /\/boot\/grub2\/grub2\-/i or $file =~ /yb\/node\/yaboot\-/i) {
        set_node_state($node_state_ref, $node, $::STATE_BOOTLODER);
    } elsif ($file =~ /vmlinuz|inst64|linux/) {
        set_node_state($node_state_ref, $node, $::STATE_KERNEL);
    } elsif ($file =~ /initrd/i) {
        set_node_state($node_state_ref, $node, $::STATE_INITRD);
    }
}
|
|
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Handle one line HTTP log
|
|
Arguments:
|
|
log_ref: (input attribute) the reference of hash which save one line HTTP log.
|
|
node_state_ref: (output attribute), the reference of hash "%node_state". refer to function "init_node_state" for the structure of "%node_state"
|
|
Returns:
|
|
NULL
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub handle_http_msg {

    # Handles one line of HTTP log: for a GET/HEAD request sent by an ip
    # which maps (via %ipnodemap) to a monitored node, reports the request
    # and derives the node state from the requested path.
    my ($log_ref, $node_state_ref) = @_;

    my $ip = $log_ref->{sender};
    return unless (exists($node_state_ref->{ $ipnodemap{$ip} }));

    my $msg = $log_ref->{msg};
    return unless ($msg =~ /GET\s+(.+)\s+HTTP.+/ or $msg =~ /HEAD\s+(.+)\s+HTTP.+/);
    my $file = $1;

    my $node   = $ipnodemap{$ip};
    my $record = "Via HTTP get $file";
    probe_utils->send_msg("stdout", "d", "[$node] $record") if ($monitor);
    push(@{ $node_state_ref->{$node}{log} }, $msg) if ($debug);

    # the first matching rule decides the state
    my $is_osimage = ($file =~ /osimage/i);
    if ($file =~ /vmlinuz|inst64/i or ($file =~ /linux/i and $is_osimage)) {
        set_node_state($node_state_ref, $node, $::STATE_KERNEL);
    } elsif ($file =~ /initrd/i and $is_osimage) {
        set_node_state($node_state_ref, $node, $::STATE_INITRD);
    } elsif ($file =~ /\/install\/autoinst\/*/i) {
        set_node_state($node_state_ref, $node, $::STATE_KICKSTART);
    } elsif ($file =~ /\.deb/i or ($file =~ /\/install\// and $file =~ /\/repodata\//i)) {
        set_node_state($node_state_ref, $node, $::STATE_INSTALLRPM);
    }
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Handle one line log comes from cluster.log
|
|
Arguments:
|
|
log_ref: (input attribute) the reference of hash which save one line log comes from cluster.log.
|
|
node_state_ref: (output attribute), the reference of hash "%node_state". refer to function "init_node_state" for the structure of "%node_state"
|
|
Returns:
|
|
NULL
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub handle_cluster_msg {

    # Handles one line of log coming from cluster.log. Two kinds of
    # messages are used: "... Allowing <command> ..." of a reboot style
    # command, and "<node> status: <status> statustime: ..." updates.
    my ($log_ref, $node_state_ref) = @_;

    my $log_msg = $log_ref->{msg};

    # fields 1, 2, 4 and 5 of the whitespace separated message
    my (undef, $verb, $command, undef, $nodes_str, $sub_command) = split(/\s+/, $log_msg);

    if ($verb eq "Allowing") {
        my $is_reboot_cmd = ($command eq "rinstall"
              or $command eq "rnetboot"
              or ($command eq "rpower" and $sub_command =~ /on|boot|reset/));
        return unless ($is_reboot_cmd);

        foreach my $name (probe_utils->parse_node_range($nodes_str)) {
            next unless (exists $node_state_ref->{$name});
            probe_utils->send_msg("stdout", "d", "[$name] Use command $command to reboot node $name") if ($monitor);
            push(@{ $node_state_ref->{$name}{log} }, $log_ref->{msg}) if ($debug);
            set_node_state($node_state_ref, $name, $::STATE_POWER_ON);
        }
    } elsif ($log_ref->{msg} =~ /(.+)\s+status:\s+(.+)\s+statustime:/) {
        my ($name, $status) = ($1, $2);
        return unless (exists $node_state_ref->{$name});

        my $record = "Node status is changed to $status";
        probe_utils->send_msg("stdout", "d", "[$name] $record") if ($monitor);
        push(@{ $node_state_ref->{$name}{log} }, $log_ref->{msg}) if ($debug);

        # only these three status values change the recorded state
        my %state_of_status = (
            "installing"  => $::STATE_INSTALLING,
            "powering-on" => $::STATE_POWERINGON,
            "booting"     => $::STATE_BOOTING,
        );
        if (exists $state_of_status{$status}) {
            set_node_state($node_state_ref, $name, $state_of_status{$status});
        }
    }
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Handle one line log comes from computes.log
|
|
Arguments:
|
|
log_ref: (input attribute) the reference of hash which save one line log comes from computes.log.
|
|
node_state_ref: (output attribute), the reference of hash "%node_state". refer to function "init_node_state" for the structure of "%node_state"
|
|
Returns:
|
|
NULL
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub handle_compute_msg {

    # Handles one line of log coming from computes.log: the sender is the
    # node name. Tracks the postscript stage and the completion of the
    # provision, which also sets the 'done' flag of the node.
    my ($log_ref, $node_state_ref) = @_;

    my $sender = $log_ref->{sender};
    return unless (exists $node_state_ref->{$sender});

    my $text = $log_ref->{msg};
    probe_utils->send_msg("stdout", "d", "[$sender] $log_ref->{msg}") if ($monitor);
    push(@{ $node_state_ref->{$sender}{log} }, $text) if ($debug);

    if ($text =~ /Running postscript:/i) {
        set_node_state($node_state_ref, $sender, $::STATE_POSTSCRIPT);
        return;
    }

    if ($text =~ /provision completed/) {
        set_node_state($node_state_ref, $sender, $::STATE_COMPLETED);
        $node_state_ref->{$sender}{done} = 1;
        probe_utils->send_msg("stdout", "o", "[$sender] provision completed") if ($monitor);
    }
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Dispatch log to related handler
|
|
Arguments:
|
|
log_ref: (input attribute) the reference of hash which save one line log comes from computes.log.
|
|
candidate_mn_hostname_in_log_ref: (input attribute) The reference of array which save the candidate host name of MN
|
|
node_state_ref: (output attribute), the reference of hash "%node_state". refer to function "init_node_state" for the structure of "%node_state"
|
|
Returns:
|
|
NULL
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub dispatch_log_to_handler {

    # Dispatches one parsed log entry to the handler of its label. xCAT
    # logs are treated as cluster logs when the sender is one of the
    # candidate host names of the management node, as compute node logs
    # otherwise.
    #   log_ref:                          reference of the hash holding one line of log
    #   candidate_mn_hostname_in_log_ref: reference of the array of candidate MN host names
    #   node_state_ref:                   reference of hash "%node_state"
    my $log_ref                          = shift;
    my $candidate_mn_hostname_in_log_ref = shift;
    my $node_state_ref                   = shift;

    if ($log_ref->{label} == $::LOGLABEL_DHCPD) {
        handle_dhcp_msg($log_ref, $node_state_ref);
    } elsif ($log_ref->{label} == $::LOGLABEL_TFTP) {
        handle_tftp_msg($log_ref, $node_state_ref);
    } elsif ($log_ref->{label} == $::LOGLABEL_XCAT) {

        # Literal substring test instead of interpolating the sender into
        # a regex: '.' in a host name must not act as a wildcard, and an
        # empty sender must not turn into an empty pattern (which reuses
        # the last successful pattern).
        my $sender  = $log_ref->{sender};
        my $from_mn = 0;
        if (defined($sender) and $sender ne "") {
            $from_mn = scalar(grep { index($_, $sender) >= 0 } @$candidate_mn_hostname_in_log_ref);
        }

        if ($from_mn) {
            handle_cluster_msg($log_ref, $node_state_ref);
        } else {
            handle_compute_msg($log_ref, $node_state_ref);
        }
    } elsif ($log_ref->{label} == $::LOGLABEL_HTTP) {
        handle_http_msg($log_ref, $node_state_ref);
    }
}
|
|
|
|
#------------------------------------------
|
|
|
|
=head3
|
|
Description:
|
|
Set node state in hash %node_state
|
|
Arguments:
|
|
node_state_ref: (input/output attribute), the reference of hash "%node_state". refer to function "init_node_state" for the structure of "%node_state"
|
|
node : (input attribute) The node name
|
|
newstate : (input attribute) The new state of node
|
|
Returns:
|
|
NULL
|
|
=cut
|
|
|
|
#------------------------------------------
|
|
sub set_node_state {

    # Records a new state of a node. The power-on state starts a new
    # provision loop: the current history is appended to 'allstatehistory'
    # and 'statehistory' restarts with the power-on state. Any other state
    # is appended to 'statehistory' unless it is numerically equal to the
    # most recent entry.
    my ($node_state_ref, $node, $newstate) = @_;

    my $history = ($node_state_ref->{$node}{statehistory} ||= []);

    if ($newstate != $::STATE_POWER_ON) {

        # $history->[-1] is undef for an empty history
        push(@$history, $newstate) if ($history->[-1] != $newstate);
    } else {
        push @{ $node_state_ref->{$node}{allstatehistory} }, @$history;
        @$history = ($newstate);
    }
}
|
|
|