2
0
mirror of https://github.com/xcat2/xcat-core.git synced 2025-05-29 09:13:08 +00:00

Merge pull request #1653 from hu-weihua/osdeploy

Implement replay and state control function for os provision probe
This commit is contained in:
zet809 2016-08-12 23:35:50 +08:00 committed by GitHub
commit 93f32c5eeb

View File

@ -9,35 +9,158 @@ use probe_utils;
use xCAT::NetworkUtils;
use File::Basename;
use IO::Select;
use Time::Local;
use Data::Dumper;
use Getopt::Long qw(:config no_ignore_case);
my $program_name = basename("$0");
my $help;
my $test;
my $output = "stdout";
my $verbose = 0;
my $rst = 0;
my $maxwaittime = 60; #unit is minute
my $output = "stdout";
my $verbose = 0;
my $rst = 0;
my $noderange;
my %rawdata;
my %ipnodemap;
my %macmap;
my $terminal = 0;
my $installnic;
my $monitor = 0;
my $replaylog; # raw value of the -r option (e.g. "1h30m"); when set, history logs are replayed instead of live monitoring
my %rawdata;
#-%rawdata structure-------
# $rawdata{nodename}{"history"}      # array, the log records collected for this node
# $rawdata{nodename}{"state"}        # the latest state of this node (a value of %state_set), used by the state machine
# $rawdata{nodename}{"statehistory"} # array, every state this node has passed through, used by the state machine
#--------------------------
my %macmap;
#-%macmap structure-------
# $macmap{mac_addr}{"ip"}="x.x.x.x"   # or "NOIP" when the MAC is flagged !*NOIP* in the node definition
# $macmap{mac_addr}{"node"}="nodename"
#-------------------------
my %ipnodemap;
#-%ipnodemap structure-------
# $ipnodemap{ip_addr}="nodename"      # filled in when a DHCPACK or BOOTREPLY for a known MAC is seen
#---------------------------
my %monitor_nodes;
#- %monitor_nodes structure-------
# $monitor_nodes{nodename}{"status"} # no longer used
# $monitor_nodes{nodename}{"rst"}    # 0 while the node is still being watched, 1 once it reported status "booted"
#---------------------------------
#---------------------------------------------------------------
# Provision state machine
#
# %state_set          : state name   -> state number
# %state_set_reverse  : state number -> state name
# %valid_process      : the known-good sequences of state numbers,
#                       each tagged as a "reboot" or a "deploy" flow
# %next_state_for     : transition table used by reset_state()
#---------------------------------------------------------------
my %state_set = (
    unknown                  => 0,
    server_reboot            => 1,
    loaded_kernel_and_initrd => 2,
    kernel_and_initrd_got_ip => 3,
    install_os_packages      => 4,
    run_postscript           => 5,
    run_postbootscript       => 6,
    done                     => 7,
);

# The state numbers are unique, so flipping the pairs gives the reverse map.
my %state_set_reverse = reverse %state_set;

# Each process is the list of state numbers a node is expected to walk through.
my %valid_process = (
    1 => {
        type    => "reboot",
        process => [ @state_set{qw(unknown done)} ],
    },
    2 => {
        type    => "reboot",
        process => [ @state_set{qw(unknown server_reboot done)} ],
    },
    3 => {
        type    => "reboot",
        process => [ @state_set{qw(unknown server_reboot run_postbootscript done)} ],
    },
    4 => {
        type    => "deploy",
        process => [ @state_set{qw(unknown server_reboot loaded_kernel_and_initrd kernel_and_initrd_got_ip
              install_os_packages run_postscript server_reboot run_postbootscript done)} ],
    },
    5 => {
        type    => "deploy",
        process => [ @state_set{qw(unknown loaded_kernel_and_initrd kernel_and_initrd_got_ip
              install_os_packages run_postscript server_reboot run_postbootscript done)} ],
    },
);

# current state name -> { condition => next state name }
my %next_state_for = (
    unknown => {
        dhcp   => "server_reboot",
        booted => "done",
        tftp   => "loaded_kernel_and_initrd",
        http   => "loaded_kernel_and_initrd",
    },
    server_reboot => {
        booted     => "done",
        postscript => "run_postbootscript",
        tftp       => "loaded_kernel_and_initrd",
        http       => "loaded_kernel_and_initrd",
    },
    loaded_kernel_and_initrd => { dhcp       => "kernel_and_initrd_got_ip" },
    kernel_and_initrd_got_ip => { http       => "install_os_packages" },
    install_os_packages      => { postscript => "run_postscript" },
    run_postscript           => { dhcp       => "server_reboot" },
    run_postbootscript       => { booted     => "done" },
    done                     => {
        dhcp    => "server_reboot",
        poweron => "server_reboot",
    },
);

#---------------------------------------------------------------
# reset_state
#   Advance a node's state according to the event just observed.
# Arguments:
#   state_ref : reference to the scalar holding the current state number;
#               it is overwritten in place when a transition applies
#   condition : the observed event, one of "dhcp", "tftp", "http",
#               "postscript", "booted" or "poweron"
# Returns:
#   1 : the state was changed
#   0 : no transition is defined for this state/event pair, state untouched
#---------------------------------------------------------------
sub reset_state {
    my ($state_ref, $condition) = @_;

    # Find the name of the current state with a numeric comparison.
    my ($current_name) = grep { $$state_ref == $state_set{$_} } keys %state_set;
    return 0 unless (defined $current_name);

    my $transitions = $next_state_for{$current_name};
    return 0 unless (exists $transitions->{$condition});

    $$state_ref = $state_set{ $transitions->{$condition} };
    return 1;
}
my @candidate_svr_hostname_inlog;
my $svr_hostname_short = `hostname -s`;
chomp($svr_hostname_short);
my $svr_hostname_domain = `hostname -d`;
chomp($svr_hostname_domain);
push(@candidate_svr_hostname_inlog, $svr_hostname_short);
push(@candidate_svr_hostname_inlog, "$svr_hostname_short.$svr_hostname_domain");
$::USAGE = "Usage:
$program_name -h
$program_name -T
$program_name -n <node_range> [-V]
$program_name -n <node_range> -r <xxhxxm> [-V]
Description:
Do probe for os provision process, realtime monitor of os provision process.
Please run this before rpower node.
Do probe for os provision process. Realtime monitor or replay history of os provision process.
If do realtime monitor, please run this before rpower node.
Unsupport hierarchial structure now.
Options:
-h : Get usage information of $program_name
-T : To verify if $program_name can work, reserve option for probe framework
-V : Output more information for debug
-n : The range of monitored node.
-n : The range of node to be monitor or replay log.
-t : The maximum time to wait when doing monitor, the unit is minute, default is 60 minutes.
-r : Replay history log to probe provision. need input a start time when probe should begin from.
Support time format are xxhxxm, xxh, or xxm. h means hour, m means minute.
If there isn't unit input, using hour by default.
";
#------------------------------------------
@ -54,58 +177,74 @@ Options:
#------------------------------------------
sub check_noderange {
my $node_range = shift;
my @cmdoutput = `lsdef $node_range -i ip,mac 2>&1`;
my $rst = 0;
my $currentnode = "";
my $ip = "NOIP";
my $node_range = shift;
my @cmdoutput = `lsdef $node_range -i ip,mac -c 2>&1`;
my $rst = 0;
my %nodecheckrst;
my $mac_line;
my @macs;
foreach (@cmdoutput) {
chomp($_);
$_ =~ s/^\s+|\s+$//g;
if ($_ =~ /^Error: Could not find an object named '(\w+)' .+/i) {
$currentnode = $1;
$nodecheckrst{$currentnode}{"error"} = "Could not find node definition";
$rst = 1;
} elsif ($_ =~ /^\s*Object name: (\w+)/i) {
# 'rst' is used to check whether the node process finished, 1 is finished.
$monitor_nodes{$1}{"rst"} = 0;
$currentnode = $1;
$ip = "NOIP";
} elsif ($_ =~ /^ip=(.+)/i) {
if ($1) {
$ip = $1;
$nodecheckrst{$1}{"error"} = "Could not find node definition";
} elsif ($_ =~ /(\S+):\s+mac=(.*)/i) {
my $node = $1;
my $mac = $2;
if ($mac) {
$nodecheckrst{$node}{"mac"} = $mac;
} else {
$nodecheckrst{$node}{"error"} = "Node $1 doesn't have MAC address";
}
} elsif ($_ =~ /^mac=(.+)/i) {
next unless ($1);
$mac_line = $1;
@macs = split(/\|/, $mac_line);
foreach my $mac (@macs) {
if ($mac =~ /\!\*NOIP\*/) {
$mac =~ s/\!\*NOIP\*//g;
$macmap{$mac}{"ip"} = "NOIP";
$macmap{$mac}{"node"} = $currentnode;
}
else {
$macmap{$mac}{"ip"} = $ip;
$macmap{$mac}{"node"} = $currentnode;
}
} elsif ($_ =~ /(\S+):\s+ip=(.*)/i) {
my $node = $1;
my $ip = $2;
if ($ip) {
$nodecheckrst{$node}{"ip"} = $ip;
}
}
}
foreach my $node (keys %nodecheckrst) {
probe_utils->send_msg("$output", "d", "$node : $nodecheckrst{$node}{error}") if (exists($nodecheckrst{$node}{error}));
if (exists($nodecheckrst{$node}{error})) {
probe_utils->send_msg("$output", "d", "$node : $nodecheckrst{$node}{error}");
$rst = 1;
next;
}
my $noerror=1;
my @macs = split(/\|/, $nodecheckrst{$node}{"mac"});
foreach my $mac (@macs) {
#[NOTE] don't support 2 adapters in the same network now. TODO
if ($mac =~ /\!\*NOIP\*/) {
$mac =~ s/\!\*NOIP\*//g;
$macmap{$mac}{"ip"} = "NOIP";
$macmap{$mac}{"node"} = $node;
} else {
$macmap{$mac}{"node"} = $node;
if($nodecheckrst{$node}{"ip"}){
$macmap{$mac}{"ip"} = $nodecheckrst{$node}{"ip"};
}else{
my $nodeip = xCAT::NetworkUtils->getipaddr($node);
if($nodeip){
$macmap{$mac}{"ip"} = $nodeip;
}else{
$noerror = 0;
$rst = 1;
probe_utils->send_msg("$output", "f", "$node : can't be resolved to a IP address");
}
}
}
}
$monitor_nodes{$node}{"rst"} = 0 if($noerror);
}
unless (%monitor_nodes) {
probe_utils->send_msg("$output", "d", "There is no node to be monitored");
probe_utils->send_msg("$output", "d", "There is no valid node to handle");
$rst = 1;
}
return $rst;
}
@ -116,7 +255,7 @@ sub check_noderange {
Handle one line log come from dhcp log file
Arguments:
msg: one line http log
nics: target network interfaces
installnic: target network interfaces
Returns:
0 : pass
1 : failed
@ -132,8 +271,8 @@ sub handle_dhcp_msg {
if (exists $macmap{$mac}) {
my $node = $macmap{$mac}{"node"};
my $record = "Receive DHCPDISCOVER from [$node] $mac via $nic";
probe_utils->send_msg("$output", "d", "$record");
my $record = "Receive DHCPDISCOVER via $nic";
probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
push(@{ $rawdata{$node}{"history"} }, $record);
}
} elsif ($msg =~ /.+DHCPOFFER\s+on\s+(.+)\s+to\s+(.+)\s+via\s+(.+)/i) {
@ -143,8 +282,8 @@ sub handle_dhcp_msg {
if (exists $macmap{$mac}) {
my $node = $macmap{$mac}{"node"};
my $record = "Send DHCPOFFER on $ip back to [$node] $mac via $nic";
probe_utils->send_msg("$output", "d", "$record");
my $record = "Send DHCPOFFER on $ip back to $mac via $nic";
probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
push(@{ $rawdata{$node}{"history"} }, $record);
}
@ -155,16 +294,10 @@ sub handle_dhcp_msg {
my $nic = $4;
if (exists $macmap{$mac}) {
my $node = $macmap{$mac}{"node"};
my $record = "Receive DHCPREQUEST from [$node] $mac for $ip via $nic";
probe_utils->send_msg("$output", "d", "$record");
my $node = $macmap{$mac}{"node"};
my $record = "Receive DHCPREQUEST from $mac for $ip via $nic";
probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
push(@{ $rawdata{$node}{"history"} }, $record);
if ($macmap{$mac}{"ip"} != "NOIP" and $macmap{$mac}{"ip"} != $ip) {
my $warn_msg = "The ip of [$node] $mac from DHCP $ip is different with definition $macmap{$mac}{'ip'}.";
probe_utils->send_msg("$output", "w", "$warn_msg");
push(@{ $rawdata{$node}{"history"} }, $warn_msg);
}
}
} elsif ($msg =~ /.+DHCPACK\s+on\s+(.+)\s+to\s+(.+)\s+via\s+(.+)/) {
my $ip = $1;
@ -173,19 +306,25 @@ sub handle_dhcp_msg {
if (exists $macmap{$mac}) {
my $node = $macmap{$mac}{"node"};
my $record = "Send DHCPACK on $ip back to [$node] $mac via $nic";
probe_utils->send_msg("$output", "d", "$record");
my $record = "Send DHCPACK on $ip back to $mac via $nic";
probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
push(@{ $rawdata{$node}{"history"} }, $record);
$ipnodemap{$ip} = $node;
push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "dhcp"));
# Warn when the address handed out by DHCP differs from the one recorded for
# this MAC. The values are the marker "NOIP" or dotted-quad strings, so they
# must be compared with 'ne': the numeric '!=' turns "NOIP" into 0 and keeps
# only the leading "a.b" of an address, hiding most real mismatches.
# An unset definition ip is skipped, as there is nothing to compare against.
if (defined($macmap{$mac}{"ip"}) and $macmap{$mac}{"ip"} ne "NOIP" and $macmap{$mac}{"ip"} ne $ip) {
    my $warn_msg = "The ip($ip) assigned to $mac via DHCP is different with the ip($macmap{$mac}{'ip'}) in node definition.";
    probe_utils->send_msg("$output", "w", "$warn_msg") if ($monitor);
    push(@{ $rawdata{$node}{"history"} }, $warn_msg);
}
}
} elsif ($msg =~ /.+BOOTREQUEST\s+from\s+(.+)\s+via\s+([^:]+)(.*)/) {
my $mac = $1;
my $nic = $2;
if (exists $macmap{$mac}) {
my $node = $macmap{$mac}{"node"};
my $record = "Receive BOOTREQUEST from [$node] $mac via $nic";
probe_utils->send_msg("$output", "d", "$record");
my $record = "Receive BOOTREQUEST from $mac via $nic";
probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
push(@{ $rawdata{$node}{"history"} }, $record);
}
} elsif ($msg =~ /.+BOOTREPLY\s+for\s+(.+)\s+to\s+.+(\w\w:\w\w:\w\w:\w\w:\w\w:\w\w).+via\s+(.+)/) {
@ -195,14 +334,15 @@ sub handle_dhcp_msg {
if (exists $macmap{$mac}) {
my $node = $macmap{$mac}{"node"};
my $record = "Send BOOTREPLY on $ip back to [$node] $mac via $nic";
probe_utils->send_msg("$output", "d", "$record");
my $record = "Send BOOTREPLY on $ip back to $mac via $nic";
probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
push(@{ $rawdata{$node}{"history"} }, $record);
$ipnodemap{$ip} = $node;
push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "dhcp"));
if ($macmap{$mac}{"ip"} != "NOIP" and $macmap{$mac}{"ip"} != $ip) {
my $warn_msg = "The ip of [$node] $mac from DHCP $ip is different with definition $macmap{$mac}{'ip'}.";
probe_utils->send_msg("$output", "w", "$warn_msg");
my $warn_msg = "The ip($ip) assigned to $mac via DHCP is different with the ip($macmap{$mac}{'ip'}) in node definition.";
probe_utils->send_msg("$output", "w", "$warn_msg") if ($monitor);
push(@{ $rawdata{$node}{"history"} }, $warn_msg);
}
}
@ -230,11 +370,11 @@ sub handle_tftp_msg {
if ($msg =~ /RRQ\s+from\s+(.+)\s+filename\s+(.+)/i) {
my $ip = $1;
my $file = $2;
my $record = "[$ipnodemap{$ip}] Via TFTP $ip download $file";
my $record = "Via TFTP download $file";
if (exists($rawdata{"$ipnodemap{$ip}"})) {
probe_utils->send_msg("$output", "d", "$record");
probe_utils->send_msg("$output", "d", "[$ipnodemap{$ip}] $record") if ($monitor);
push(@{ $rawdata{ $ipnodemap{$ip} }{"history"} }, $record);
push(@{ $rawdata{ $ipnodemap{$ip} }{statehistory} }, $rawdata{ $ipnodemap{$ip} }{state}) if (reset_state(\$rawdata{ $ipnodemap{$ip} }{state}, "tftp"));
}
}
}
@ -257,11 +397,12 @@ sub handle_http_msg {
if ($msg =~ /(\d+\.\d+.\d+.\d+)\s.+GET\s+(.+)\s+HTTP.+/) {
my $ip = $1;
my $file = $2;
my $record = "[$ipnodemap{$ip}] Via HTTP $ip GET $file";
my $record = "Via HTTP get $file";
if (exists($rawdata{"$ipnodemap{$ip}"})) {
probe_utils->send_msg("$output", "d", "$record");
probe_utils->send_msg("$output", "d", "[$ipnodemap{$ip}] $record") if ($monitor);
push(@{ $rawdata{ $ipnodemap{$ip} }{"history"} }, $record);
push(@{ $rawdata{ $ipnodemap{$ip} }{statehistory} }, $rawdata{ $ipnodemap{$ip} }{state}) if (reset_state(\$rawdata{ $ipnodemap{$ip} }{state}, "http"));
}
}
return 0;
@ -286,37 +427,29 @@ sub handle_cluster_msg {
my $msg;
my $status;
if ($line =~ /.+\s+xcat:\s+(.+)\s+status:\s+(.+)\s+statustime:\s(.+)/) {
$node = $1;
$status = $2;
my @splitline = split(/\s+/, $line);
if (($splitline[4] =~ /^xcat/i) || ($splitline[5] =~ /^xcat/i)) {
if (exists($rawdata{$node})) {
my $record = "Receive from $node : status is $status";
probe_utils->send_msg("$output", "d", "$record");
push(@{ $rawdata{$node}{"history"} }, $record);
}
#log like: Aug 7 22:30:31 c910f02c01p09 xcat: c910f02c04p04 status: booted statustime: 08-07-2016 22:30:31
if (($splitline[6] =~ /^status:$/i) && ($splitline[8] =~ /^statustime:$/)) {
$node = $splitline[5];
$status = $splitline[7];
if (exists($rawdata{$node})) {
my $record = "Node status is changed to $status";
probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
push(@{ $rawdata{$node}{"history"} }, $record);
# When receive 'status is booted', check whether the $monitor_nodes{$node}{"status"} is installing.
# If so, the node has finished its os provision.
if (exists($rawdata{$node}) and ($status eq "booted")) {
if ($monitor_nodes{$node}{"status"} eq "installing") {
$record = "Node $node has finished it's os provision process";
probe_utils->send_msg("$output", "o", "$record");
push(@{ $rawdata{$node}{"history"} }, $record);
} else {
$record = "NO installing process detected for node $node";
probe_utils->send_msg("$output", "f", "$record");
push(@{ $rawdata{$node}{"history"} }, $record);
#one node finish deployment
if ($status eq "booted") {
$monitor_nodes{$node}{"rst"} = 1 if (defined($monitor_nodes{$node}));
push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "booted"));
}
if ($status eq "powering-on") {
push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "poweron"));
}
}
$monitor_nodes{$node}{"rst"} = 1 if (defined($monitor_nodes{$node}));
} elsif (exists($rawdata{$node}) and ($status eq "failed")) {
$monitor_nodes{$node}{"rst"} = 1 if (defined($monitor_nodes{$node}));
probe_utils->send_msg("$output", "f", "Node $node has finished it's os provision process");
push(@{ $rawdata{$node}{"history"} }, "Node $node os provision failed");
} elsif (exists($rawdata{$node}) and ($status eq "installing")) {
# record 'installing' status, to check when receive 'booted' status
$monitor_nodes{$node}{"status"} = "installing";
}
}
return 0;
@ -350,10 +483,15 @@ sub handle_compute_msg {
} else {
$node = $ipnodemap{$sender};
}
if ($node ne "" && exists($rawdata{$node})) {
my $record = "Receive from $node : $msg";
probe_utils->send_msg("$output", "d", "$record");
push(@{ $rawdata{$node}{"history"} }, $record);
probe_utils->send_msg("$output", "d", "[$node] $msg") if ($monitor);
push(@{ $rawdata{$node}{"history"} }, $msg);
#node start to run postscript or postbootscript
if ($msg =~ /Running postscript/) {
push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "postscript"));
}
}
}
@ -407,30 +545,113 @@ sub dump_history {
print "$title\n";
foreach $node (keys %rawdata) {
my $line_num = 0;
my $http_num = 0;
my $length_http;
for (my $i = @{ $rawdata{$node}{"history"} } ; $i >= 0 ; $i--) {
if (${ $rawdata{$node}{"history"} }[$i] =~ /Via HTTP/) {
$length_http = $i;
last;
if ($verbose) {
print "[$node]\n";
my $httphit = 0;
my @httptmp;
foreach my $line (@{ $rawdata{$node}{"history"} }) {
if ($line =~ /Via HTTP/) {
if ($httphit) {
shift @httptmp if ($#httptmp > 0);
push @httptmp, $line;
} else {
print "\t$line\n";
$httphit = 1;
}
} else {
if ($#httptmp > -1) {
print "\tVia HTTP ..........\n";
print "\t$_\n" foreach (@httptmp);
}
@httptmp = ();
$httphit = 0;
print "\t$line\n";
}
}
}
foreach my $line (@{ $rawdata{$node}{"history"} }) {
# Print http message less than 10 lines
if ($line =~ /Via HTTP/)
{
if (($http_num <= 4) or ($length_http - $line_num <= 4)) {
probe_utils->send_msg("$output", "d", "\t$line");
my @tmpnodestatehistory = @{ $rawdata{$node}{statehistory} };
#print "state history = @tmpnodestatehistory\n";
my %match_result;
my $procidx = 0;
my $newloop = 0;
my $notfirstloop = 0;
while (@tmpnodestatehistory) {
undef %match_result if ($notfirstloop);
$newloop = 0;
foreach my $type (keys %valid_process) {
if ($notfirstloop) {
$procidx = 1;
} else {
probe_utils->send_msg("$output", "d", "\t......") if ($http_num == 5);
$procidx = 0;
}
$http_num++;
} else {
probe_utils->send_msg("$output", "d", "\t$line");
my $proclen = scalar(@{ $valid_process{$type}{process} });
my $i;
# Compare the node's recorded states, in order, with the candidate process
# $type. $procidx is the position reached inside the candidate process.
for ($i = 0 ; $i < scalar(@tmpnodestatehistory) ; $i++) {
    if ($procidx < $proclen) {
        if ($tmpnodestatehistory[$i] == $valid_process{$type}{process}[$procidx]) {
            ++$procidx;
        } else {
            # Mismatch: remember how far this candidate got.
            # The flag needs its '$' sigil; the bareword 'notfirstloop' is a
            # non-empty string and therefore always true, which decremented
            # $procidx on the first pass as well.
            --$procidx if ($notfirstloop);
            push @{ $match_result{$procidx} }, $type;
            last;
        }
    } else {
        # The whole candidate process was matched and history is left over:
        # drop the consumed states and start another matching round.
        splice(@tmpnodestatehistory, 0, $i);
        $newloop      = 1;
        $notfirstloop = 1;
        last;
    }
}
if ($i == scalar(@tmpnodestatehistory)) {
push @{ $match_result{$procidx} }, $type;
next;
}
last if ($newloop);
}
last if (!$newloop);
}
#print "------------result---------------\n";
#print Dumper %match_result;
my $max_match = 0;
foreach my $key (keys %match_result) {
$max_match = $key if ($key > $max_match);
}
my $formatprefix;
if ($max_match == 0) {
my $statelist = "";
for (my $i = 0 ; $i < scalar(@{ $rawdata{$node}{statehistory} }) ; $i++) {
$statelist .= "$state_set_reverse{$rawdata{$node}{statehistory}[$i]} ";
}
probe_utils->send_msg("$output", "f", "[$node] deployment failed");
probe_utils->send_msg("$output", "d", "\t$node did unknown process, state change history is $statelist");
} elsif (scalar(@{ $match_result{$max_match} }) > 1) {
probe_utils->send_msg("$output", "f", "[$node] deployment failed");
probe_utils->send_msg("$output", "d", "\tThere are more than one possible process satisfy $node situation");
foreach my $proc (@{ $match_result{$max_match} }) {
if ($valid_process{$proc}{type} eq "deploy") {
probe_utils->send_msg("$output", "d", "\tpossible process \"deploy\", pass $state_set_reverse{$valid_process{$proc}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$proc}{process}[$max_match]} stage");
} elsif ($valid_process{$proc}{type} eq "reboot") {
probe_utils->send_msg("$output", "d", "\tpossible process \"reboot\", pass $state_set_reverse{$valid_process{$proc}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$proc}{process}[$max_match]} stage");
}
}
} else {
if (($valid_process{ $match_result{$max_match}[0] }{type} eq "deploy") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] == $state_set{done})) {
probe_utils->send_msg("$output", "o", "[$node] deployment completed");
} elsif (($valid_process{ $match_result{$max_match}[0] }{type} eq "deploy") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] != $state_set{done})) {
probe_utils->send_msg("$output", "f", "[$node] deployment failed, pass $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match]} stage")
} elsif (($valid_process{ $match_result{$max_match}[0] }{type} eq "reboot") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] == $state_set{done})) {
probe_utils->send_msg("$output", "f", "[$node] reboot completed, without deployment process");
} elsif (($valid_process{ $match_result{$max_match}[0] }{type} eq "reboot") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] != $state_set{done})) {
probe_utils->send_msg("$output", "f", "[$node] reboot failed,without deployment process, stop at $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match]} stage");
}
$line_num++;
}
}
}
@ -451,29 +672,6 @@ sub do_monitor {
$terminal = 1;
};
my $msg = "All pre_defined nodes are valid";
my $rc = check_noderange($noderange);
if ($rc) {
probe_utils->send_msg("$output", "f", $msg);
$rst = 1;
unless (%monitor_nodes) {
return $rst;
}
} else {
probe_utils->send_msg("$output", "o", $msg);
}
if (!$nics) {
my $masteripinsite = `tabdump site | awk -F',' '/^"master",/ { gsub(/"/, "", \$2) ; print \$2 }'`;
chomp($masteripinsite);
$nics = `ip addr |grep -B2 $masteripinsite|awk -F" " '/mtu/{gsub(/:/,"",\$2); print \$2}'`;
chomp($nics);
if (!$nics) {
probe_utils->send_msg("$output", "f", "The value of master in site table is $masteripinsite, can't get corresponding network interface");
return 1;
}
}
my $rst = 0;
my $startline =
"-------------------------------------------------------------
@ -514,6 +712,7 @@ Start to capture every message during os provision process......
if (!-e "$clusterlog") {
probe_utils->send_msg("$output", "w", "$clusterlog doesn't exist");
probe_utils->send_msg("$output", "i", "If sles11 or xCAT2.11.x is using, please ignore above warning");
} else {
if (!($clusterpid = open(CLUSTERLOGFILE, "tail -f -n 0 $clusterlog 2>&1 |"))) {
probe_utils->send_msg("$output", "f", "Can't open $clusterlog to get logs");
@ -532,6 +731,7 @@ Start to capture every message during os provision process......
}
if (!-e "$computelog") {
probe_utils->send_msg("$output", "w", "$computelog doesn't exist");
probe_utils->send_msg("$output", "i", "If sles11 or xCAT2.11.x is using, please ignore above warning");
} else {
if (!($computerpid = open(COMPUTERFILE, "tail -f -n 0 $computelog 2>&1 |"))) {
probe_utils->send_msg("$output", "f", "Can't open $computelog to get logs");
@ -551,21 +751,15 @@ Start to capture every message during os provision process......
my @hdls;
my $hdl;
my @candidate_svr_hostname_inlog;
my $svr_hostname_short = `hostname -s`;
chomp($svr_hostname_short);
my $svr_hostname_domain = `hostname -d`;
chomp($svr_hostname_domain);
push(@candidate_svr_hostname_inlog, $svr_hostname_short);
push(@candidate_svr_hostname_inlog, "$svr_hostname_short.$svr_hostname_domain");
my $starttime = time();
$monitor = 1;
for (; ;) {
if (@hdls = $select->can_read(0)) {
foreach $hdl (@hdls) {
if ($hdl == \*VARLOGMSGFILE) {
chomp($line = <VARLOGMSGFILE>);
my @tmp = split(/\s+/, $line);
if ($tmp[4] =~ /dhcpd:/i && $line =~ /$nics/) {
if ($tmp[4] =~ /dhcpd:/i && $line =~ /$installnic/) {
handle_dhcp_msg("$line");
} elsif ($tmp[4] =~ /in.tftpd/i) {
handle_tftp_msg("$line");
@ -596,7 +790,13 @@ Start to capture every message during os provision process......
probe_utils->send_msg("$output", "o", "All nodes need to monitor have finished os provision process");
}
last;
} sleep 0.01;
}
# Give up once the maximum waiting time ($maxwaittime, in minutes) has passed.
if (time() - $starttime > ($maxwaittime * 60)) {
    probe_utils->send_msg("$output", "i", "$maxwaittime minutes are expired, stop monitor");
    last;
}

# Pause 10ms between polls. The builtin sleep() only handles whole seconds,
# so 'sleep 0.01' returns at once and the loop spins at full CPU; the
# four-argument select() accepts a fractional timeout.
select(undef, undef, undef, 0.01);
}
&dump_history;
}
@ -612,6 +812,177 @@ Start to capture every message during os provision process......
return $rst;
}
#------------------------------------------
# _log_line_epoch
#   Convert the timestamp of one log line to epoch seconds.
# Arguments:
#   line                 : one log line
#   ishttplog            : true when the line comes from the http access log,
#                          whose timestamp is the 4th field and starts with '['
#   year                 : passed through to probe_utils->convert_to_epoch_seconds
#   epoch_seconds_of_now : passed through to probe_utils->convert_to_epoch_seconds
# Returns:
#   whatever probe_utils->convert_to_epoch_seconds returns for that timestamp
#------------------------------------------
sub _log_line_epoch {
    my ($line, $ishttplog, $year, $epoch_seconds_of_now) = @_;

    my @fields = split(/\s+/, $line);
    if ($ishttplog) {
        $fields[3] =~ s/^\[(.+)/$1/g;
        return probe_utils->convert_to_epoch_seconds($fields[3]);
    }

    # Other logs: the first three fields form the timestamp, e.g. "Aug 7 22:30:31"
    my $timestamp = join(" ", @fields[ 0 .. 2 ]);
    return probe_utils->convert_to_epoch_seconds($timestamp, $year, $epoch_seconds_of_now);
}

#------------------------------------------
# get_valid_logs
#   Collect, from the log files and their rotated copies, the dhcpd, tftpd,
#   xcat and http GET lines starting near the reference time.
# Arguments:
#   ref_timestamp        : epoch seconds the search starts from
#   year                 : passed to probe_utils->convert_to_epoch_seconds
#   epoch_seconds_of_now : passed to probe_utils->convert_to_epoch_seconds
#   bthistory_ref        : array reference receiving the result; each element
#                          is "<epoch seconds> <original log line>", sorted
#                          and with duplicate entries removed
#------------------------------------------
sub get_valid_logs {
    my $ref_timestamp        = shift;
    my $year                 = shift;
    my $epoch_seconds_of_now = shift;
    my $bthistory_ref        = shift;

    # dhcpd lines are kept only when they mention the installation nic.
    # Use the detected one; "eth0" stays as fallback when none is known.
    my $nics = (defined($installnic) && length($installnic)) ? $installnic : "eth0";

    my @orglogfilelist = ("/var/log/xcat/cluster.log",
        "/var/log/messages",
        "/var/log/xcat/computes.log");

    # Add the first http access log found, if any.
    my ($httplog) = grep { -e $_ } ("/var/log/httpd/access_log",
        "/var/log/apache2/access_log",
        "/var/log/apache2/access.log");
    push @orglogfilelist, $httplog if (defined $httplog);

    foreach my $f (@orglogfilelist) {
        my $logname = basename("$f");

        # Decide this before the extension is stripped, otherwise
        # "access.log" becomes "access" and is not recognised as http log.
        my $ishttplog = ($logname =~ /access[\._]log/) ? 1 : 0;

        my $filename = $logname;
        $filename =~ s/(.+)\.(.+)/$1/g;
        my $path_only = dirname("$f");

        # The log file and its rotated copies, as listed by 'ls -lt'
        #TODO using opendir to refine below code
        my @rotatefiles;
        my @alltargetfiles = `ls -lt $path_only |awk -F" " '/ $filename/ {print \$9}'`;
        foreach my $samenamefile (@alltargetfiles) {
            chomp($samenamefile);
            push @rotatefiles, "$path_only/$samenamefile";
        }

        foreach my $file (@rotatefiles) {
            my $filetype = `file $file 2>&1`;
            chomp($filetype);
            unless ($filetype =~ /ASCII text/) {
                #TODO handle compression files
                # Nothing was opened, so there is nothing to read here.
                next;
            }

            my $fd;
            if (!open($fd, '<', $file)) {
                print "open $file failed\n";
                next;
            }

            #print "--->load $file\n";
            my $line = <$fd>;
            unless (defined $line) {
                # Empty file: it says nothing about the time range, so look
                # at the next candidate instead of giving up.
                close($fd);
                next;
            }
            chomp($line);

            my $historynum = 0;    # byte offset where collecting starts
            my $needrotate = 0;    # 1 when the next candidate file must be read too
            my $logindex   = 0;    # keeps the in-file order of lines sharing a timestamp

            my $timestampepoch = _log_line_epoch($line, $ishttplog, $year, $epoch_seconds_of_now);
            if ($ref_timestamp <= $timestampepoch) {
                # Even the first line is not older than the reference time:
                # take the whole file and continue with the next candidate.
                $needrotate = 1;
            } else {
                # Binary search on byte offsets for the reference time.
                seek($fd, 0, 2);
                my $tail     = tell($fd);
                my $head     = 0;
                my $lasttail = $tail;
                while ($head <= $tail) {
                    my $middle = int(($tail - $head) / 2) + $head;
                    seek($fd, $middle, 0);

                    # The first read normally lands inside a line; skip to
                    # the start of the following one.
                    my $partial = <$fd>;
                    last unless (defined $partial);
                    $middle += length($partial);

                    $line = <$fd>;
                    last unless (defined $line);

                    $timestampepoch = _log_line_epoch($line, $ishttplog, $year, $epoch_seconds_of_now);
                    if ($ref_timestamp == $timestampepoch) {
                        $historynum = $middle;
                        last;
                    } elsif ($ref_timestamp < $timestampepoch) {
                        $tail = $middle;
                        last if ($tail == $lasttail);
                        $lasttail = $tail;
                    } else {
                        $head = $middle;
                    }
                }
                $historynum = $head unless ($historynum);
            }

            seek($fd, $historynum, 0);
            while (my $entry = <$fd>) {
                chomp($entry);
                my @splitline = split(/\s+/, $entry);
                $timestampepoch = _log_line_epoch($entry, $ishttplog, $year, $epoch_seconds_of_now);

                if (($splitline[4] =~ /dhcpd:/i && index($entry, $nics) >= 0)
                    || ($splitline[4] =~ /in.tftpd/i)
                    || (($splitline[4] =~ /^xcat/i) || ($splitline[5] =~ /^xcat/i))
                    || ($splitline[5] =~ /GET/ && $splitline[7] =~ /HTTP/)) {

                    # The index is zero padded because the entries are sorted
                    # as strings below; unpadded, "10" would sort before "2".
                    my $log = sprintf("%s %s%08d %s", $timestampepoch, $filename, $logindex, $entry);
                    $logindex++;
                    push @$bthistory_ref, $log;
                }
            }
            close($fd);

            last unless ($needrotate);
        }
    }

    #sort logs depending on time
    my @sort_ht = sort(@$bthistory_ref);

    # Drop the "<file name><index>" field that was only needed for sorting.
    foreach my $entry (@sort_ht) {
        $entry =~ s/^(\d+) (\S+) (.+)/$1 $3/g;
    }

    #delete duplicate logs
    my %count;
    @$bthistory_ref = grep { ++$count{$_} < 2; } @sort_ht;
}
#------------------------------------------
# do_replay
#   Replay the history logs written since the reference time through the
#   same message handlers used for realtime monitoring, then print the
#   result with dump_history.
# Arguments:
#   ref_timestamp : epoch seconds the replay starts from
# Returns:
#   0
#------------------------------------------
sub do_replay {
    my $ref_timestamp = shift;

    my $timestr = scalar(localtime($ref_timestamp));
    print "Start to search logs after '$timestr', please waiting for a while.............\n";

    my $epoch_seconds_of_now = time();
    my $year                 = (localtime($epoch_seconds_of_now))[5];

    my @bthistory;
    get_valid_logs($ref_timestamp, $year, $epoch_seconds_of_now, \@bthistory);

    foreach my $line (@bthistory) {
        # Every entry is "<epoch seconds> <original log line>"; the handlers
        # expect the original line only.
        $line =~ s/^\d+ //;

        my @tmp = split(/\s+/, $line);

        # Keep only dhcpd traffic seen on the installation nic. The nic lives
        # in $installnic; a plain substring test is used so that neither an
        # empty value nor regex metacharacters in it can change the match.
        if ($tmp[4] =~ /dhcpd:/i && defined($installnic) && index($line, $installnic) >= 0) {
            handle_dhcp_msg("$line");
        } elsif ($tmp[4] =~ /in.tftpd/i) {
            handle_tftp_msg("$line");
        } elsif (($tmp[4] =~ /^xcat/i) || ($tmp[5] =~ /^xcat/i)) {
            # A line logged by this server is a cluster message, anything
            # else comes from a compute node. Host names contain dots, so
            # they are compared as strings, not as regex.
            if (grep { defined($tmp[3]) && $_ eq $tmp[3] } @candidate_svr_hostname_inlog) {
                handle_cluster_msg("$line");
            } else {
                handle_compute_msg("$line");
            }
        } elsif ($tmp[5] =~ /GET/ && $tmp[7] =~ /HTTP/) {
            handle_http_msg("$line");
        }
    }

    dump_history();
    return 0;
}
#-------------------------------------
# main process
#-------------------------------------
@ -619,6 +990,8 @@ if (
# -t needs "=i": it takes the maximum waiting time in minutes. Declared as a
# bare "t" it is a boolean flag, which sets $maxwaittime to 1 and leaves the
# number given by the user behind in @ARGV.
!GetOptions("--help|h|?" => \$help,
    "T"   => \$test,
    "V"   => \$verbose,
    "t=i" => \$maxwaittime,
    "r=s" => \$replaylog,
    "n=s" => \$noderange))
{
probe_utils->send_msg("$output", "f", "Invalid parameter for $program_name");
@ -641,11 +1014,60 @@ if ($test) {
}
unless ($noderange) {
probe_utils->send_msg("$output", "f", "Option -n is required");
probe_utils->send_msg("$output", "f", "A noderange is needed");
probe_utils->send_msg("$output", "d", "$::USAGE");
exit 1;
}
# Work out where a replay starts: now minus the period given with -r.
# Accepted forms are xxhxxm, xxh, xxm and a bare number (taken as hours).
# The patterns are anchored so that input with trailing or leading garbage
# is rejected, and the unit letters are case-insensitive in every form.
my $epoch_starttime = time();
if (defined($replaylog)) {
    if ($replaylog =~ /^(\d+)h(\d+)m$/i) {
        $epoch_starttime -= ($1 * 3600 + $2 * 60);
    } elsif ($replaylog =~ /^(\d+)h?$/i) {
        $epoch_starttime -= $1 * 3600;
    } elsif ($replaylog =~ /^(\d+)m$/i) {
        $epoch_starttime -= $1 * 60;
    } else {
        probe_utils->send_msg("$output", "f", "Unsupport time format for replay history log");
        print "$::USAGE";
        exit 1;
    }
}

# Validate the node range; this also fills %macmap and %monitor_nodes.
my $msg = "All pre_defined nodes are valid";
my $rc  = check_noderange($noderange);
if ($rc) {
    probe_utils->send_msg("$output", "f", $msg);
    $rst = 1;
} else {
    probe_utils->send_msg("$output", "o", $msg);
}

# Find the installation nic: the interface carrying the address stored as
# "master" in the site table.
unless ($installnic) {
    my $masteripinsite = `tabdump site | awk -F',' '/^"master",/ { gsub(/"/, "", \$2) ; print \$2 }'`;
    chomp($masteripinsite);

    if ($masteripinsite) {
        # -F -w: match the address literally and as a whole word. As a plain
        # regex, 10.1.1.1 would also match 10.1.1.10 or 10.1.1.100.
        $installnic = `ip addr |grep -B2 -Fw '$masteripinsite'|awk -F" " '/mtu/{gsub(/:/,"",\$2); print \$2}'`;
        chomp($installnic);
    }

    if (!$installnic) {
        probe_utils->send_msg("$output", "f", "The value of master in site table is $masteripinsite, can't get corresponding network interface");
        $rst = 1;
    } else {
        probe_utils->send_msg("$output", "i", "The installation network interface is $installnic");
    }
}

exit $rst if ($rst);

# Every node to handle starts in state "unknown".
foreach my $node (keys %monitor_nodes) {
    $rawdata{$node}{state} = $state_set{unknown};
    push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state});
}

# -r given: replay history logs; otherwise monitor in real time.
if (defined($replaylog)) {
    $rst = do_replay($epoch_starttime);
    exit $rst;
}

$rst = do_monitor();
exit $rst;