diff --git a/xCAT-probe/subcmds/osdeploy b/xCAT-probe/subcmds/osdeploy index dabbd16fe..5d3a8e025 100755 --- a/xCAT-probe/subcmds/osdeploy +++ b/xCAT-probe/subcmds/osdeploy @@ -9,35 +9,158 @@ use probe_utils; use xCAT::NetworkUtils; use File::Basename; use IO::Select; +use Time::Local; +use Data::Dumper; use Getopt::Long qw(:config no_ignore_case); my $program_name = basename("$0"); my $help; my $test; -my $output = "stdout"; -my $verbose = 0; -my $rst = 0; +my $maxwaittime = 60; #unit is minute +my $output = "stdout"; +my $verbose = 0; +my $rst = 0; my $noderange; -my %rawdata; -my %ipnodemap; -my %macmap; my $terminal = 0; +my $installnic; +my $monitor = 0; +my $replaylog; ##used by feature replay deploymen log + +my %rawdata; + +#-%rawdata structure------- +# $rawdata{nodename}{"history"} #array, the log history of current node +# $rawdata{nodename}{"state"} #the latest status of current node, used for State Machine +# $rawdata{nodename}{"statehistory"} #array, the history status of current node, used for State Machine +#-------------------------- + +my %macmap; + +#-%macmap structure------- +# $macmap{mac_addr}{"ip"}="x.x.x.x" +# $macmap{mac_addr}{"node"}="nodename" +#------------------------- + +my %ipnodemap; + +#-%ipnodemap structure------- +# $ipnodemap{ip_addr}="nodename" +#--------------------------- + my %monitor_nodes; +#- %monitor_nodes structure------- +# $monitor_nodes{nodename}{"status"} #useless now +# $monitor_nodes{nodename}{"rst"} +#--------------------------------- + +# provision state machine +my %state_set = ( + "unknown" => 0, + "server_reboot" => 1, + "loaded_kernel_and_initrd" => 2, + "kernel_and_initrd_got_ip" => 3, + "install_os_packages" => 4, + "run_postscript" => 5, + "run_postbootscript" => 6, + "done" => 7, +); + +my %state_set_reverse = ( + "0" => "unknown", + "1" => "server_reboot", + "2" => "loaded_kernel_and_initrd", + "3" => "kernel_and_initrd_got_ip", + "4" => "install_os_packages", + "5" => "run_postscript", + "6" => "run_postbootscript", + "7" => "done", +); + +my %valid_process; +$valid_process{1}{process} = [ $state_set{unknown}, $state_set{done} ]; +$valid_process{1}{type} = "reboot"; + +$valid_process{2}{process} = [ $state_set{unknown}, $state_set{server_reboot}, $state_set{done} ]; +$valid_process{2}{type} = "reboot"; + +$valid_process{3}{process} = [ $state_set{unknown}, $state_set{server_reboot}, $state_set{run_postbootscript}, $state_set{done} ]; +$valid_process{3}{type} = "reboot"; + +$valid_process{4}{process} = [ $state_set{unknown}, $state_set{server_reboot}, $state_set{loaded_kernel_and_initrd}, $state_set{kernel_and_initrd_got_ip}, $state_set{install_os_packages}, $state_set{run_postscript}, $state_set{server_reboot}, $state_set{run_postbootscript}, $state_set{done} ]; +$valid_process{4}{type} = "deploy"; + +$valid_process{5}{process} = [ $state_set{unknown}, $state_set{loaded_kernel_and_initrd}, $state_set{kernel_and_initrd_got_ip}, $state_set{install_os_packages}, $state_set{run_postscript}, $state_set{server_reboot}, $state_set{run_postbootscript}, $state_set{done} ]; +$valid_process{5}{type} = "deploy"; + +sub reset_state { + my $state_ref = shift; + my $condition = shift; + my $reset_flag = 1; + + if ($$state_ref == $state_set{unknown} && $condition eq "dhcp") { + $$state_ref = $state_set{server_reboot}; + } elsif ($$state_ref == $state_set{unknown} && $condition eq "booted") { + $$state_ref = $state_set{done}; + } elsif (($$state_ref == $state_set{unknown} && $condition eq "tftp") || ($$state_ref == $state_set{unknown} && $condition eq "http")) { + $$state_ref = $state_set{loaded_kernel_and_initrd}; + } elsif ($$state_ref == $state_set{server_reboot} && $condition eq "booted") { + $$state_ref = $state_set{done}; + } elsif ($$state_ref == $state_set{server_reboot} && $condition eq "postscript") { + $$state_ref = $state_set{run_postbootscript}; + } elsif (($$state_ref == $state_set{server_reboot} && $condition eq "tftp") || ($$state_ref == $state_set{server_reboot} && $condition eq "http")) { + $$state_ref = $state_set{loaded_kernel_and_initrd}; + } elsif ($$state_ref == $state_set{loaded_kernel_and_initrd} && $condition eq "dhcp") { + $$state_ref = $state_set{kernel_and_initrd_got_ip}; + } elsif ($$state_ref == $state_set{kernel_and_initrd_got_ip} && $condition eq "http") { + $$state_ref = $state_set{install_os_packages}; + } elsif ($$state_ref == $state_set{install_os_packages} && $condition eq "postscript") { + $$state_ref = $state_set{run_postscript}; + } elsif ($$state_ref == $state_set{run_postscript} && $condition eq "dhcp") { + $$state_ref = $state_set{server_reboot}; + } elsif ($$state_ref == $state_set{run_postbootscript} && $condition eq "booted") { + $$state_ref = $state_set{done}; + } elsif ($$state_ref == $state_set{done} && $condition eq "dhcp") { + $$state_ref = $state_set{server_reboot}; + } elsif ($$state_ref == $state_set{done} && $condition eq "poweron") { + $$state_ref = $state_set{server_reboot}; + } else { + $reset_flag = 0; + } + + return $reset_flag; +} + + +my @candidate_svr_hostname_inlog; +my $svr_hostname_short = `hostname -s`; +chomp($svr_hostname_short); +my $svr_hostname_domain = `hostname -d`; +chomp($svr_hostname_domain); +push(@candidate_svr_hostname_inlog, $svr_hostname_short); +push(@candidate_svr_hostname_inlog, "$svr_hostname_short.$svr_hostname_domain"); + + $::USAGE = "Usage: $program_name -h $program_name -T $program_name -n [-V] + $program_name -n -r [-V] Description: - Do probe for os provision process, realtime monitor of os provision process. - Please run this before rpower node. + Do probe for os provision process. Realtime monitor or replay history of os provision process. + If do realtime monitor, please run this before rpower node. + Unsupport hierarchial structure now. Options: -h : Get usage information of $program_name -T : To verify if $program_name can work, reserve option for probe framework -V : Output more information for debug - -n : The range of monitored node. + -n : The range of node to be monitor or replay log. + -t : The maximum time to wait when doing monitor, the unit is minute, default is 60 minutes. + -r : Replay history log to probe provision. need input a start time when probe should begin from. + Support time format are xxhxxm, xxh, or xxm. h means hour, m means minute. + If there isn't unit input, using hour by default. "; #------------------------------------------ @@ -54,58 +177,74 @@ Options: #------------------------------------------ sub check_noderange { - my $node_range = shift; - my @cmdoutput = `lsdef $node_range -i ip,mac 2>&1`; - my $rst = 0; - my $currentnode = ""; - my $ip = "NOIP"; + my $node_range = shift; + my @cmdoutput = `lsdef $node_range -i ip,mac -c 2>&1`; + my $rst = 0; my %nodecheckrst; - my $mac_line; - my @macs; foreach (@cmdoutput) { chomp($_); $_ =~ s/^\s+|\s+$//g; if ($_ =~ /^Error: Could not find an object named '(\w+)' .+/i) { - $currentnode = $1; - $nodecheckrst{$currentnode}{"error"} = "Could not find node definition"; - $rst = 1; - } elsif ($_ =~ /^\s*Object name: (\w+)/i) { - - # 'rst' is used to check whether the node process finished, 1 is finished. - $monitor_nodes{$1}{"rst"} = 0; - $currentnode = $1; - $ip = "NOIP"; - } elsif ($_ =~ /^ip=(.+)/i) { - if ($1) { - $ip = $1; + $nodecheckrst{$1}{"error"} = "Could not find node definition"; + } elsif ($_ =~ /(\S+):\s+mac=(.*)/i) { + my $node = $1; + my $mac = $2; + if ($mac) { + $nodecheckrst{$node}{"mac"} = $mac; + } else { + $nodecheckrst{$node}{"error"} = "Node $1 doesn't have MAC address"; } - } elsif ($_ =~ /^mac=(.+)/i) { - next unless ($1); - $mac_line = $1; - @macs = split(/\|/, $mac_line); - foreach my $mac (@macs) { - if ($mac =~ /\!\*NOIP\*/) { - $mac =~ s/\!\*NOIP\*//g; - $macmap{$mac}{"ip"} = "NOIP"; - $macmap{$mac}{"node"} = $currentnode; - } - else { - $macmap{$mac}{"ip"} = $ip; - $macmap{$mac}{"node"} = $currentnode; - } + } elsif ($_ =~ /(\S+):\s+ip=(.*)/i) { + my $node = $1; + my $ip = $2; + if ($ip) { + $nodecheckrst{$node}{"ip"} = $ip; } } } foreach my $node (keys %nodecheckrst) { - probe_utils->send_msg("$output", "d", "$node : $nodecheckrst{$node}{error}") if (exists($nodecheckrst{$node}{error})); + if (exists($nodecheckrst{$node}{error})) { + probe_utils->send_msg("$output", "d", "$node : $nodecheckrst{$node}{error}"); + $rst = 1; + next; + } + my $noerror=1; + my @macs = split(/\|/, $nodecheckrst{$node}{"mac"}); + foreach my $mac (@macs) { + + #[NOTE] don't support 2 adapters in the same network now. TODO + + if ($mac =~ /\!\*NOIP\*/) { + $mac =~ s/\!\*NOIP\*//g; + $macmap{$mac}{"ip"} = "NOIP"; + $macmap{$mac}{"node"} = $node; + } else { + $macmap{$mac}{"node"} = $node; + if($nodecheckrst{$node}{"ip"}){ + $macmap{$mac}{"ip"} = $nodecheckrst{$node}{"ip"}; + }else{ + my $nodeip = xCAT::NetworkUtils->getipaddr($node); + if($nodeip){ + $macmap{$mac}{"ip"} = $nodeip; + }else{ + $noerror = 0; + $rst = 1; + probe_utils->send_msg("$output", "f", "$node : can't be resolved to a IP address"); + } + } + } + } + + $monitor_nodes{$node}{"rst"} = 0 if($noerror); } unless (%monitor_nodes) { - probe_utils->send_msg("$output", "d", "There is no node to be monitored"); + probe_utils->send_msg("$output", "d", "There is no valid node to handle"); $rst = 1; } + return $rst; } @@ -116,7 +255,7 @@ sub check_noderange { Handle one line log come from dhcp log file Arguments: msg: one line http log - nics: target network interfaces + installnic: target network interfaces Returns: 0 : pass 1 : failed @@ -132,8 +271,8 @@ sub handle_dhcp_msg { if (exists $macmap{$mac}) { my $node = $macmap{$mac}{"node"}; - my $record = "Receive DHCPDISCOVER from [$node] $mac via $nic"; - probe_utils->send_msg("$output", "d", "$record"); + my $record = "Receive DHCPDISCOVER via $nic"; + probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor); push(@{ $rawdata{$node}{"history"} }, $record); } } elsif ($msg =~ /.+DHCPOFFER\s+on\s+(.+)\s+to\s+(.+)\s+via\s+(.+)/i) { @@ -143,8 +282,8 @@ sub handle_dhcp_msg { if (exists $macmap{$mac}) { my $node = $macmap{$mac}{"node"}; - my $record = "Send DHCPOFFER on $ip back to [$node] $mac via $nic"; - probe_utils->send_msg("$output", "d", "$record"); + my $record = "Send DHCPOFFER on $ip back to $mac via $nic"; + probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor); push(@{ $rawdata{$node}{"history"} }, $record); } @@ -155,16 +294,10 @@ sub handle_dhcp_msg { my $nic = $4; if (exists $macmap{$mac}) { - my $node = $macmap{$mac}{"node"}; - my $record = "Receive DHCPREQUEST from [$node] $mac for $ip via $nic"; - probe_utils->send_msg("$output", "d", "$record"); + my $node = $macmap{$mac}{"node"}; + my $record = "Receive DHCPREQUEST from $mac for $ip via $nic"; + probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor); push(@{ $rawdata{$node}{"history"} }, $record); - - if ($macmap{$mac}{"ip"} != "NOIP" and $macmap{$mac}{"ip"} != $ip) { - my $warn_msg = "The ip of [$node] $mac from DHCP $ip is different with definition $macmap{$mac}{'ip'}."; - probe_utils->send_msg("$output", "w", "$warn_msg"); - push(@{ $rawdata{$node}{"history"} }, $warn_msg); - } } } elsif ($msg =~ /.+DHCPACK\s+on\s+(.+)\s+to\s+(.+)\s+via\s+(.+)/) { my $ip = $1; @@ -173,19 +306,25 @@ sub handle_dhcp_msg { if (exists $macmap{$mac}) { my $node = $macmap{$mac}{"node"}; - my $record = "Send DHCPACK on $ip back to [$node] $mac via $nic"; - probe_utils->send_msg("$output", "d", "$record"); + my $record = "Send DHCPACK on $ip back to $mac via $nic"; + probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor); push(@{ $rawdata{$node}{"history"} }, $record); $ipnodemap{$ip} = $node; + push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "dhcp")); + + if ($macmap{$mac}{"ip"} != "NOIP" and $macmap{$mac}{"ip"} != $ip) { + my $warn_msg = "The ip($ip) assigned to $mac via DHCP is different with the ip($macmap{$mac}{'ip'}) in node definition."; + probe_utils->send_msg("$output", "w", "$warn_msg") if ($monitor); + push(@{ $rawdata{$node}{"history"} }, $warn_msg); + } } } elsif ($msg =~ /.+BOOTREQUEST\s+from\s+(.+)\s+via\s+([^:]+)(.*)/) { my $mac = $1; my $nic = $2; - if (exists $macmap{$mac}) { my $node = $macmap{$mac}{"node"}; - my $record = "Receive BOOTREQUEST from [$node] $mac via $nic"; - probe_utils->send_msg("$output", "d", "$record"); + my $record = "Receive BOOTREQUEST from $mac via $nic"; + probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor); push(@{ $rawdata{$node}{"history"} }, $record); } } elsif ($msg =~ /.+BOOTREPLY\s+for\s+(.+)\s+to\s+.+(\w\w:\w\w:\w\w:\w\w:\w\w:\w\w).+via\s+(.+)/) { @@ -195,14 +334,15 @@ sub handle_dhcp_msg { if (exists $macmap{$mac}) { my $node = $macmap{$mac}{"node"}; - my $record = "Send BOOTREPLY on $ip back to [$node] $mac via $nic"; - probe_utils->send_msg("$output", "d", "$record"); + my $record = "Send BOOTREPLY on $ip back to $mac via $nic"; + probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor); push(@{ $rawdata{$node}{"history"} }, $record); $ipnodemap{$ip} = $node; + push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "dhcp")); if ($macmap{$mac}{"ip"} != "NOIP" and $macmap{$mac}{"ip"} != $ip) { - my $warn_msg = "The ip of [$node] $mac from DHCP $ip is different with definition $macmap{$mac}{'ip'}."; - probe_utils->send_msg("$output", "w", "$warn_msg"); + my $warn_msg = "The ip($ip) assigned to $mac via DHCP is different with the ip($macmap{$mac}{'ip'}) in node definition."; + probe_utils->send_msg("$output", "w", "$warn_msg") if ($monitor); push(@{ $rawdata{$node}{"history"} }, $warn_msg); } } @@ -230,11 +370,11 @@ sub handle_tftp_msg { if ($msg =~ /RRQ\s+from\s+(.+)\s+filename\s+(.+)/i) { my $ip = $1; my $file = $2; - my $record = "[$ipnodemap{$ip}] Via TFTP $ip download $file"; - + my $record = "Via TFTP download $file"; if (exists($rawdata{"$ipnodemap{$ip}"})) { - probe_utils->send_msg("$output", "d", "$record"); + probe_utils->send_msg("$output", "d", "[$ipnodemap{$ip}] $record") if ($monitor); push(@{ $rawdata{ $ipnodemap{$ip} }{"history"} }, $record); + push(@{ $rawdata{ $ipnodemap{$ip} }{statehistory} }, $rawdata{ $ipnodemap{$ip} }{state}) if (reset_state(\$rawdata{ $ipnodemap{$ip} }{state}, "tftp")); } } } @@ -257,11 +397,12 @@ sub handle_http_msg { if ($msg =~ /(\d+\.\d+.\d+.\d+)\s.+GET\s+(.+)\s+HTTP.+/) { my $ip = $1; my $file = $2; - my $record = "[$ipnodemap{$ip}] Via HTTP $ip GET $file"; + my $record = "Via HTTP get $file"; if (exists($rawdata{"$ipnodemap{$ip}"})) { - probe_utils->send_msg("$output", "d", "$record"); + probe_utils->send_msg("$output", "d", "[$ipnodemap{$ip}] $record") if ($monitor); push(@{ $rawdata{ $ipnodemap{$ip} }{"history"} }, $record); + push(@{ $rawdata{ $ipnodemap{$ip} }{statehistory} }, $rawdata{ $ipnodemap{$ip} }{state}) if (reset_state(\$rawdata{ $ipnodemap{$ip} }{state}, "http")); } } return 0; @@ -286,37 +427,29 @@ sub handle_cluster_msg { my $msg; my $status; - if ($line =~ /.+\s+xcat:\s+(.+)\s+status:\s+(.+)\s+statustime:\s(.+)/) { - $node = $1; - $status = $2; + my @splitline = split(/\s+/, $line); + if (($splitline[4] =~ /^xcat/i) || ($splitline[5] =~ /^xcat/i)) { - if (exists($rawdata{$node})) { - my $record = "Receive from $node : status is $status"; - probe_utils->send_msg("$output", "d", "$record"); - push(@{ $rawdata{$node}{"history"} }, $record); - } + #log like: Aug 7 22:30:31 c910f02c01p09 xcat: c910f02c04p04 status: booted statustime: 08-07-2016 22:30:31 + if (($splitline[6] =~ /^status:$/i) && ($splitline[8] =~ /^statustime:$/)) { + $node = $splitline[5]; + $status = $splitline[7]; + if (exists($rawdata{$node})) { + my $record = "Node status is changed to $status"; + probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor); + push(@{ $rawdata{$node}{"history"} }, $record); - # When receive 'status is booted', check whether the $monitor_nodes{$node}{"status"} is installing. - # If so, the node has finished its os provision. - if (exists($rawdata{$node}) and ($status eq "booted")) { - if ($monitor_nodes{$node}{"status"} eq "installing") { - $record = "Node $node has finished it's os provision process"; - probe_utils->send_msg("$output", "o", "$record"); - push(@{ $rawdata{$node}{"history"} }, $record); - } else { - $record = "NO installing process detected for node $node"; - probe_utils->send_msg("$output", "f", "$record"); - push(@{ $rawdata{$node}{"history"} }, $record); + #one node finish deployment + if ($status eq "booted") { + $monitor_nodes{$node}{"rst"} = 1 if (defined($monitor_nodes{$node})); + push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "booted")); + } + + if ($status eq "powering-on") { + push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "poweron")); + } } - $monitor_nodes{$node}{"rst"} = 1 if (defined($monitor_nodes{$node})); - } elsif (exists($rawdata{$node}) and ($status eq "failed")) { - $monitor_nodes{$node}{"rst"} = 1 if (defined($monitor_nodes{$node})); - probe_utils->send_msg("$output", "f", "Node $node has finished it's os provision process"); - push(@{ $rawdata{$node}{"history"} }, "Node $node os provision failed"); - } elsif (exists($rawdata{$node}) and ($status eq "installing")) { - # record 'installing' status, to check when receive 'booted' status - $monitor_nodes{$node}{"status"} = "installing"; } } return 0; @@ -350,10 +483,15 @@ sub handle_compute_msg { } else { $node = $ipnodemap{$sender}; } + if ($node ne "" && exists($rawdata{$node})) { - my $record = "Receive from $node : $msg"; - probe_utils->send_msg("$output", "d", "$record"); - push(@{ $rawdata{$node}{"history"} }, $record); + probe_utils->send_msg("$output", "d", "[$node] $msg") if ($monitor); + push(@{ $rawdata{$node}{"history"} }, $msg); + + #node start to run postscript or postbootscript + if ($msg =~ /Running postscript/) { + push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "postscript")); + } } } @@ -407,30 +545,113 @@ sub dump_history { print "$title\n"; foreach $node (keys %rawdata) { - my $line_num = 0; - my $http_num = 0; - my $length_http; - for (my $i = @{ $rawdata{$node}{"history"} } ; $i >= 0 ; $i--) { - if (${ $rawdata{$node}{"history"} }[$i] =~ /Via HTTP/) { - $length_http = $i; - last; + + if ($verbose) { + print "[$node]\n"; + + my $httphit = 0; + my @httptmp; + foreach my $line (@{ $rawdata{$node}{"history"} }) { + if ($line =~ /Via HTTP/) { + if ($httphit) { + shift @httptmp if ($#httptmp > 0); + push @httptmp, $line; + } else { + print "\t$line\n"; + $httphit = 1; + } + } else { + if ($#httptmp > -1) { + print "\tVia HTTP ..........\n"; + print "\t$_\n" foreach (@httptmp); + } + @httptmp = (); + $httphit = 0; + print "\t$line\n"; + } } } - foreach my $line (@{ $rawdata{$node}{"history"} }) { - # Print http message less than 10 lines - if ($line =~ /Via HTTP/) - { - if (($http_num <= 4) or ($length_http - $line_num <= 4)) { - probe_utils->send_msg("$output", "d", "\t$line"); + my @tmpnodestatehistory = @{ $rawdata{$node}{statehistory} }; + + #print "state history = @tmpnodestatehistory\n"; + + my %match_result; + my $procidx = 0; + my $newloop = 0; + my $notfirstloop = 0; + while (@tmpnodestatehistory) { + undef %match_result if ($notfirstloop); + $newloop = 0; + foreach my $type (keys %valid_process) { + if ($notfirstloop) { + $procidx = 1; } else { - probe_utils->send_msg("$output", "d", "\t......") if ($http_num == 5); + $procidx = 0; } - $http_num++; - } else { - probe_utils->send_msg("$output", "d", "\t$line"); + my $proclen = scalar(@{ $valid_process{$type}{process} }); + my $i; + for ($i = 0 ; $i < scalar(@tmpnodestatehistory) ; $i++) { + if ($procidx < $proclen) { + if ($tmpnodestatehistory[$i] == $valid_process{$type}{process}[$procidx]) { + ++$procidx; + } else { + --$procidx if (notfirstloop); + push @{ $match_result{$procidx} }, $type; + last; + } + } else { + splice(@tmpnodestatehistory, 0, $i); + $newloop = 1; + $notfirstloop = 1; + last; + } + } + if ($i == scalar(@tmpnodestatehistory)) { + push @{ $match_result{$procidx} }, $type; + next; + } + last if ($newloop); + } + last if (!$newloop); + } + + #print "------------result---------------\n"; + #print Dumper %match_result; + + my $max_match = 0; + foreach my $key (keys %match_result) { + $max_match = $key if ($key > $max_match); + } + + my $formatprefix; + if ($max_match == 0) { + my $statelist = ""; + for (my $i = 0 ; $i < scalar(@{ $rawdata{$node}{statehistory} }) ; $i++) { + $statelist .= "$state_set_reverse{$rawdata{$node}{statehistory}[$i]} "; + } + probe_utils->send_msg("$output", "f", "[$node] deployment failed"); + probe_utils->send_msg("$output", "d", "\t$node did unknown process, state change history is $statelist"); + } elsif (scalar(@{ $match_result{$max_match} }) > 1) { + probe_utils->send_msg("$output", "f", "[$node] deployment failed"); + probe_utils->send_msg("$output", "d", "\tThere are more than one possible process satisfy $node situation"); + foreach my $proc (@{ $match_result{$max_match} }) { + if ($valid_process{$proc}{type} eq "deploy") { + probe_utils->send_msg("$output", "d", "\tpossible process \"deploy\", pass $state_set_reverse{$valid_process{$proc}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$proc}{process}[$max_match]} stage"); + } elsif ($valid_process{$proc}{type} eq "reboot") { + probe_utils->send_msg("$output", "d", "\tpossible process \"reboot\", pass $state_set_reverse{$valid_process{$proc}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$proc}{process}[$max_match]} stage"); + } + } + } else { + if (($valid_process{ $match_result{$max_match}[0] }{type} eq "deploy") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] == $state_set{done})) { + probe_utils->send_msg("$output", "o", "[$node] deployment completed"); + } elsif (($valid_process{ $match_result{$max_match}[0] }{type} eq "deploy") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] != $state_set{done})) { + probe_utils->send_msg("$output", "f", "[$node] deployment failed, pass $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match]} stage") + } elsif (($valid_process{ $match_result{$max_match}[0] }{type} eq "reboot") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] == $state_set{done})) { + probe_utils->send_msg("$output", "f", "[$node] reboot completed, without deployment process"); + } elsif (($valid_process{ $match_result{$max_match}[0] }{type} eq "reboot") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] != $state_set{done})) { + probe_utils->send_msg("$output", "f", "[$node] reboot failed,without deployment process, stop at $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match]} stage"); } - $line_num++; } } } @@ -451,29 +672,6 @@ sub do_monitor { $terminal = 1; }; - my $msg = "All pre_defined nodes are valid"; - my $rc = check_noderange($noderange); - if ($rc) { - probe_utils->send_msg("$output", "f", $msg); - $rst = 1; - unless (%monitor_nodes) { - return $rst; - } - } else { - probe_utils->send_msg("$output", "o", $msg); - } - - if (!$nics) { - my $masteripinsite = `tabdump site | awk -F',' '/^"master",/ { gsub(/"/, "", \$2) ; print \$2 }'`; - chomp($masteripinsite); - $nics = `ip addr |grep -B2 $masteripinsite|awk -F" " '/mtu/{gsub(/:/,"",\$2); print \$2}'`; - chomp($nics); - if (!$nics) { - probe_utils->send_msg("$output", "f", "The value of master in site table is $masteripinsite, can't get corresponding network interface"); - return 1; - } - } - my $rst = 0; my $startline = "------------------------------------------------------------- @@ -514,6 +712,7 @@ Start to capture every message during os provision process...... if (!-e "$clusterlog") { probe_utils->send_msg("$output", "w", "$clusterlog doesn't exist"); + probe_utils->send_msg("$output", "i", "If sles11 or xCAT2.11.x is using, please ignore above warning"); } else { if (!($clusterpid = open(CLUSTERLOGFILE, "tail -f -n 0 $clusterlog 2>&1 |"))) { probe_utils->send_msg("$output", "f", "Can't open $clusterlog to get logs"); @@ -532,6 +731,7 @@ Start to capture every message during os provision process...... } if (!-e "$computelog") { probe_utils->send_msg("$output", "w", "$computelog doesn't exist"); + probe_utils->send_msg("$output", "i", "If sles11 or xCAT2.11.x is using, please ignore above warning"); } else { if (!($computerpid = open(COMPUTERFILE, "tail -f -n 0 $computelog 2>&1 |"))) { probe_utils->send_msg("$output", "f", "Can't open $computelog to get logs"); @@ -551,21 +751,15 @@ Start to capture every message during os provision process...... my @hdls; my $hdl; - my @candidate_svr_hostname_inlog; - my $svr_hostname_short = `hostname -s`; - chomp($svr_hostname_short); - my $svr_hostname_domain = `hostname -d`; - chomp($svr_hostname_domain); - push(@candidate_svr_hostname_inlog, $svr_hostname_short); - push(@candidate_svr_hostname_inlog, "$svr_hostname_short.$svr_hostname_domain"); - + my $starttime = time(); + $monitor = 1; for (; ;) { if (@hdls = $select->can_read(0)) { foreach $hdl (@hdls) { if ($hdl == \*VARLOGMSGFILE) { chomp($line = ); my @tmp = split(/\s+/, $line); - if ($tmp[4] =~ /dhcpd:/i && $line =~ /$nics/) { + if ($tmp[4] =~ /dhcpd:/i && $line =~ /$installnic/) { handle_dhcp_msg("$line"); } elsif ($tmp[4] =~ /in.tftpd/i) { handle_tftp_msg("$line"); @@ -596,7 +790,13 @@ Start to capture every message during os provision process...... probe_utils->send_msg("$output", "o", "All nodes need to monitor have finished os provision process"); } last; - } sleep 0.01; + } + + if (time() - $starttime > ($maxwaittime * 60)) { + probe_utils->send_msg("$output", "i", "$maxwaittime minutes are expired, stop monitor"); + last; + } + sleep 0.01; } &dump_history; } @@ -612,6 +812,177 @@ Start to capture every message during os provision process...... return $rst; } +sub get_valid_logs { + my $ref_timestamp = shift; + my $year = shift; + my $epoch_seconds_of_now = shift; + my $bthistory_ref = shift; + my $nics = "eth0"; + my @orglogfilelist = ("/var/log/xcat/cluster.log", + "/var/log/messages", + "/var/log/xcat/computes.log"); + my $httplog; + if (-e "/var/log/httpd/access_log") { + $httplog = "/var/log/httpd/access_log"; + } elsif (-e "/var/log/apache2/access_log") { + $httplog = "/var/log/apache2/access_log"; + } elsif (-e "/var/log/apache2/access.log") { + $httplog = "/var/log/apache2/access.log"; + } + push @orglogfilelist, $httplog; + + foreach my $f (@orglogfilelist) { + my $filename = basename("$f"); + $filename =~ s/(.+)\.(.+)/$1/g; + my $path_only = dirname("$f"); + my @rotatefiles; + + #TODO using opendir to refine below code + my @alltargetfiles = `ls -lt $path_only |awk -F" " '/ $filename/ {print \$9}'`; + foreach my $samenamefile (@alltargetfiles) { + chomp($samenamefile); + push @rotatefiles, "$path_only/$samenamefile"; + } + + my $ishttplog = 0; + $ishttplog = 1 if ($filename =~ /access[\._]log/); + + foreach my $file (@rotatefiles) { + my $fd; + my $filetype = `file $file 2>&1`; + chomp($filetype); + if ($filetype =~ /ASCII text/) { + if (!open($fd, "$file")) { + print "open $files failed\n"; + next; + } + } else { + + #TODO handle compression files + } + + #print "--->load $file\n"; + my $line; + my $historynum = 0; + last unless ($line = <$fd>); + chomp($line); + my $needrotate = 0; + my $logindex = 0; + my @splitline = split(/\s+/, $line); + my $timestamp; + my $timestampepoch; + + if ($ishttplog) { + $splitline[3] =~ s/^\[(.+)/$1/g; + $timestampepoch = probe_utils->convert_to_epoch_seconds($splitline[3]); + } else { + $timestamp = join(" ", @splitline[ 0 .. 2 ]); + $timestampepoch = probe_utils->convert_to_epoch_seconds($timestamp, $year, $epoch_seconds_of_now); + } + if ($ref_timestamp <= $timestampepoch) { + $needrotate = 1; + } else { + seek($fd, 0, 2); + my $tail = tell; + my $head = 0; + my $lasttail = $tail; + my $i = 0; + while ($head <= $tail) { + my $middle = int(($tail - $head) / 2) + $head; + seek($fd, $middle, 0); + $line = <$fd>; + $middle += length($line); + last unless ($line = <$fd>); + @splitline = split(/\s+/, $line); + if ($ishttplog) { + $splitline[3] =~ s/^\[(.+)/$1/g; + $timestampepoch = probe_utils->convert_to_epoch_seconds($splitline[3]); + } else { + $timestamp = join(" ", @splitline[ 0 .. 2 ]); + $timestampepoch = probe_utils->convert_to_epoch_seconds($timestamp, $year, $epoch_seconds_of_now); + } if ($ref_timestamp == $timestampepoch) { + $historynum = $middle; + last; + } elsif ($ref_timestamp < $timestampepoch) { + $tail = $middle; + last if ($tail == $lasttail); + $lasttail = $tail; + } else { + $head = $middle; + } + } + $historynum = $head unless ($historynum); + } + + seek($fd, $historynum, 0); + while (<$fd>) { + chomp; + @splitline = split(/\s+/, $_); + if ($ishttplog) { + $splitline[3] =~ s/^\[(.+)/$1/g; + $timestampepoch = probe_utils->convert_to_epoch_seconds($splitline[3]); + } else { + $timestamp = join(" ", @splitline[ 0 .. 2 ]); + $timestampepoch = probe_utils->convert_to_epoch_seconds($timestamp, $year, $epoch_seconds_of_now); + } + + if (($splitline[4] =~ /dhcpd:/i && $_ =~ /$nics/) + || ($splitline[4] =~ /in.tftpd/i) + || (($splitline[4] =~ /^xcat/i) || ($splitline[5] =~ /^xcat/i)) + || ($splitline[5] =~ /GET/ && $splitline[7] =~ /HTTP/)) { + my $log = "$timestampepoch $filename$logindex $_"; + $logindex++; + push @$bthistory_ref, $log; + } + } + close($fd); + last unless ($needrotate); + } + } + + #sort logs depending on time + my @sort_ht = sort(@$bthistory_ref); + for ($i = 0 ; $i <= $#sort_ht ; $i++) { + $sort_ht[$i] =~ s/^(\d+) (\S+) (.+)/$1 $3/g; + } + + #delete duplicate logs + my %count; + @$bthistory_ref = grep { ++$count{$_} < 2; } @sort_ht; +} + +sub do_replay { + my $ref_timestamp = shift; + + my $timestr = scalar(localtime($ref_timestamp)); + print "Start to search logs after '$timestr', please waiting for a while.............\n"; + + my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time()); + my $epoch_seconds_of_now = time(); + my @bthistory; + get_valid_logs($ref_timestamp, $year, $epoch_seconds_of_now, \@bthistory); + + foreach my $line (@bthistory) { + $line =~ s/(\d+) (.+)/$2/g; + my @tmp = split(/\s+/, $line); + if ($tmp[4] =~ /dhcpd:/i && $line =~ /$nics/) { + handle_dhcp_msg("$line"); + } elsif ($tmp[4] =~ /in.tftpd/i) { + handle_tftp_msg("$line"); + } elsif (($tmp[4] =~ /^xcat/i) || ($tmp[5] =~ /^xcat/i)) { + if (grep(/^$tmp[3]$/, @candidate_svr_hostname_inlog)) { + handle_cluster_msg("$line"); + } else { + handle_compute_msg("$line"); + } + } elsif ($tmp[5] =~ /GET/ && $tmp[7] =~ /HTTP/) { + handle_http_msg("$line"); + } + } + &dump_history; + return 0; +} + #------------------------------------- # main process #------------------------------------- @@ -619,6 +990,8 @@ if ( !GetOptions("--help|h|?" => \$help, "T" => \$test, "V" => \$verbose, + "t" => \$maxwaittime, + "r=s" => \$replaylog, "n=s" => \$noderange)) { probe_utils->send_msg("$output", "f", "Invalid parameter for $program_name"); @@ -641,11 +1014,60 @@ if ($test) { } unless ($noderange) { - probe_utils->send_msg("$output", "f", "Option -n is required"); + probe_utils->send_msg("$output", "f", "A noderange is needed"); probe_utils->send_msg("$output", "d", "$::USAGE"); exit 1; } +my $epoch_starttime = time(); +if ($replaylog) { + if ($replaylog =~ /(\d+)h(\d+)m/i) { + $epoch_starttime -= ($1 * 3600 + $2 * 60) + } elsif ($replaylog =~ /^(\d+)h*$/i) { + $epoch_starttime -= $1 * 3600; + } elsif ($replaylog =~ /^(\d+)m$/) { + $epoch_starttime -= $1 * 60; + } else { + probe_utils->send_msg("$output", "f", "Unsupport time format for replay history log"); + print "$::USAGE"; + exit 1; + } +} + +my $msg = "All pre_defined nodes are valid"; +my $rc = check_noderange($noderange); +if ($rc) { + probe_utils->send_msg("$output", "f", $msg); + $rst = 1; +} else { + probe_utils->send_msg("$output", "o", $msg); +} + +unless ($installnic) { + my $masteripinsite = `tabdump site | awk -F',' '/^"master",/ { gsub(/"/, "", \$2) ; print \$2 }'`; + chomp($masteripinsite); + $installnic = `ip addr |grep -B2 $masteripinsite|awk -F" " '/mtu/{gsub(/:/,"",\$2); print \$2}'`; + chomp($installnic); + if (!$installnic) { + probe_utils->send_msg("$output", "f", "The value of master in site table is $masteripinsite, can't get corresponding network interface"); + $rst = 1; + } else { + probe_utils->send_msg("$output", "i", "The installation network interface is $installnic"); + } +} + +exit $rst if ($rst); + +foreach my $node (keys %monitor_nodes) { + $rawdata{$node}{state} = $state_set{unknown}; + push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}); +} + +if ($replaylog) { + $rst = do_replay($epoch_starttime); + exit $rst; +} + $rst = do_monitor(); exit $rst;