xcat-core/xCAT-probe/subcmds/osdeploy

#! /usr/bin/perl
# IBM(c) 2016 EPL license http://www.eclipse.org/legal/epl-v10.html

BEGIN { $::XCATROOT = $ENV{'XCATROOT'} ? $ENV{'XCATROOT'} : -d '/opt/xcat' ? '/opt/xcat' : '/usr'; }

use lib "$::XCATROOT/probe/lib/perl";

use probe_utils;
use xCAT::NetworkUtils;
use File::Basename;
use IO::Select;
use Time::Local;
use Data::Dumper;
use Getopt::Long qw(:config no_ignore_case);

my $program_name = basename("$0");
my $help;
my $test;
my $maxwaittime = 60;         #unit is minute
my $output      = "stdout";
my $verbose     = 0;
my $rst         = 0;
my $noderange;
my $terminal = 0;
my $installnic;
my $monitor = 0;
my $replaylog;                ##used by feature replay deploymen log

my %rawdata;

#-%rawdata structure-------
# $rawdata{nodename}{"history"}       #array, the log history of current node
# $rawdata{nodename}{"state"}         #the latest status of current node, used for State Machine
# $rawdata{nodename}{"statehistory"}  #array, the history status of current node, used for State Machine
#--------------------------

my %macmap;

#-%macmap structure-------
# $macmap{mac_addr}{"ip"}="x.x.x.x"
# $macmap{mac_addr}{"node"}="nodename"
#-------------------------

my %ipnodemap;

#-%ipnodemap structure-------
# $ipnodemap{ip_addr}="nodename"
#---------------------------

my %monitor_nodes;

#- %monitor_nodes structure-------
# $monitor_nodes{nodename}{"status"}  #useless now
# $monitor_nodes{nodename}{"rst"}
#---------------------------------

# provision state machine
my %state_set = (
    "unknown"                  => 0,
    "server_reboot"             => 1,
    "loaded_kernel_and_initrd" => 2,
    "kernel_and_initrd_got_ip" => 3,
    "install_os_packages"      => 4,
    "run_postscript"           => 5,
    "run_postbootscript"       => 6,
    "done"                     => 7,
);

my %state_set_reverse = (
    "0" => "unknown",
    "1" => "server_reboot",
    "2" => "loaded_kernel_and_initrd",
    "3" => "kernel_and_initrd_got_ip",
    "4" => "install_os_packages",
    "5" => "run_postscript",
    "6" => "run_postbootscript",
    "7" => "done",
);

my %valid_process;
$valid_process{1}{process} = [ $state_set{unknown}, $state_set{done} ];
$valid_process{1}{type} = "reboot";

$valid_process{2}{process} = [ $state_set{unknown}, $state_set{server_reboot}, $state_set{done} ];
$valid_process{2}{type} = "reboot";

$valid_process{3}{process} = [ $state_set{unknown}, $state_set{server_reboot}, $state_set{run_postbootscript}, $state_set{done} ];
$valid_process{3}{type} = "reboot";

$valid_process{4}{process} = [ $state_set{unknown}, $state_set{server_reboot}, $state_set{loaded_kernel_and_initrd}, $state_set{kernel_and_initrd_got_ip}, $state_set{install_os_packages}, $state_set{run_postscript}, $state_set{server_reboot}, $state_set{run_postbootscript}, $state_set{done} ];
$valid_process{4}{type} = "deploy";

$valid_process{5}{process} = [ $state_set{unknown}, $state_set{loaded_kernel_and_initrd}, $state_set{kernel_and_initrd_got_ip}, $state_set{install_os_packages}, $state_set{run_postscript}, $state_set{server_reboot}, $state_set{run_postbootscript}, $state_set{done} ];
$valid_process{5}{type} = "deploy";

sub reset_state {
    my $state_ref  = shift;
    my $condition  = shift;
    my $reset_flag = 1;

    if ($$state_ref == $state_set{unknown} && $condition eq "dhcp") {
        $$state_ref = $state_set{server_reboot};
    } elsif ($$state_ref == $state_set{unknown} && $condition eq "booted") {
        $$state_ref = $state_set{done};
    } elsif (($$state_ref == $state_set{unknown} && $condition eq "tftp") || ($$state_ref == $state_set{unknown} && $condition eq "http")) {
        $$state_ref = $state_set{loaded_kernel_and_initrd};
    } elsif ($$state_ref == $state_set{server_reboot} && $condition eq "booted") {
        $$state_ref = $state_set{done};
    } elsif ($$state_ref == $state_set{server_reboot} && $condition eq "postscript") {
        $$state_ref = $state_set{run_postbootscript};
    } elsif (($$state_ref == $state_set{server_reboot} && $condition eq "tftp") || ($$state_ref == $state_set{server_reboot} && $condition eq "http")) {
        $$state_ref = $state_set{loaded_kernel_and_initrd};
    } elsif ($$state_ref == $state_set{loaded_kernel_and_initrd} && $condition eq "dhcp") {
        $$state_ref = $state_set{kernel_and_initrd_got_ip};
    } elsif ($$state_ref == $state_set{kernel_and_initrd_got_ip} && $condition eq "http") {
        $$state_ref = $state_set{install_os_packages};
    } elsif ($$state_ref == $state_set{install_os_packages} && $condition eq "postscript") {
        $$state_ref = $state_set{run_postscript};
    } elsif ($$state_ref == $state_set{run_postscript} && $condition eq "dhcp") {
        $$state_ref = $state_set{server_reboot};
    } elsif ($$state_ref == $state_set{run_postbootscript} && $condition eq "booted") {
        $$state_ref = $state_set{done};
    } elsif ($$state_ref == $state_set{done} && $condition eq "dhcp") {
        $$state_ref = $state_set{server_reboot};
    } elsif ($$state_ref == $state_set{done} && $condition eq "poweron") {
        $$state_ref = $state_set{server_reboot};
    } else {
        $reset_flag = 0;
    }

    return $reset_flag;
}


my @candidate_svr_hostname_inlog;
my $svr_hostname_short = `hostname -s`;
chomp($svr_hostname_short);
my $svr_hostname_domain = `hostname -d`;
chomp($svr_hostname_domain);
push(@candidate_svr_hostname_inlog, $svr_hostname_short);
push(@candidate_svr_hostname_inlog, "$svr_hostname_short.$svr_hostname_domain");


$::USAGE = "Usage:
    $program_name -h
    $program_name -T
    $program_name -n <node_range> [-V]
    $program_name -n <node_range> -r <xxhxxm> [-V]

Description:
    Probe for OS provision process. Realtime monitor or replay history of OS provision process.
    If realtime monitor, run this before 'rpower' node.
    Currently, hierarchial structure is not supported.

Options:
    -h : Get usage information of $program_name
    -T : Verify if $program_name can work, reserved option for probe framework
    -V : Output more information for debug
    -n : The range of nodes for monitor or replay log.
    -t : The maximum time in minutes to wait when doing monitor, default is 60.
    -r : Replay history log for probe provisioniong. Input a start time when probe should begin.
         Supported time formats are xxhxxm, xxh, or xxm. If units not specified, hour will be used by default.
";

#------------------------------------------

=head3
    Description:
        Check if all nodes are valid
    Arguments:
        node_range: node range
    Returns:
        0 : pass
        1 : failed
=cut

#------------------------------------------
sub check_noderange {
    my $node_range = shift;
    my @cmdoutput  = `lsdef $node_range -i ip,mac -c 2>&1`;
    my $rst        = 0;
    my %nodecheckrst;

    foreach (@cmdoutput) {
        chomp($_);
        $_ =~ s/^\s+|\s+$//g;
        if ($_ =~ /^Error: Could not find an object named '(\w+)' .+/i) {
            $nodecheckrst{$1}{"error"} = "Could not find node definition";
        } elsif ($_ =~ /(\S+):\s+mac=(.*)/i) {
            my $node = $1;
            my $mac  = $2;
            if ($mac) {
                $nodecheckrst{$node}{"mac"} = $mac;
            } else {
                $nodecheckrst{$node}{"error"} = "Node $1 doesn't have MAC address";
            }
        } elsif ($_ =~ /(\S+):\s+ip=(.*)/i) {
            my $node = $1;
            my $ip   = $2;
            if ($ip) {
                $nodecheckrst{$node}{"ip"} = $ip;
            }
        }
    }

    foreach my $node (keys %nodecheckrst) {
        if (exists($nodecheckrst{$node}{error})) {
            probe_utils->send_msg("$output", "d", "$node : $nodecheckrst{$node}{error}");
            $rst = 1;
            next;
        }
        my $noerror=1;
        my @macs = split(/\|/, $nodecheckrst{$node}{"mac"});
        foreach my $mac (@macs) {

            #[NOTE] don't support 2 adapters in the same network now. TODO

            if ($mac =~ /\!\*NOIP\*/) {
                $mac =~ s/\!\*NOIP\*//g;
                $macmap{$mac}{"ip"}   = "NOIP";
                $macmap{$mac}{"node"} = $node;
            } else {
                $macmap{$mac}{"node"} = $node;
                if($nodecheckrst{$node}{"ip"}){
                    $macmap{$mac}{"ip"} = $nodecheckrst{$node}{"ip"};
                }else{
                    my $nodeip = xCAT::NetworkUtils->getipaddr($node);
                    if($nodeip){
                        $macmap{$mac}{"ip"} = $nodeip;
                    }else{
                        $noerror = 0;
                        $rst = 1;
                        probe_utils->send_msg("$output", "f", "$node : can't be resolved to an IP address");
                    }
                }
            }
        }

        $monitor_nodes{$node}{"rst"} = 0 if($noerror);
    }

    unless (%monitor_nodes) {
        probe_utils->send_msg("$output", "d", "There is no valid node to handle");
        $rst = 1;
    }

    return $rst;
}

#------------------------------------------

=head3
    Description:
       Handle one line log come from dhcp log file
    Arguments:
       msg: one line http log
       installnic: target network interfaces
    Returns:
        0 : pass
        1 : failed
=cut

#------------------------------------------
sub handle_dhcp_msg {
    my $msg = shift;

    if ($msg =~ /.+DHCPDISCOVER\s+from\s+(.+)\s+via\s+([^:]+)(.*)/i) {
        my $mac = $1;
        my $nic = $2;

        if (exists $macmap{$mac}) {
            my $node   = $macmap{$mac}{"node"};
            my $record = "Receive DHCPDISCOVER via $nic";
            probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
            push(@{ $rawdata{$node}{"history"} }, $record);
        }
    } elsif ($msg =~ /.+DHCPOFFER\s+on\s+(.+)\s+to\s+(.+)\s+via\s+(.+)/i) {
        my $ip  = $1;
        my $mac = $2;
        my $nic = $3;

        if (exists $macmap{$mac}) {
            my $node   = $macmap{$mac}{"node"};
            my $record = "Send DHCPOFFER on $ip back to $mac via $nic";
            probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
            push(@{ $rawdata{$node}{"history"} }, $record);
        }

    } elsif ($msg !~ /unknown lease/ && $msg !~ /ignored/ && $msg =~ /.+DHCPREQUEST\s+for\s+(.+)\s\((.+)\)\s+from\s+(.+)\s+via\s+(.+)/) {
        my $ip     = $1;
        my $server = $2;
        my $mac    = $3;
        my $nic    = $4;

        if (exists $macmap{$mac}) {
            my $node   = $macmap{$mac}{"node"};
            my $record = "Receive DHCPREQUEST from $mac for $ip via $nic";
            probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
            push(@{ $rawdata{$node}{"history"} }, $record);
        }
    } elsif ($msg =~ /.+DHCPACK\s+on\s+(.+)\s+to\s+(.+)\s+via\s+(.+)/) {
        my $ip  = $1;
        my $mac = $2;
        my $nic = $3;

        if (exists $macmap{$mac}) {
            my $node   = $macmap{$mac}{"node"};
            my $record = "Send DHCPACK on $ip back to $mac via $nic";
            probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
            push(@{ $rawdata{$node}{"history"} }, $record);
            $ipnodemap{$ip} = $node;
            push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "dhcp"));

            if ($macmap{$mac}{"ip"} != "NOIP" and $macmap{$mac}{"ip"} != $ip) {
                my $warn_msg = "The ip($ip) assigned to $mac via DHCP is different from the ip($macmap{$mac}{'ip'}) in node definition.";
                probe_utils->send_msg("$output", "w", "$warn_msg") if ($monitor);
                push(@{ $rawdata{$node}{"history"} }, $warn_msg);
            }
        }
    } elsif ($msg =~ /.+BOOTREQUEST\s+from\s+(.+)\s+via\s+([^:]+)(.*)/) {
        my $mac = $1;
        my $nic = $2;
        if (exists $macmap{$mac}) {
            my $node   = $macmap{$mac}{"node"};
            my $record = "Receive BOOTREQUEST from $mac via $nic";
            probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
            push(@{ $rawdata{$node}{"history"} }, $record);
        }
    } elsif ($msg =~ /.+BOOTREPLY\s+for\s+(.+)\s+to\s+.+(\w\w:\w\w:\w\w:\w\w:\w\w:\w\w).+via\s+(.+)/) {
        my $ip  = $1;
        my $mac = $2;
        my $nic = $3;

        if (exists $macmap{$mac}) {
            my $node   = $macmap{$mac}{"node"};
            my $record = "Send BOOTREPLY on $ip back to $mac via $nic";
            probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
            push(@{ $rawdata{$node}{"history"} }, $record);
            $ipnodemap{$ip} = $node;
            push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "dhcp"));

            if ($macmap{$mac}{"ip"} != "NOIP" and $macmap{$mac}{"ip"} != $ip) {
                my $warn_msg = "The ip($ip) assigned to $mac via DHCP is different from the ip($macmap{$mac}{'ip'}) in node definition.";
                probe_utils->send_msg("$output", "w", "$warn_msg") if ($monitor);
                push(@{ $rawdata{$node}{"history"} }, $warn_msg);
            }
        }
    }

    return 0;
}

#------------------------------------------

=head3
    Description:
       Handle one line log come from tftp log file
    Arguments:
       msg: one line tftp log
    Returns:
        0 : pass
        1 : failed
=cut

#------------------------------------------
sub handle_tftp_msg {
    my $msg = shift;

    if ($msg =~ /RRQ\s+from\s+(.+)\s+filename\s+(.+)/i) {
        my $ip     = $1;
        my $file   = $2;
        my $record = "Via TFTP download $file";
        if (exists($rawdata{"$ipnodemap{$ip}"})) {
            probe_utils->send_msg("$output", "d", "[$ipnodemap{$ip}] $record") if ($monitor);
            push(@{ $rawdata{ $ipnodemap{$ip} }{"history"} }, $record);
            push(@{ $rawdata{ $ipnodemap{$ip} }{statehistory} }, $rawdata{ $ipnodemap{$ip} }{state}) if (reset_state(\$rawdata{ $ipnodemap{$ip} }{state}, "tftp"));
        }
    }
}

#------------------------------------------

=head3
    Description:
       Handle one line log come from http log file
    Arguments:
       msg: one line http log
    Returns:
        0 : pass
        1 : failed
=cut

#------------------------------------------
sub handle_http_msg {
    my $msg = shift;
    if ($msg =~ /(\d+\.\d+.\d+.\d+)\s.+GET\s+(.+)\s+HTTP.+/) {
        my $ip     = $1;
        my $file   = $2;
        my $record = "Via HTTP get $file";

        if (exists($rawdata{"$ipnodemap{$ip}"})) {
            probe_utils->send_msg("$output", "d", "[$ipnodemap{$ip}] $record") if ($monitor);
            push(@{ $rawdata{ $ipnodemap{$ip} }{"history"} }, $record);
            push(@{ $rawdata{ $ipnodemap{$ip} }{statehistory} }, $rawdata{ $ipnodemap{$ip} }{state}) if (reset_state(\$rawdata{ $ipnodemap{$ip} }{state}, "http"));
        }
    }
    return 0;
}

#------------------------------------------

=head3
    Description:
       Handle one line log come from cluster.log
    Arguments:
       msg: one line log
    Returns:
        0 : pass
        1 : failed
=cut

#------------------------------------------
sub handle_cluster_msg {
    my $line = shift;
    my $node = "";
    my $msg;
    my $status;

    my @splitline = split(/\s+/, $line);
    if (($splitline[4] =~ /^xcat/i) || ($splitline[5] =~ /^xcat/i)) {

        #log like: Aug  7 22:30:31 c910f02c01p09 xcat: c910f02c04p04 status: booted statustime: 08-07-2016 22:30:31
        if (($splitline[6] =~ /^status:$/i) && ($splitline[8] =~ /^statustime:$/)) {
            $node   = $splitline[5];
            $status = $splitline[7];
            if (exists($rawdata{$node})) {
                my $record = "Node status is changed to $status";
                probe_utils->send_msg("$output", "d", "[$node] $record") if ($monitor);
                push(@{ $rawdata{$node}{"history"} }, $record);

                #one node finish deployment
                if ($status eq "booted") {
                    $monitor_nodes{$node}{"rst"} = 1 if (defined($monitor_nodes{$node}));
                    push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "booted"));
                }

                if ($status eq "powering-on") {
                    push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "poweron"));
                }
            }

        }
    }
    return 0;
}

#-----------------------------------------

=head3
    Description:
       Handle one line log come from computes.log
    Arguments:
       msg: one line compute log
    Returns:
        0 : pass
        1 : failed
=cut

#-----------------------------------------
sub handle_compute_msg {
    my $line = shift;
    my $sender;
    my $node;
    my $msg;

    if ($line =~ /.+\d{2}:\d{2}:\d{2}\s+(.+)\s+(xcat.+)/i) {
        $sender = $1;
        $msg    = $2;

        if (!xCAT::NetworkUtils->isIpaddr($sender)) {
            $node = $sender;
        } else {
            $node = $ipnodemap{$sender};
        }

        if ($node ne "" && exists($rawdata{$node})) {
            probe_utils->send_msg("$output", "d", "[$node] $msg") if ($monitor);
            push(@{ $rawdata{$node}{"history"} }, $msg);

            #node start to run postscript or postbootscript
            if ($msg =~ /Running postscript/) {
                push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state}) if (reset_state(\$rawdata{$node}{state}, "postscript"));
            }
        }
    }

    return 0;
}

#------------------------------------------

=head3
    Description:
       Test if all nodes have finished job
    Arguments:
       One golble attribute %monitor_nodes;
    Returns:
       1: all nodes finished
       0: not all nodes finished
=cut

#------------------------------------------
sub all_monitor_node_done {
    my $done = 1;

    foreach my $node (keys %monitor_nodes) {
        if ($monitor_nodes{$node}{"rst"} == 0) {
            $done = 0;
            last;
        }
    }

    return $done;
}

#------------------------------------------

=head3
    Description:
       Dump monitor history,  categorised by mac address.
    Arguments:
       NULL
    Returns:
=cut

#------------------------------------------
sub dump_history {

    my $title = "
=============================================================
=  The summary of os provision:
=============================================================
";
    print "$title\n";

    foreach $node (keys %rawdata) {

        if ($verbose) {
            print "[$node]\n";

            my $httphit = 0;
            my @httptmp;
            foreach my $line (@{ $rawdata{$node}{"history"} }) {
                if ($line =~ /Via HTTP/) {
                    if ($httphit) {
                        shift @httptmp if ($#httptmp > 0);
                        push @httptmp, $line;
                    } else {
                        print "\t$line\n";
                        $httphit = 1;
                    }
                } else {
                    if ($#httptmp > -1) {
                        print "\tVia HTTP ..........\n";
                        print "\t$_\n" foreach (@httptmp);
                    }
                    @httptmp = ();
                    $httphit = 0;
                    print "\t$line\n";
                }
            }
            my $statelist = "";
            for (my $i = 0 ; $i < scalar(@{ $rawdata{$node}{statehistory} }) ; $i++) {
                $statelist .= "$state_set_reverse{$rawdata{$node}{statehistory}[$i]} ";
            }
            probe_utils->send_msg("$output", "d", "[$node] state history: $statelist");
        }

        my @tmpnodestatehistory = @{ $rawdata{$node}{statehistory} };

        #print "state history = @tmpnodestatehistory\n";

        my %match_result;
        my $procidx      = 0;
        my $newloop      = 0;
        my $notfirstloop = 0;
        while (@tmpnodestatehistory) {
            undef %match_result if ($notfirstloop);
            $newloop = 0;
            foreach my $type (keys %valid_process) {
                if ($notfirstloop) {
                    $procidx = 1;
                } else {
                    $procidx = 0;
                }
                my $proclen = scalar(@{ $valid_process{$type}{process} });
                my $i;
                for ($i = 0 ; $i < scalar(@tmpnodestatehistory) ; $i++) {
                    if ($procidx < $proclen) {
                        if ($tmpnodestatehistory[$i] == $valid_process{$type}{process}[$procidx]) {
                            ++$procidx;
                        } else {
                            --$procidx if (notfirstloop);
                            push @{ $match_result{$procidx} }, $type;
                            last;
                        }
                    } else {
                        splice(@tmpnodestatehistory, 0, $i);
                        $newloop      = 1;
                        $notfirstloop = 1;
                        last;
                    }
                }
                if ($i == scalar(@tmpnodestatehistory)) {
                    push @{ $match_result{$procidx} }, $type;
                    next;
                }
                last if ($newloop);
            }
            last if (!$newloop);
        }

       #print "------------result---------------\n";
       #print Dumper %match_result;

        my $max_match = 0;
        foreach my $key (keys %match_result) {
            $max_match = $key if ($key > $max_match);
        }

        my $formatprefix;
        if ($max_match == 0) {
            my $statelist = "";
            for (my $i = 0 ; $i < scalar(@{ $rawdata{$node}{statehistory} }) ; $i++) {
                $statelist .= "$state_set_reverse{$rawdata{$node}{statehistory}[$i]} ";
            }
            probe_utils->send_msg("$output", "f", "[$node] deployment failed");
            probe_utils->send_msg("$output", "d", "\t$node did unknown process, state change history is $statelist");
        } elsif (scalar(@{ $match_result{$max_match} }) > 1) {
            probe_utils->send_msg("$output", "f", "[$node] deployment failed");
            probe_utils->send_msg("$output", "d", "\tThere are more than one possible process satisfy $node situation");
            foreach my $proc (@{ $match_result{$max_match} }) {
                if ($valid_process{$proc}{type} eq "deploy") {
                    probe_utils->send_msg("$output", "d", "\tpossible process \"deploy\", pass $state_set_reverse{$valid_process{$proc}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$proc}{process}[$max_match]} stage");
                } elsif ($valid_process{$proc}{type} eq "reboot") {
                    probe_utils->send_msg("$output", "d", "\tpossible process \"reboot\", pass $state_set_reverse{$valid_process{$proc}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$proc}{process}[$max_match]} stage");
                }
            }
        } else {
            if (($valid_process{ $match_result{$max_match}[0] }{type} eq "deploy") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] == $state_set{done})) {
                probe_utils->send_msg("$output", "o", "[$node] deployment completed");
            } elsif (($valid_process{ $match_result{$max_match}[0] }{type} eq "deploy") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] != $state_set{done})) {
                probe_utils->send_msg("$output", "f", "[$node] deployment failed, pass $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match]} stage")
            } elsif (($valid_process{ $match_result{$max_match}[0] }{type} eq "reboot") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] == $state_set{done})) {
                probe_utils->send_msg("$output", "f", "[$node] reboot completed, without deployment process");
            } elsif (($valid_process{ $match_result{$max_match}[0] }{type} eq "reboot") && ($valid_process{ $match_result{$max_match}[0] }{process}[ $max_match - 1 ] != $state_set{done})) {
                probe_utils->send_msg("$output", "f", "[$node] reboot failed, without deployment process, stop at $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match-1]} stage, something wrong during $state_set_reverse{$valid_process{$match_result{$max_match}[0]}{process}[$max_match]} stage");
            }
        }
    }
}

#------------------------------------------

=head3
    Description:
       Monitor the process of os provision
    Returns:
        0: pass
        1: failed
=cut

#------------------------------------------
sub do_monitor {
    $SIG{TERM} = $SIG{INT} = sub {
        $terminal = 1;
    };

    my $rst = 0;
    my $startline =
      "-------------------------------------------------------------
Start capturing every message during OS provision process......
-------------------------------------------------------------
";

    print("$startline\n");

    my $varlogmsg  = "/var/log/messages";
    my $clusterlog = "/var/log/xcat/cluster.log";
    my $computelog = "/var/log/xcat/computes.log";

    my $httplog;
    if (-e "/var/log/httpd/access_log") {
        $httplog = "/var/log/httpd/access_log";
    } elsif (-e "/var/log/apache2/access_log") {
        $httplog = "/var/log/apache2/access_log";
    } elsif (-e "/var/log/apache2/access.log") {
        $httplog = "/var/log/apache2/access.log";
    }

    my $varlogpid;
    my $clusterpid;
    my $httppid;
    my $computerpid;

    {
        if (!-e $varlogmsg) {
            probe_utils->send_msg("$output", "w", "$varlogmsg doesn't exist");
        } else {
            if (!($varlogpid = open(VARLOGMSGFILE, "tail -f -n 0 $varlogmsg 2>&1 |"))) {
                probe_utils->send_msg("$output", "f", "Can't open $varlogmsg to get logs");
                $rst = 1;
                last;
            }
        }

        if (!-e "$clusterlog") {
            probe_utils->send_msg("$output", "w", "$clusterlog doesn't exist");
            probe_utils->send_msg("$output", "i", "If using SLES11 or xCAT2.11.x, ignore above warning");
        } else {
            if (!($clusterpid = open(CLUSTERLOGFILE, "tail -f -n 0 $clusterlog 2>&1 |"))) {
                probe_utils->send_msg("$output", "f", "Can't open $clusterlog to get logs");
                $rst = 1;
                last;
            }
        }
        if (!-e "$httplog") {
            probe_utils->send_msg("$output", "w", "$httplog doesn't exist");
        } else {
            if (!($httppid = open(HTTPLOGFILE, "tail -f -n 0 $httplog 2>&1 |"))) {
                probe_utils->send_msg("$output", "f", "Can't open $httplog to get logs");
                $rst = 1;
                last;
            }
        }
        if (!-e "$computelog") {
            probe_utils->send_msg("$output", "w", "$computelog doesn't exist");
            probe_utils->send_msg("$output", "i", "If sles11 or xCAT2.11.x, ignore above warning");
        } else {
            if (!($computerpid = open(COMPUTERFILE, "tail -f -n 0 $computelog 2>&1 |"))) {
                probe_utils->send_msg("$output", "f", "Can't open $computelog to get logs");
                $rst = 1;
                last;
            }
        }

        my $select = new IO::Select;
        $select->add(\*VARLOGMSGFILE);
        $select->add(\*CLUSTERLOGFILE);
        $select->add(\*HTTPLOGFILE);
        $select->add(\*COMPUTERFILE);
        $| = 1;

        my $line = "";
        my @hdls;
        my $hdl;

        my $starttime = time();
        $monitor = 1;
        for (; ;) {
            if (@hdls = $select->can_read(0)) {
                foreach $hdl (@hdls) {
                    if ($hdl == \*VARLOGMSGFILE) {
                        chomp($line = <VARLOGMSGFILE>);
                        my @tmp = split(/\s+/, $line);
                        if ($tmp[4] =~ /dhcpd/i && $line =~ /$installnic/) {
                            handle_dhcp_msg("$line");
                        } elsif ($tmp[4] =~ /in.tftpd/i) {
                            handle_tftp_msg("$line");
                        } elsif (($tmp[4] =~ /^xcat/i) || ($tmp[5] =~ /^xcat/i)) {
                            if (grep(/^$tmp[3]$/, @candidate_svr_hostname_inlog)) {
                                handle_cluster_msg("$line");
                            } else {
                                handle_compute_msg("$line");
                            }
                        }
                    } elsif ($hdl == \*CLUSTERLOGFILE) {
                        chomp($line = <CLUSTERLOGFILE>);
                        handle_cluster_msg("$line");
                    } elsif ($hdl == \*HTTPLOGFILE) {
                        chomp($line = <HTTPLOGFILE>);
                        handle_http_msg("$line");
                    } elsif ($hdl == \*COMPUTERFILE) {
                        chomp($line = <COMPUTERFILE>);
                        handle_compute_msg("$line");
                    }
                }
            }

            if ($terminal || (%monitor_nodes && all_monitor_node_done())) {
                if ($terminal) {
                    probe_utils->send_msg("$output", "d", "Get INT or TERM signal from STDIN");
                } else {
                    probe_utils->send_msg("$output", "o", "All nodes specified to monitor, have finished OS provision process");
                }
                last;
            }

            if (time() - $starttime > ($maxwaittime * 60)) {
                probe_utils->send_msg("$output", "i", "$maxwaittime minutes have expired, stop monitoring");
                last;
            }
            sleep 0.01;
        }
        &dump_history;
    }
    kill 'INT', $varlogpid   if ($varlogpid);
    kill 'INT', $clusterpid  if ($clusterpid);
    kill 'INT', $httppid     if ($httppid);
    kill 'INT', $computerpid if ($computerpid);
    close(VARLOGMSGFILE)  if (VARLOGMSGFILE);
    close(CLUSTERLOGFILE) if (CLUSTERLOGFILE);
    close(HTTPLOGFILE)    if (HTTPLOGFILE);
    close(COMPUTERFILE)   if (COMPUTERFILE);

    return $rst;
}

sub get_valid_logs {
    my $ref_timestamp        = shift;
    my $year                 = shift;
    my $epoch_seconds_of_now = shift;
    my $bthistory_ref        = shift;
    my @orglogfilelist       = ("/var/log/xcat/cluster.log",
        "/var/log/messages",
        "/var/log/xcat/computes.log");
    my $httplog;
    if (-e "/var/log/httpd/access_log") {
        $httplog = "/var/log/httpd/access_log";
    } elsif (-e "/var/log/apache2/access_log") {
        $httplog = "/var/log/apache2/access_log";
    } elsif (-e "/var/log/apache2/access.log") {
        $httplog = "/var/log/apache2/access.log";
    }
    push @orglogfilelist, $httplog;

    foreach my $f (@orglogfilelist) {
        my $filename = basename("$f");
        $filename =~ s/(.+)\.(.+)/$1/g;
        my $path_only = dirname("$f");
        my @rotatefiles;

        #TODO  using opendir to refine below code
        my @alltargetfiles = `ls -lt $path_only |awk -F" " '/ $filename/ {print \$9}'`;
        foreach my $samenamefile (@alltargetfiles) {
            chomp($samenamefile);
            push @rotatefiles, "$path_only/$samenamefile";
        }

        my $ishttplog = 0;
        $ishttplog = 1 if ($filename =~ /access/);

        foreach my $file (@rotatefiles) {
            my $fd;
            my $filetype = `file $file 2>&1`;
            chomp($filetype);
            if ($filetype =~ /ASCII/) {
                if (!open($fd, "$file")) {
                    print "open $files failed\n";
                    next;
                }
            } else {

                #TODO  handle compression files
            }

            #print "--->load $file\n";
            my $line;
            my $historynum = 0;
            last unless ($line = <$fd>);
            chomp($line);
            my $needrotate = 0;
            my $logindex   = 0;
            my @splitline  = split(/\s+/, $line);
            my $timestamp;
            my $timestampepoch;

            if ($ishttplog) {
                $splitline[3] =~ s/^\[(.+)/$1/g;
                $timestampepoch = probe_utils->convert_to_epoch_seconds($splitline[3]);
            } else {
                $timestamp = join(" ", @splitline[ 0 .. 2 ]);
                $timestampepoch = probe_utils->convert_to_epoch_seconds($timestamp, $year, $epoch_seconds_of_now);
            }
            if ($ref_timestamp <= $timestampepoch) {
                $needrotate = 1;
            } else {
                seek($fd, 0, 2);
                my $tail     = tell;
                my $head     = 0;
                my $lasttail = $tail;
                my $i        = 0;
                while ($head <= $tail) {
                    my $middle = int(($tail - $head) / 2) + $head;
                    seek($fd, $middle, 0);
                    $line = <$fd>;
                    $middle += length($line);
                    last unless ($line = <$fd>);
                    @splitline = split(/\s+/, $line);
                    if ($ishttplog) {
                        $splitline[3] =~ s/^\[(.+)/$1/g;
                        $timestampepoch = probe_utils->convert_to_epoch_seconds($splitline[3]);
                    } else {
                        $timestamp = join(" ", @splitline[ 0 .. 2 ]);
                        $timestampepoch = probe_utils->convert_to_epoch_seconds($timestamp, $year, $epoch_seconds_of_now);
                    } if ($ref_timestamp == $timestampepoch) {
                        $historynum = $middle;
                        last;
                    } elsif ($ref_timestamp < $timestampepoch) {
                        $tail = $middle;
                        last if ($tail == $lasttail);
                        $lasttail = $tail;
                    } else {
                        $head = $middle;
                    }
                }
                $historynum = $head unless ($historynum);
            }

            seek($fd, $historynum, 0);
            while (<$fd>) {
                chomp;
                @splitline = split(/\s+/, $_);
                if ($ishttplog) {
                    $splitline[3] =~ s/^\[(.+)/$1/g;
                    $timestampepoch = probe_utils->convert_to_epoch_seconds($splitline[3]);
                } else {
                    $timestamp = join(" ", @splitline[ 0 .. 2 ]);
                    $timestampepoch = probe_utils->convert_to_epoch_seconds($timestamp, $year, $epoch_seconds_of_now);
                }

                if (($splitline[4] =~ /dhcpd/i && $_ =~ /$installnic/)
                    || ($splitline[4] =~ /in.tftpd/i)
                    || (($splitline[4] =~ /^xcat/i) || ($splitline[5] =~ /^xcat/i))
                    || ($splitline[5] =~ /GET/ && $splitline[7] =~ /HTTP/)) {
                    my $log = "$timestampepoch $filename$logindex $_";
                    $logindex++;
                    push @$bthistory_ref, $log;
                }
            }
            close($fd);
            last unless ($needrotate);
        }
    }

    #sort logs depending on time
    my @sort_ht = sort(@$bthistory_ref);
    for ($i = 0 ; $i <= $#sort_ht ; $i++) {
        $sort_ht[$i] =~ s/^(\d+) (\S+) (.+)/$1 $3/g;
    }

    #delete duplicate logs
    my %count;
    @$bthistory_ref = grep { ++$count{$_} < 2; } @sort_ht;
}

sub do_replay {
    my $ref_timestamp = shift;

    my $timestr = scalar(localtime($ref_timestamp));
    print "Start logs search after '$timestr', waiting for a while.............\n";

    my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst) = localtime(time());
    my $epoch_seconds_of_now = time();
    my @bthistory;
    get_valid_logs($ref_timestamp, $year, $epoch_seconds_of_now, \@bthistory);

    foreach my $line (@bthistory) {
        $line =~ s/(\d+) (.+)/$2/g;
        my @tmp = split(/\s+/, $line);
        if ($tmp[4] =~ /dhcpd/i && $line =~ /$installnic/) {
            handle_dhcp_msg("$line");
        } elsif ($tmp[4] =~ /in.tftpd/i) {
            handle_tftp_msg("$line");
        } elsif (($tmp[4] =~ /^xcat/i) || ($tmp[5] =~ /^xcat/i)) {
            if (grep(/^$tmp[3]$/, @candidate_svr_hostname_inlog)) {
                handle_cluster_msg("$line");
            } else {
                handle_compute_msg("$line");
            }
        } elsif ($tmp[5] =~ /GET/ && $tmp[7] =~ /HTTP/) {
            handle_http_msg("$line");
        }
    }
    &dump_history;
    return 0;
}

#-------------------------------------
# main process
#-------------------------------------
if (
    !GetOptions("--help|h|?" => \$help,
        "T"   => \$test,
        "V"   => \$verbose,
        "t"   => \$maxwaittime,
        "r=s" => \$replaylog,
        "n=s" => \$noderange))
{
    probe_utils->send_msg("$output", "f", "Invalid parameter for $program_name");
    probe_utils->send_msg("$output", "d", "$::USAGE");
    exit 1;
}

if ($help) {
    if ($output ne "stdout") {
        probe_utils->send_msg("$output", "d", "$::USAGE");
    } else {
        print "$::USAGE";
    }
    exit 0;
}

if ($test) {
    probe_utils->send_msg("$output", "o", "Probe for OS provision process, realtime monitor of OS provision process.");
    exit 0;
}

unless ($noderange) {
    probe_utils->send_msg("$output", "f", "A noderange is required");
    probe_utils->send_msg("$output", "d", "$::USAGE");
    exit 1;
}

my $epoch_starttime = time();
if ($replaylog) {
    if ($replaylog =~ /(\d+)h(\d+)m/i) {
        $epoch_starttime -= ($1 * 3600 + $2 * 60)
    } elsif ($replaylog =~ /^(\d+)h*$/i) {
        $epoch_starttime -= $1 * 3600;
    } elsif ($replaylog =~ /^(\d+)m$/) {
        $epoch_starttime -= $1 * 60;
    } else {
        probe_utils->send_msg("$output", "f", "Unsupported time format for history log replay");
        print "$::USAGE";
        exit 1;
    }
}

my $msg = "All pre_defined nodes are valid";
my $rc  = check_noderange($noderange);
if ($rc) {
    probe_utils->send_msg("$output", "f", $msg);
    $rst = 1;
} else {
    probe_utils->send_msg("$output", "o", $msg);
}

unless ($installnic) {
    my $masteripinsite = `tabdump site | awk -F',' '/^"master",/ { gsub(/"/, "", \$2) ; print \$2 }'`;
    chomp($masteripinsite);
    $installnic = `ip addr |grep -B2 $masteripinsite|awk -F" " '/mtu/{gsub(/:/,"",\$2); print \$2}'`;
    chomp($installnic);
    if (!$installnic) {
        probe_utils->send_msg("$output", "f", "The value of 'master' in 'site' table is $masteripinsite, can't get corresponding network interface");
        $rst = 1;
    } else {
        probe_utils->send_msg("$output", "i", "The installation network interface is $installnic");
    }
}

exit $rst if ($rst);

foreach my $node (keys %monitor_nodes) {
    $rawdata{$node}{state} = $state_set{unknown};
    push(@{ $rawdata{$node}{statehistory} }, $rawdata{$node}{state});
}

if ($replaylog) {
    $rst = do_replay($epoch_starttime);
    exit $rst;
}

$rst = do_monitor();

exit $rst;