#!/usr/bin/perl
# IBM(c) 2008 EPL license http://www.eclipse.org/legal/epl-v10.html

###########################################################################
#                                                                         #
# Command: healthCheck                                                    #
#                                                                         #
#-------------------------------------------------------------------------#
#    This script is used to check the system health for both AIX and 
#    Linux Managed Nodes on Power6 platforms. It will use xdsh to access
#    the target nodes, and check the status for processor clock speed,
#    IB interfaces, memory, large page configuration and HCA status.
#    If xdsh is unreachable, an error message will be given. 
# Command Syntax:
#    healthCheck { [-n node_list] [-M]}
#                {[-p min_clock_speed] [-i method] [-m min_memory]
#                [-l min_freelp] [ -H [--speed speed --ignore interface_list --width width]]} 
#                [ -h ]
#
#        -M          Check status for all the Managed Nodes that are defined on this MN.
#        -n node_list
#                    Specifies a comma-separated list of node host names, IP addresses for health check.
#        -p min_clock_speed
#                    Specifies the minimal processor clock speed in MHz for processor monitor.
#        -i method
#                    Specifies the method to do Infiniband interface status check, the supported 
#                    check methods are LL and RSCT.
#        -m min_memory
#                    Specifies the minimal total memory in MB.
#        -l min_freelp
#                    Specifies the minimal free large page number.
#        -H          Check the status for HCAs.
#        --speed speed
#                    Specifies the physical port speed in G bps, it should be used with -H flag.
#        --ignore interface_list
#                    Specifies a comma-separated list of interface name to ignore from HCA status check, 
#                    such as ib0,ib1. It should be used with -H flag.
#        --width width
#                    Specifies the physical port width, such as 4X or 12X. It should be used with -H flag.
#        -h          Display usage information.
# Exit codes:
#     0 - success
#     1 - fail
###########################################################################

use strict;
use Getopt::Long;

# Location of the run log written by this script
$::HEALTHCHECK_PATH = '/var/log/xcat';
$::HEALTHCHECK_LOG  = $::HEALTHCHECK_PATH . '/healthCheck.log';

# Return codes and run-state values shared by the whole script
$::OK          = 0;
$::NOK         = 1;
$::logging     = 0;
$::GLOBAL_EXIT = 0;

# Command used to list and resolve node names
$::NODELS = '/opt/xcat/bin/nodels';

# Processor clock speed commands (Linux / AIX)
$::LinuxProcCmd = 'cat /proc/cpuinfo';
$::AIXProcCmd   = '/usr/pmapi/tools/pmcycles';

# IB interface status commands: LoadLeveler (Linux / AIX) and RSCT
$::IBifLLCmdL  = '/opt/ibmll/LoadL/full/bin/llstatus';
$::IBifLLCmdA  = '/usr/lpp/LoadL/full/bin/llstatus';
$::IBifRSCTCmd = '/usr/bin/lsrsrc';

# Memory and large page commands (Linux / AIX)
$::LinuxMemCmd = 'cat /proc/meminfo';
$::AIXMemCmd   = '/usr/bin/vmstat';

# HCA status commands (Linux / AIX)
$::LinuxHCACmd = '/usr/bin/ibv_devinfo';
$::AIXHCACmd   = '/usr/bin/ibstat';

#Nodes to be checked (filled in by getArgs)
my @NodeList;
#Reachable nodes, split by operating system
my @LnxNodeList;
my @AIXNodeList;
#Ignored ib interfaces when checking HCA status (filled in by getArgs)
my @IgnoreList;

# MAIN Main main#
# Append logging information to healthCheck.log.
# Every later step prints to $::LOG_FILE_HANDLE, so the handle must always be
# usable: when the log file cannot be opened, log output is sent to the null
# device instead of letting the first log print abort the script.
if (&append_logging($::HEALTHCHECK_LOG) == $::OK)
{
    # Only a successfully opened log needs stop_logging() at exit
    $::logging++;
}
else
{
    print "Warning: Logging is disabled because the log file could not be opened.\n";
    my $nullHandle;
    unless (open($nullHandle, '>', '/dev/null'))
    {
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }
    $::LOG_FILE_HANDLE = $nullHandle;
}

&getArgs();

my @ReachableNodes;
my @UnreachableNodes;
if (scalar(@NodeList) > 0)
{
    # Check if xdsh is reachable
    foreach my $node (@NodeList)
    {
        my $rc = &checkDshReachability($node);
        if ($rc == 0)
        { # xdsh is ok
            push @ReachableNodes, $node;
        }
        else
        {
            push @UnreachableNodes, $node;
        }
    }

    if (scalar (@UnreachableNodes))
    {
        my $UnreachableNodes = join (", ", @UnreachableNodes);
        print "Warning: The xdsh is unreachable for the node(s): $UnreachableNodes. Run updatenode to configure xdsh for that node(s).\n";
        print $::LOG_FILE_HANDLE "Warning: The xdsh is unreachable for the node(s): $UnreachableNodes. Run updatenode to configure xdsh for that node(s).\n";
    }
}
else
{
    print "Error: No hosts in node list\n";
    print $::LOG_FILE_HANDLE "Error: No hosts in node list\n";
    $::GLOBAL_EXIT = $::NOK;
    exit;
}

#Divide the nodes into two groups: Linux nodes and AIX nodes
my @AllLnxNodes = `$::NODELS all nodetype.os | grep -E "sles|rhel"`;
print $::LOG_FILE_HANDLE "Running command: $::NODELS all nodetype.os | grep -E sles|rhel\n";
chomp @AllLnxNodes;
my @AllAIXNodes = `$::NODELS all nodetype.os | grep "AIX"`;
print $::LOG_FILE_HANDLE "Running command: $::NODELS all nodetype.os | grep AIX\n";
chomp @AllAIXNodes;

foreach my $node (@ReachableNodes)
{
    # Each nodels output line starts with "<node>:". Match the complete,
    # literal node name so that "n1" does not also match "n10" and so that
    # characters such as "." in a host name are not treated as regex syntax.
    my $nodePrefix = qr/^\Q$node\E:/;
    if ( grep { $_ =~ $nodePrefix } @AllLnxNodes )
    {
        push @LnxNodeList, $node;
    }
    if ( grep { $_ =~ $nodePrefix } @AllAIXNodes )
    {
        push @AIXNodeList, $node;
    }
}

#Do health check for Linux nodes and AIX nodes
if (scalar (@LnxNodeList))
{
    my $rc = &healthCheckLinux(\@LnxNodeList);
    if ($rc)
    {
        $::GLOBAL_EXIT = $rc;
        exit;
    }
}
if (scalar (@AIXNodeList))
{
    my $rc = &healthCheckAIX(\@AIXNodeList);
    if ($rc)
    {
        $::GLOBAL_EXIT = $rc;
        exit;
    }
}

# Finish up and exit
# The END block runs on every exit path (including the early "exit" calls
# above and inside the subroutines): it closes the log and promotes
# $::GLOBAL_EXIT to the process exit status.
END
{
    if ($::logging)
    {
        &stop_logging();
    }
    #Determine exit code
    if ($::GLOBAL_EXIT > $?)
    {
        $? = $::GLOBAL_EXIT;
    }
}
exit;  # end of Main

#--------------------------------------------------------------------------------

=head3  getArgs
        Parse the command line and check the values
=cut

#--------------------------------------------------------------------------------
sub getArgs()
{
    # Parse the command line into the $::... option globals, validate the
    # combination of flags, and fill @NodeList / @IgnoreList.
    # On -h or on any invalid input the usage text is printed and the script
    # exits; $::GLOBAL_EXIT carries the exit status picked up by the END block.
    # Returns $::OK when parsing succeeded.
    $Getopt::Long::ignorecase = 0;
    my $rc = GetOptions(
        'h'        => \$::HELP,
        'M'        => \$::ALLMANAGED,
        'n=s'      => \$::NODE_LIST,
        'p=f'      => \$::MIN_CLOCK_SPEED,
        'i=s'      => \$::IF_CHECK_METHOD,
        'm=f'      => \$::MIN_MEMORY,
        'l=i'      => \$::MIN_FREELP,
        'H'        => \$::HCA,
        'speed=f'  => \$::HCASPEED,
        'ignore=s' => \$::IGNORE_LIST,
        'width=s'  => \$::WIDTH
    );
    if ($::HELP)
    {
        &usage();
        $::GLOBAL_EXIT = $::OK;
        exit;
    }
    if (!$rc)
    { #There are syntax errors in parameters
        &usage();
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }

    # Validate -i whenever it was given, including empty or "0" values, which
    # would otherwise pass the "nothing to check" test below and then fail
    # later without any message.
    if (defined($::IF_CHECK_METHOD)
        && $::IF_CHECK_METHOD ne "LL"
        && $::IF_CHECK_METHOD ne "RSCT")
    {
        print "The method to check IB interface must be \"LL\" or \"RSCT\".\n";
        &usage();
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }

    if(!defined($::MIN_CLOCK_SPEED) && !defined($::IF_CHECK_METHOD) && !defined($::MIN_MEMORY) && !defined($::MIN_FREELP) && !defined($::HCA))
    {
        print "There is nothing to check. Please specify what you want to check.\n";
        &usage();
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }

    # --speed, --ignore and --width only make sense together with -H
    my $ignoreGiven = (defined($::IGNORE_LIST) && $::IGNORE_LIST ne "");
    my $widthGiven  = (defined($::WIDTH) && $::WIDTH ne "");
    if(!defined($::HCA) && (defined($::HCASPEED) || $ignoreGiven || $widthGiven))
    {
        print "The flag --speed, --ignore or --width should be used with -H flag.\n";
        &usage();
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }

    if($widthGiven && !($::WIDTH =~ /^(\d+)X$/))
    {
        print "The parameter of --width should be 4X, 12X, etc.\n";
        &usage();
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }

    #Parse the node list
    if ($::NODE_LIST)
    {
        @NodeList = `$::NODELS $::NODE_LIST`;
        my $status = $?;
        chomp @NodeList;
        if ($status)
        {
            # Report the failure on the console as well as in the log, and
            # make the script exit with a failure status.
            print "Error: Running command $::NODELS $::NODE_LIST failed.\n";
            print $::LOG_FILE_HANDLE  "Running command: $::NODELS $::NODE_LIST fail\n";
            $::GLOBAL_EXIT = $::NOK;
            exit;
        }
        print $::LOG_FILE_HANDLE  "Running command: $::NODELS $::NODE_LIST\n";
    }

    #If -M is set, use every node returned by nodels (this replaces any -n list)
    if ($::ALLMANAGED)
    {
        my @templist = `$::NODELS`;
        my $status = $?;
        chomp @templist;
        if ($status)
        {
            print "Error: Running command $::NODELS failed.\n";
            print $::LOG_FILE_HANDLE  "Running command: $::NODELS fail\n";
            $::GLOBAL_EXIT = $::NOK;
            exit;
        }
        # Log the command that was actually run
        print $::LOG_FILE_HANDLE  "Running command: $::NODELS\n";
        @NodeList = @templist;
    }
    if ($::IGNORE_LIST)
    {
        @IgnoreList = split /,/, $::IGNORE_LIST;
    }
    return $::OK;
}

#--------------------------------------------------------------------------------

=head3  usage

        usage for healthCheck
=cut

#--------------------------------------------------------------------------------
sub usage()
{
    # Print the command syntax and the description of every flag to STDOUT.
    # The text is a non-interpolating here-document, printed verbatim.
    print <<'END_OF_USAGE';
Usage: healthCheck { [-n node_list] [-M]}
                    {[-p min_clock_speed] [-i method] [-m min_memory]
                    [-l min_freelp] [ -H [--speed speed --ignore interface_list --width width]]} 
                    [ -h ]

        -M          Check status for all the Managed Nodes that are defined on this MS.
        -n node_list
                    Specifies a comma-separated list of node host names, IP addresses for health check.
        -p min_clock_speed
                    Specifies the minimal processor clock speed in MHz for processor monitor.
        -i method
                    Specifies the method to do Infiniband interface status check, the supported 
                    check methods are LL and RSCT.
        -m min_memory
                    Specifies the minimal total memory in MB.
        -l min_freelp
                    Specifies the minimal free large page number.
        -H          Check the status for HCAs.
        --speed speed
                    Specifies the physical port speed in G bps, it should be used with -H flag.
        --ignore interface_list
                    Specifies a comma-separated list of interface name to ignore from HCA status check, 
                    such as ib0,ib1. It should be used with -H flag.
        --width width
                    Specifies the physical port width, such as 4X or 12X. It should be used with -H flag.
        -h          Display usage information.
END_OF_USAGE
}

#--------------------------------------------------------------------------------

=head3   checkDshReachability

        Notes: Check the xdsh reachability between the Management Server
               and node.

Arguments:
        $node - the remote node hostname.

Returns:
        $::OK - The remote node is reachable through xdsh.
        $::NOK - The remote node is unreachable through xdsh.

=cut

#--------------------------------------------------------------------------------
sub checkDshReachability()
{
    # Run "date" on the node through xdsh and report whether that worked.
    # Returns $::OK when the command status ($?) is zero, $::NOK otherwise.
    my ($node) = @_;
    my $probe = "xdsh $node date 2>/dev/null";

    # The command output is not needed; only its exit status matters.
    my $discarded = `$probe`;
    print $::LOG_FILE_HANDLE  "Running command: $probe\n";

    return ($? == $::OK) ? $::OK : $::NOK;
}

#--------------------------------------------------------------------------------

=head3   checkCmdAvailability

        Check availability of some command on the nodes.

Arguments:
        $refNodes - The reference to a group of nodes.
        $cmd - The command that will be checked.
Returns:
        @ValidNodes - The nodes on which the command is available.
=cut

#--------------------------------------------------------------------------------
sub checkCmdAvailability
{
    # For each node, run "ls <cmd>" through xdsh to see whether the command
    # file exists there. Nodes where that fails are listed in one warning
    # (console and log); the nodes where it succeeds are returned as a list.
    my ($refNodes, $cmd) = @_;
    my (@usable, @missing);

    for my $target (@$refNodes)
    {
        #Use xdsh to "ls" the command
        my $probe = "xdsh $target ls $cmd 2>/dev/null";
        `$probe`;
        print $::LOG_FILE_HANDLE "Running command: $probe\n";

        # Sort the node into the matching list based on the exit status
        my $bucket = ($? == $::OK) ? \@usable : \@missing;
        push @$bucket, $target;
    }

    if (@missing)
    {
        my $names   = join(", ", @missing);
        my $warning = "Warning: The command $cmd is not available on the node(s): $names.\n";
        print $warning;
        print $::LOG_FILE_HANDLE $warning;
    }
    return @usable;
}

#-------------------------------------------------------------------------------

=head3    append_logging

	Append logging messages to a logfile.

=cut

#-------------------------------------------------------------------------------
sub append_logging()
{
    # Open $logfile for appending (creating it, and the $::HEALTHCHECK_PATH
    # directory, when needed), publish the handle as $::LOG_FILE_HANDLE and
    # write a "Logging started" banner.
    # Returns $::OK on success, or $::NOK after printing a message when the
    # file cannot be opened.
    my ($logfile) = @_;

    my $isNewFile = !-e $logfile;
    if ($isNewFile)
    {
        # Make sure the log directory exists. mkpath croaks on failure, so it
        # is wrapped in eval; a directory problem is then reported by the
        # open() check below.
        require File::Path;
        eval { File::Path::mkpath($::HEALTHCHECK_PATH); };
    }

    # Append mode creates the file when it is missing, so a single
    # three-argument open covers both the new-file and existing-file cases.
    unless (open(LOGFILE, '>>', $logfile))
    {
        if ($isNewFile)
        {
            print "Can't open file \"$logfile\" for writing.\n";
        }
        else
        {
            print "Can't update file  \"$logfile\".\n";
        }
        return $::NOK;
    }

    $::LOG_FILE_HANDLE = \*LOGFILE;

    # Print the date to the top of the logfile
    my $sdate = `/bin/date`;
    chomp $sdate;
    print "Output log is being written to \"$logfile\".\n";

    print $::LOG_FILE_HANDLE
        "---------------------------------------------------------------------\n";
    print $::LOG_FILE_HANDLE "Logging started $sdate.\n";
    print $::LOG_FILE_HANDLE
        "---------------------------------------------------------------------\n";

    return $::OK;
}

#-------------------------------------------------------------------------------

=head3    stop_logging

	Turn off message logging.
	
=cut

#-------------------------------------------------------------------------------
sub stop_logging()
{
    # Write the closing banner (with the current date) to the log, close the
    # log handle and clear $::LOG_FILE_HANDLE. Always returns $::OK.
    my $log  = $::LOG_FILE_HANDLE;
    my $rule = "---------------------------------------------------------------------\n";

    # Print the date at the bottom of the logfile
    chomp(my $stamp = `/bin/date`);
    print $log $rule, "Logging stopped $stamp.\n", $rule;

    close($log);
    $::LOG_FILE_HANDLE = undef;

    return $::OK;
}

#-------------------------------------------------------------------------------

=head3    healthCheckLinux

          Check Linux nodes
Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.
=cut

#-------------------------------------------------------------------------------
sub healthCheckLinux()
{
    # Run every requested health check against the given Linux nodes.
    # Arguments: $refLnxNodes - reference to the list of Linux node names.
    # Returns:   $::NOK as soon as one check fails (later checks are then
    #            skipped), $::OK when all requested checks succeeded.
    my ($refLnxNodes) = @_;
    my $allnodes = join ", ", @$refLnxNodes;
    print "Checking health for Linux nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking health for Linux nodes: $allnodes...\n";

    # One entry per check, in execution order:
    #   [ was the check requested?, routine to call, extra arguments ]
    my @checks = (
        [ defined($::MIN_CLOCK_SPEED), \&procsrCheckLinux ],
        [ (defined($::MIN_MEMORY) || defined($::MIN_FREELP)), \&memCheckLinux ],
        [ (defined($::IF_CHECK_METHOD) && $::IF_CHECK_METHOD ne ""), \&ifCheck, 'Linux' ],
        [ defined($::HCA), \&HCACheckLinux ],
    );

    foreach my $check (@checks)
    {
        my ($requested, $routine, @extraArgs) = @$check;
        next unless $requested;

        my $rc = $routine->($refLnxNodes, @extraArgs);
        if ($rc != $::OK)
        {
            return $::NOK;
        }
    }

    # Explicit success value instead of whatever the last condition yielded
    return $::OK;
}

#-------------------------------------------------------------------------------

=head3    healthCheckAIX

          Check AIX nodes
Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.
=cut

#-------------------------------------------------------------------------------
sub healthCheckAIX()
{
    # Run every requested health check against the given AIX nodes.
    # Arguments: $refAIXNodes - reference to the list of AIX node names.
    # Returns:   $::NOK as soon as one check fails (later checks are then
    #            skipped), $::OK when all requested checks succeeded.
    my ($refAIXNodes) = @_;
    my $allnodes = join ", ", @$refAIXNodes;
    print "Checking health for AIX nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking health for AIX nodes: $allnodes...\n";

    # One entry per check, in execution order:
    #   [ was the check requested?, routine to call, extra arguments ]
    my @checks = (
        [ defined($::MIN_CLOCK_SPEED), \&procsrCheckAIX ],
        [ (defined($::MIN_MEMORY) || defined($::MIN_FREELP)), \&memCheckAIX ],
        [ (defined($::IF_CHECK_METHOD) && $::IF_CHECK_METHOD ne ""), \&ifCheck, 'AIX' ],
        [ defined($::HCA), \&HCACheckAIX ],
    );

    foreach my $check (@checks)
    {
        my ($requested, $routine, @extraArgs) = @$check;
        next unless $requested;

        my $rc = $routine->($refAIXNodes, @extraArgs);
        if ($rc != $::OK)
        {
            return $::NOK;
        }
    }

    # Explicit success value instead of whatever the last condition yielded
    return $::OK;
}

#-------------------------------------------------------------------------------

=head3    procsrCheckLinux

          Check processor clock speed for Linux nodes
Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.
=cut

#-------------------------------------------------------------------------------
sub procsrCheckLinux()
{
    # Read /proc/cpuinfo from all given Linux nodes in one xdsh call and
    # report every processor whose "clock" value is below
    # $::MIN_CLOCK_SPEED, grouped by node.
    # Returns $::NOK when the xdsh command fails, $::OK otherwise.
    my ($refLnxNodes) = @_;
    my $nodeSpec = join (",", @$refLnxNodes);

    print "\nChecking processor clock speed for nodes: $nodeSpec...\n";
    print $::LOG_FILE_HANDLE "Checking processor clock speed for nodes: $nodeSpec...\n";

    my $checkCmd = "xdsh $nodeSpec \"$::LinuxProcCmd\" 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        my $failure = "Command failed: $checkCmd.\n";
        print $failure;
        print $::LOG_FILE_HANDLE $failure;
        return $::NOK;
    }

    # $host and $cpuIndex carry over from earlier lines: the "processor"
    # line of a CPU comes before its "clock" line in the output.
    my ($host, $cpuIndex);
    my %slowCpusOf;
    for my $line (@output)
    {
        chomp $line;

        # Text before the first colon of the leading token is the node name
        $host = $1 if $line =~ /(\S*):.*/;

        # Remember the processor number for the clock line that follows
        $cpuIndex = $1 if $line =~ /\S+: processor\s*: (\d+)/;

        next unless $line =~ /\S+: clock\s*: (.*)MHz/;
        my $mhz = $1;
        next unless $mhz < $::MIN_CLOCK_SPEED;

        $slowCpusOf{$host} .= "Processor $cpuIndex clock speed is $mhz MHz. UNEXPECTED LOW FREQUENCY.\n";
    }

    my @slowHosts = sort keys %slowCpusOf;
    unless (@slowHosts)
    {
        my $allFine = "The processor clock speed of all nodes is normal.\n";
        print $allFine;
        print $::LOG_FILE_HANDLE $allFine;
        return $::OK;
    }

    for my $slowHost (@slowHosts)
    {
        my $report = "$slowHost:\n$slowCpusOf{$slowHost}";
        print $report;
        print $::LOG_FILE_HANDLE $report;
    }
    return $::OK;
}

#-------------------------------------------------------------------------------

=head3    procsrCheckAIX

	      Check processor clock speed for AIX nodes
Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.
=cut

#-------------------------------------------------------------------------------
sub procsrCheckAIX()
{
    # Run $::AIXProcCmd on the AIX nodes that have it and report every node
    # whose "This machine runs at N MHz" value is below $::MIN_CLOCK_SPEED.
    # Returns $::NOK when no node has the command or the xdsh call fails,
    # $::OK otherwise.
    my ($refAIXNodes) = @_;
    my $requested = join ", ", @$refAIXNodes;
    print "\nChecking processor clock speed for nodes: $requested...\n";
    print $::LOG_FILE_HANDLE "Checking processor clock speed for nodes: $requested...\n";

    # Only query nodes on which the command exists
    my @usable = &checkCmdAvailability($refAIXNodes, $::AIXProcCmd);
    return $::NOK unless scalar(@usable);

    my $nodeSpec = join (",", @usable);
    my $checkCmd = "xdsh $nodeSpec \"$::AIXProcCmd\" 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        my $failure = "Command failed: $checkCmd.\n";
        print $failure;
        print $::LOG_FILE_HANDLE $failure;
        return $::NOK;
    }

    my $host;
    my $slowSeen = 0;
    for my $line (@output)
    {
        chomp $line;

        # Text before the first colon of the leading token is the node name
        $host = $1 if $line =~ /(\S*):.*/;

        next unless $line =~ /\S+:\s*This machine runs at (\d+) MHz/;
        my $mhz = $1;
        next unless $mhz < $::MIN_CLOCK_SPEED;

        $slowSeen = 1;
        my $report = "$host processor clock speed is $mhz MHz. UNEXPECTED LOW FREQUENCY.\n";
        print $report;
        print $::LOG_FILE_HANDLE $report;
    }

    unless ($slowSeen)
    {
        my $allFine = "The processor clock speed of all nodes is normal.\n";
        print $allFine;
        print $::LOG_FILE_HANDLE $allFine;
    }
    return $::OK;
}

#-------------------------------------------------------------------------------

=head3    ifCheck

          Check ib interfaces status
Arguments:
        $refNodes - The reference to all nodes.
        $os - Linux or AIX
=cut

#-------------------------------------------------------------------------------
sub ifCheck()
{
    # Check the IB interface status of the given nodes, using the method
    # selected with -i ($::IF_CHECK_METHOD):
    #   "LL"   - run llstatus -a on the first usable node and parse the
    #            per-node sections of its output;
    #   "RSCT" - run lsrsrc IBM.NetworkInterface on all usable nodes.
    # Arguments: $refNodes - reference to the list of node names.
    #            $os       - 'Linux' or 'AIX' (selects the llstatus path).
    # Returns:   $::NOK for an unknown method or OS, when no node has the
    #            command, or when the xdsh call fails; $::OK otherwise.
    my ($refNodes, $os) = @_;
    my $cmd;
    my $abnormal = 0;
    my $allnodes = join ", ", @$refNodes;

    if ($::IF_CHECK_METHOD eq "LL")
    {
        if ($os eq 'Linux')
        {
            $cmd = $::IBifLLCmdL;
        }
        elsif ($os eq 'AIX')
        {
            $cmd = $::IBifLLCmdA;
        }
        else
        {
            return $::NOK;
        }
    }
    elsif ($::IF_CHECK_METHOD eq "RSCT")
    {
        $cmd = $::IBifRSCTCmd;
    }
    else
    {
        return $::NOK;
    }

    print "\nChecking IB interface status using command $cmd for nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking IB interface status using command $cmd for nodes: $allnodes...\n";

    my @ValidNodes = &checkCmdAvailability($refNodes, $cmd);
    if (!scalar (@ValidNodes))
    {
        return $::NOK;
    }

    my $ValidNodes = join (",", @ValidNodes);

    #Method is "LL"
    if ($::IF_CHECK_METHOD eq "LL")
    {
        #Run the LL command on only one node, this is enough because it will return status of all nodes
        my $checkCmd = "xdsh $ValidNodes[0] \"$cmd -a\" 2>/dev/null";
        print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
        my @output = `$checkCmd`;
        if ($?)
        {
            print "Command failed: $checkCmd.\n";
            print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
            return $::NOK;
        }

        #Record whether the next line is the beginning of a new section
        my $armed = 0;
        my $host;
        #Record the numbers of the abnormal ib interfaces of the current node
        my @ib_array = ();
        foreach my $line (@output)
        {
            chomp $line;
            #The text returned by llstatus is separated into several sections by "=================="
            if ($line =~ /==================/)
            {
                #Report abnormal interfaces found in the section that just ended.
                #Interface numbers are sorted numerically so ib2 comes before ib10.
                _reportDownIBInterfaces($host, map { "ib$_" } sort { $a <=> $b } @ib_array);
                @ib_array = ();
                $armed = 1;
            }
            elsif ( ( $armed ) && ($line =~ /\S+?:\s*(\S+)/) )
            { #The first line of a new section
                #Get node hostname
                $host = $1;
                $armed = 0;
            }
            elsif ( $line =~ /\S+?:\s*ib(\d+).*/ )
            {
                #Get interface number (saved before the next match is tried)
                my $ibnum = $1;
                #Check the status
                if ( ! ($line =~ /READY/) )
                {
                    $abnormal = 1;
                    push @ib_array, $ibnum;
                }
            }
        }
        #Report abnormal interfaces found in the last section
        _reportDownIBInterfaces($host, map { "ib$_" } sort { $a <=> $b } @ib_array);
    }
    #Method is "RSCT"
    elsif ($::IF_CHECK_METHOD eq "RSCT")
    {
        my $checkCmd = "xdsh $ValidNodes \"$::IBifRSCTCmd IBM.NetworkInterface Name OpState\" 2>/dev/null";
        print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
        my @output = `$checkCmd`;
        if ($?)
        {
            print "Command failed: $checkCmd.\n";
            print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
            return $::NOK;
        }
        my $host;
        my $prevhost;
        my $ifname;
        #Record the names of the abnormal ib interfaces of the current node
        my @ib_array = ();
        foreach my $line (@output)
        {
            chomp $line;
            if ($line =~ /(\S*):.*/)
            {
                #Get node hostname
                $host = $1;
                #If the host is not equal prevhost, it indicates this is beginning of a section for another node.
                #If there are abnormal interfaces found after check last node, print them.
                #@ib_array is tested first: it can only be non-empty after
                #$prevhost has been set, so $prevhost is never compared while undefined.
                if (@ib_array && ($host ne $prevhost))
                {
                    _reportDownIBInterfaces($prevhost, sort @ib_array);
                    @ib_array = ();
                }
                $prevhost = $host;
            }
            if ($line =~ /\S*:\s*Name\s*=\s*\"(.*)\"/)
            {
                #Get interface name; it applies to the OpState line that follows
                $ifname = $1;
            }
            if ($line =~ /\S+:\s*OpState\s*=\s*(\d+)/)
            {
                #Check the status: any OpState other than 1 on an ib interface is reported
                my $opstate = $1;
                if ($opstate != 1 && $ifname =~ /ib/)
                {
                    $abnormal = 1;
                    push @ib_array, $ifname;
                }
            }
        }
        #If there are abnormal interfaces found after check the last node, print them.
        _reportDownIBInterfaces($host, sort @ib_array);
    }
    if(!$abnormal)
    {
        print "IB interfaces of all nodes are normal.\n";
        print $::LOG_FILE_HANDLE "IB interfaces of all nodes are normal.\n";
    }
    return $::OK;
}

# Private helper for ifCheck: print "<host> IB interface(s) down: <names>" to
# the console and to the log. Does nothing when no interface names are given.
sub _reportDownIBInterfaces
{
    my ($host, @interfaces) = @_;
    return unless @interfaces;

    # Each name is followed by a single space, then the line is terminated
    my $message = "$host IB interface(s) down: " . join("", map { "$_ " } @interfaces) . "\n";
    print $message;
    print $::LOG_FILE_HANDLE $message;
    return;
}

#-------------------------------------------------------------------------------

=head3    memCheckLinux

          Check memory and free large page for Linux nodes
Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.
=cut

#-------------------------------------------------------------------------------
sub memCheckLinux()
{
    # Read /proc/meminfo from all given Linux nodes in one xdsh call and
    # report nodes whose MemTotal is below $::MIN_MEMORY and/or whose
    # HugePages_Free is below $::MIN_FREELP (each only when requested).
    # Returns $::NOK when the xdsh command fails, $::OK otherwise.
    my ($refLnxNodes) = @_;
    my $nodeSpec = join (",", @$refLnxNodes);

    print "\nChecking memory for nodes: $nodeSpec...\n";
    print $::LOG_FILE_HANDLE "Checking memory for nodes: $nodeSpec...\n";

    my $checkCmd = "xdsh $nodeSpec \"$::LinuxMemCmd\" 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        my $failure = "Command failed: $checkCmd.\n";
        print $failure;
        print $::LOG_FILE_HANDLE $failure;
        return $::NOK;
    }

    my $checkMem = defined($::MIN_MEMORY);
    my $checkFlp = defined($::MIN_FREELP);
    my $lowMemSeen = 0;
    my $lowFlpSeen = 0;

    for my $line (@output)
    {
        chomp $line;

        if ($checkMem && $line =~ /(\S*): MemTotal:\s*(\d+) kB/)
        {
            # Node hostname and total memory in kB
            my ($host, $totalKb) = ($1, $2);

            # NOTE(review): kB are converted to MB with a factor of 1000,
            # not 1024 - confirm this is intended.
            if ($totalKb < ($::MIN_MEMORY * 1000))
            {
                $lowMemSeen = 1;
                my $totalMb = $totalKb / 1000;

                # NOTE(review): the console shows MB while the log shows kB.
                print "$host total memory is $totalMb MB. UNEXPECTED small size.\n";
                print $::LOG_FILE_HANDLE "$host total memory is $totalKb kB. UNEXPECTED small size.\n";
            }
        }

        if ($checkFlp && $line =~ /(\S*): HugePages_Free:\s*(\d+)/)
        {
            # Node hostname and number of free large pages
            my ($host, $freePages) = ($1, $2);
            if ($freePages < $::MIN_FREELP)
            {
                $lowFlpSeen = 1;
                my $report = "$host Free large page is $freePages. UNEXPECTED small size.\n";
                print $report;
                print $::LOG_FILE_HANDLE $report;
            }
        }
    }

    if ($checkMem && !$lowMemSeen)
    {
        my $memFine = "Memory size of all nodes are normal.\n";
        print $memFine;
        print $::LOG_FILE_HANDLE $memFine;
    }
    if ($checkFlp && !$lowFlpSeen)
    {
        my $flpFine = "Free large page number of all nodes are normal.\n";
        print $flpFine;
        print $::LOG_FILE_HANDLE $flpFine;
    }
    return $::OK;
}

#-------------------------------------------------------------------------------

=head3    memCheckAIX

          Check memory and free large page for AIX nodes
Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.
=cut

#-------------------------------------------------------------------------------
sub memCheckAIX()
{
    # Run "$::AIXMemCmd -l" on the AIX nodes that have the command and report
    # nodes whose total memory is below $::MIN_MEMORY (MB) and/or whose free
    # large page count is below $::MIN_FREELP (each only when requested).
    # Returns $::NOK when no node has the command or the xdsh call fails,
    # $::OK otherwise.
    my ($refAIXNodes) = @_;
    my $abnormalmem = 0;
    my $abnormalflp = 0;
    my $allnodes = join ", ", @$refAIXNodes;
    print "\nChecking memory for nodes $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking memory for nodes $allnodes...\n";

    my @ValidAIXNodes = &checkCmdAvailability($refAIXNodes, $::AIXMemCmd);
    if (!scalar (@ValidAIXNodes))
    {
        return $::NOK;
    }
    my $ValidAIXNodes = join (",", @ValidAIXNodes);
    my $checkCmd = "xdsh $ValidAIXNodes \"$::AIXMemCmd -l\" 2>/dev/null";

    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        print "Command failed: $checkCmd.\n";
        print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
        return $::NOK;
    }

    foreach my $line (@output)
    {
        chomp $line;

        # Values extracted from this line, if any
        my ($host, $mem, $freelp);

        if (defined($::MIN_MEMORY) && $line =~ /(\S*):\s+System Configuration: lcpu=(\d+)\s+mem=(\d+)\s*MB/)
        {
            # Get node hostname and memory size
            ($host, $mem) = ($1, $3);
        }
        elsif (defined($::MIN_FREELP) && $line =~ /(\S*):\s*\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)/)
        {
            # Get node hostname and number of free large page
            # (the last captured numeric column of the statistics row)
            ($host, $freelp) = ($1, $2);
        }
        elsif (defined($::MIN_MEMORY) && $line =~ /(\S*):.*lcpu=(\d+)\s+mem=(\d+)MB/)
        {
            # Fallback for any other spelling of the configuration line.
            # It is applied to every node: it must not be skipped merely
            # because an earlier node was already found to be low on
            # memory, otherwise only the first such node is ever reported.
            ($host, $mem) = ($1, $3);
        }

        if (defined($mem) && $mem < $::MIN_MEMORY)
        {
            $abnormalmem = 1;
            print "$host total memory is $mem MB. UNEXPECTED small size\n";
            print $::LOG_FILE_HANDLE "$host total memory is $mem MB. UNEXPECTED small size\n";
        }
        if (defined($freelp) && $freelp < $::MIN_FREELP)
        {
            $abnormalflp = 1;
            print "$host Free large page is $freelp. UNEXPECTED small size\n";
            print $::LOG_FILE_HANDLE "$host Free large page is $freelp. UNEXPECTED small size\n";
        }
    }
    if(!$abnormalmem && defined($::MIN_MEMORY))
    {
        print "Memory size of all nodes are normal.\n";
        print $::LOG_FILE_HANDLE "Memory size of all nodes are normal.\n";
    }
    if(!$abnormalflp && defined($::MIN_FREELP))
    {
        print "Free large page number of all nodes is normal.\n";
        print $::LOG_FILE_HANDLE "Free large page number of all nodes is normal.\n";
    }
    return $::OK;
}

#-------------------------------------------------------------------------------

=head3    HCACheckAIX

          Check HCA status for AIX nodes
Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.
=cut

#-------------------------------------------------------------------------------
sub HCACheckAIX()
{
    # Check the InfiniBand HCA port status of a group of AIX nodes.
    #
    # Runs "$::AIXHCACmd -v" on the nodes through xdsh and parses the
    # filtered output line by line. For every port that is not listed in
    # @IgnoreList it reports the logical port state, physical port state,
    # physical port physical state, speed and width, on stdout and in
    # $::LOG_FILE_HANDLE.
    #
    # Arguments:
    #     $refAIXNodes - reference to the array of AIX node names.
    # Returns:
    #     $::NOK - checkCmdAvailability returned no node, or the xdsh
    #              command exited with a non-zero status.
    #     $::OK  - the output was parsed. Note that this is returned even
    #              when unexpected states were found; those are only
    #              reported through the printed messages.
    my ($refAIXNodes) = @_;
    my @ValidAIXNodes;
    my $abnormal = 0;
    my $allnodes = join ", ", @$refAIXNodes;
    print "\nChecking HCA status for nodes $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking HCA status for nodes $allnodes...\n";

    # Presumably keeps only the nodes on which the command exists;
    # checkCmdAvailability is defined elsewhere in this file.
    @ValidAIXNodes = &checkCmdAvailability($refAIXNodes, $::AIXHCACmd);

    if (!scalar (@ValidAIXNodes))
    {
        return $::NOK;
    }
    my $ValidAIXNodes = join (",", @ValidAIXNodes);
    my $checkCmd = "xdsh $ValidAIXNodes \"$::AIXHCACmd -v | egrep 'IB PORT.*INFO|Port State:|Physical Port'\" 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        print "Command failed: $checkCmd.\n";
        print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
        return $::NOK;
    }

    # Parser state. $host, $if_id and $ignore are set by the host prefix
    # and the "IB PORT" header line, and apply to the attribute lines
    # that follow until the next header.
    my $host;
    my $hca_id;
    my $port;
    my $if_id;
    my $ignore = 0;
    foreach my $line (@output)
    {
        chomp $line;
        #Get host name
        if ($line =~ /(\S*): (.*)/)
        {
            $host = $1;
        }
        #Get HCA ID and port number
        if ($line =~ /\S*:\s*IB PORT (\d+) INFORMATION \(iba(\d+)\)/)
        {
            $port = $1;
            $hca_id = $2;
            #Calculate interface number according to HCA ID and port number
            #(iba0 port 1 -> ib0, iba0 port 2 -> ib1, iba1 port 1 -> ib2, ...)
            $if_id = $hca_id * 2 + $port - 1;
            #If the interface is in ignore list
            if ( grep {$_ eq "ib$if_id"} @IgnoreList )
            {
                $ignore = 1;
            }
            else
            {
                $ignore = 0;
            }
            next;
        }
        #Check Logical Port State
        if ($line =~ /\S*:\s*Logical Port State:\s*(\w+)/ && $ignore == 0)
        {
            my $state = $1;
            if ($state ne "Active")
            {
                $abnormal = 1;
                print "$host: ib$if_id Logical Port State is $state. UNEXPECTED STATE.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id Logical Port State is $state. UNEXPECTED STATE.\n";
            }
            else
            {
                print "$host: ib$if_id Logical Port State is $state.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id Logical Port State is $state.\n";
            }
            next;
        }
        #Check Physical Port State
        if ($line =~ /\S*:\s*Physical Port State:\s*(\w+)/ && $ignore == 0)
        {
            my $state = $1;
            if ($state ne "Active")
            {
                $abnormal = 1;
                print "$host: ib$if_id Physical Port State is $state. UNEXPECTED STATE.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id Physical Port State is $state. UNEXPECTED STATE.\n";
            }
            else
            {
                print "$host: ib$if_id Physical Port State is $state.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id Physical Port State is $state.\n";
            }
            next;
        }
        #Check Physical Port Physical State
        #The value may contain a blank ("Link Up"), so capture the rest of
        #the line and drop trailing whitespace. The previous pattern used
        #"\.+" (literal dots) and therefore never matched.
        if ($line =~ /\S*:\s*Physical Port Physical State:\s*(.+?)\s*$/ && $ignore == 0)
        {
            my $physState = $1;
            if ($physState ne "Link Up")
            {
                $abnormal = 1;
                print "$host: ib$if_id Physical Port Physical State is $physState. UNEXPECTED STATE.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id Physical Port Physical State is $physState. UNEXPECTED STATE.\n";
            }
            else
            {
                print "$host: ib$if_id Physical Port Physical State is $physState.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id Physical Port Physical State is $physState.\n";
            }
            next;
        }
        #Check speed
        if ($line =~ /\S*:\s*Physical Port Speed:\s*(.+)\s*G/ && $ignore == 0)
        {
            my $speed = $1;
            if (!defined($::HCASPEED))
            {
                #No expected speed was given: report only
                print "$host: ib$if_id speed is ${speed}Gbps. \n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id speed is ${speed}Gbps.\n";
            }
            elsif ($speed != $::HCASPEED)
            {
                $abnormal = 1;
                print "$host: ib$if_id speed is ${speed}Gbps. UNEXPECTED SPEED.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id speed is ${speed}Gbps. UNEXPECTED SPEED.\n";
            }
            else
            {
                print "$host: ib$if_id speed is ${speed}Gbps. \n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id speed is ${speed}Gbps.\n";
            }
            next;
        }
        #Check width
        if ($line =~ /\S*:\s*Physical Port Width:\s*(\w+)/ && $ignore == 0)
        {
            my $width = $1;
            #An undefined or empty expected width means "report only"
            if (!defined($::WIDTH) || $::WIDTH eq "")
            {
                print "$host: ib$if_id Physical Port Width is $width.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id Physical Port Width is $width.\n";
            }
            elsif ($width ne $::WIDTH)
            {
                $abnormal = 1;
                print "$host: ib$if_id Physical Port Width is $width. UNEXPECTED WIDTH.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id Physical Port Width is $width. UNEXPECTED WIDTH.\n";
            }
            else
            {
                print "$host: ib$if_id Physical Port Width is $width.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id Physical Port Width is $width.\n";
            }
            next;
        }
    }
    #All are normal
    if(!$abnormal)
    {
        print "HCA status of all nodes is normal.\n";
        print $::LOG_FILE_HANDLE "HCA status of all nodes is normal.\n";
    }
    else
    {
        print "HCA status of all nodes contains unexpected result.\n";
        print $::LOG_FILE_HANDLE "HCA status of all nodes contains unexpected result.\n";
    }
    return $::OK;
}

#-------------------------------------------------------------------------------

=head3    HCACheckLinux

          Check HCA status for Linux nodes
Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.
=cut

#-------------------------------------------------------------------------------
sub HCACheckLinux()
{
    # Check the InfiniBand HCA port status of a group of Linux nodes.
    #
    # Runs "$::LinuxHCACmd -v" on the nodes through xdsh and parses the
    # filtered output line by line. For every port that is not listed in
    # @IgnoreList it reports state, active_width, active_speed and
    # phys_state, on stdout and in $::LOG_FILE_HANDLE.
    #
    # Arguments:
    #     $refLnxNodes - reference to the array of Linux node names.
    # Returns:
    #     $::NOK - checkCmdAvailability returned no node, or the xdsh
    #              command exited with a non-zero status.
    #     $::OK  - the output was parsed. Note that this is returned even
    #              when unexpected states were found; those are only
    #              reported through the printed messages.
    my ($refLnxNodes) = @_;
    my @ValidLnxNodes;
    my $abnormal = 0;
    my $allnodes = join ", ", @$refLnxNodes;
    print "\nChecking HCA status for nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking HCA status for nodes: $allnodes...\n";

    # Presumably keeps only the nodes on which the command exists;
    # checkCmdAvailability is defined elsewhere in this file.
    @ValidLnxNodes = &checkCmdAvailability($refLnxNodes, $::LinuxHCACmd);

    if (!scalar (@ValidLnxNodes))
    {
        return $::NOK;
    }
    my $ValidLnxNodes = join (",", @ValidLnxNodes);
    my $checkCmd = "xdsh $ValidLnxNodes \"$::LinuxHCACmd -v | egrep 'ehca|port:|state:|width:|speed:'\" 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        print "Command failed: $checkCmd.\n";
        print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
        return $::NOK;
    }

    # Parser state. $hca_id comes from the "hca_id:" line; $if_id and
    # $ignore are set by the following "port:" line and apply to the
    # attribute lines until the next "port:" line.
    my $host;
    my $hca_id;
    my $port;
    my $if_id;
    my $ignore = 0;
    foreach my $line (@output)
    {
        chomp $line;
        #Get host name
        if ($line =~ /(\S*): (.*)/)
        {
            $host = $1;
        }
        #Get HCA ID
        if ($line =~ /\S*:\s*hca_id:\s*ehca(\d+)/)
        {
            $hca_id = $1;
            next;
        }
        #Get port number
        if ($line =~ /\S*:\s*port:\s*(\d+)/)
        {
            #Calculate interface number according to HCA ID and port number
            #(ehca0 port 1 -> ib0, ehca0 port 2 -> ib1, ehca1 port 1 -> ib2, ...)
            $port = $1;
            $if_id = $hca_id * 2 + $port - 1;
            #If the interface is in ignore list
            if ( grep {$_ eq "ib$if_id"} @IgnoreList )
            {
                $ignore = 1;
            }
            else
            {
                $ignore = 0;
            }
            next;
        }
        #Check state
        #The pattern requires "state:" directly after a colon and blanks,
        #so "phys_state:" lines are not matched here.
        if ($line =~ /\S*:\s*state:\s*(\w+)\s*\(.*/ && $ignore == 0)
        {
            my $state = $1;
            if ($state ne "PORT_ACTIVE" && $state ne "active")
            {
                $abnormal = 1;
                print "$host: ib$if_id state is $state. UNEXPECTED STATE.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id state is $state. UNEXPECTED STATE.\n";
            }
            else
            {
                print "$host: ib$if_id state is $state.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id state is $state.\n";
            }
            next;
        }
        #Check width
        if ($line =~ /\S*:\s*active_width:\s*(\w+)\s*\(.*/ && $ignore == 0)
        {
            my $width = $1;
            #An undefined or empty expected width means "report only"
            if (!defined($::WIDTH) || $::WIDTH eq "")
            {
                print "$host: ib$if_id active_width is $width.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id active_width is $width.\n";
            }
            elsif ($width ne $::WIDTH)
            {
                $abnormal = 1;
                print "$host: ib$if_id active_width is $width. UNEXPECTED WIDTH.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id active_width is $width. UNEXPECTED WIDTH.\n";
            }
            else
            {
                print "$host: ib$if_id active_width is $width.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id active_width is $width.\n";
            }
            next;
        }
        #Check speed
        if ($line =~ /\S*:\s*active_speed:\s*(.+)\s*Gbps.*/ && $ignore == 0)
        {
            my $speed = $1;
            if (!defined($::HCASPEED))
            {
                #No expected speed was given: report only
                print "$host: ib$if_id active_speed is ${speed}Gbps.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id active_speed is ${speed}Gbps.\n";
            }
            elsif ($speed != $::HCASPEED)
            {
                $abnormal = 1;
                print "$host: ib$if_id active_speed is ${speed}Gbps. UNEXPECTED SPEED.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id active_speed is ${speed}Gbps. UNEXPECTED SPEED.\n";
            }
            else
            {
                print "$host: ib$if_id active_speed is ${speed}Gbps.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id active_speed is ${speed}Gbps.\n";
            }
            next;
        }
        #Check Physical State
        if ($line =~ /\S*:\s*phys_state:\s*(\w+)\s*\(.*/ && $ignore == 0)
        {
            my $physState = $1;
            if ($physState ne "LINK_UP")
            {
                $abnormal = 1;
                print "$host: ib$if_id phys_state is $physState. UNEXPECTED STATE.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id phys_state is $physState. UNEXPECTED STATE.\n";
            }
            else
            {
                print "$host: ib$if_id phys_state is $physState.\n";
                print $::LOG_FILE_HANDLE "$host: ib$if_id phys_state is $physState. \n";
            }
            next;
        }
    }
    #All are normal
    if(!$abnormal)
    {
        print "HCA status of all nodes is normal.\n";
        print $::LOG_FILE_HANDLE "HCA status of all nodes is normal.\n";
    }
    else
    {
        print "HCA status of all nodes contains unexpected result.\n";
        #Keep this literal on one line: a line break inside the quotes
        #would be written to the log as part of the message.
        print $::LOG_FILE_HANDLE "HCA status of all nodes contains unexpected result.\n";
    }
    return $::OK;
}