mirror of
				https://github.com/xcat2/xcat-core.git
				synced 2025-11-04 05:12:30 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			1432 lines
		
	
	
		
			43 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
			
		
		
	
	
			1432 lines
		
	
	
		
			43 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
#!/usr/bin/perl
 | 
						|
# IBM(c) 2008 EPL license http://www.eclipse.org/legal/epl-v10.html
 | 
						|
 | 
						|
###########################################################################
 | 
						|
#                                                                         #
 | 
						|
# Command: healthCheck                                                    #
 | 
						|
#                                                                         #
 | 
						|
#-------------------------------------------------------------------------#
 | 
						|
#    This script is used to check the system health for both AIX and
 | 
						|
#    Linux Managed Nodes on Power6 platforms. It will use xdsh to access
 | 
						|
#    the target nodes, and check the status for processor clock speed,
 | 
						|
#    IB interfaces, memory, large page configuration and HCA status.
 | 
						|
#    If xdsh is unreachable, an error message will be given.
 | 
						|
# Command Syntax:
 | 
						|
#    healthCheck { [-n node_list] [-M]}
 | 
						|
#                {[-p min_clock_speed] [-i method] [-m min_memory]
 | 
						|
#                [-l min_freelp] [ -H [--speed speed --ignore interface_list --width width]]}
 | 
						|
#                [ -h ]
 | 
						|
#
 | 
						|
#        -M          Check status for all the Managed Nodes that are defined on this MN.
 | 
						|
#        -n node_list
 | 
						|
#                    Specifies a comma-separated list of node host names, IP addresses for health check.
 | 
						|
#        -p min_clock_speed
 | 
						|
#                    Specifies the minimal processor clock speed in MHz for processor monitor.
 | 
						|
#        -i method
 | 
						|
#                    Specifies the method to do Infiniband interface status check, the supported
 | 
						|
#                    check methods are LL and RSCT.
 | 
						|
#        -m min_memory
 | 
						|
#                    Specifies the minimal total memory in MB.
 | 
						|
#        -l min_freelp
 | 
						|
#                    Specifies the minimal free large page number.
 | 
						|
#        -H          Check the status for HCAs.
 | 
						|
#        --speed speed
 | 
						|
#                    Specifies the physical port speed in G bps, it should be used with -H flag.
 | 
						|
#        --ignore interface_list
 | 
						|
#                    Specifies a comma-separated list of interface name to ignore from HCA status check,
 | 
						|
#                    such as ib0,ib1. It should be used with -H flag.
 | 
						|
#        --width width
 | 
						|
#                    Specifies the physical port width, such as 4X or 12X. It should be used with -H flag.
 | 
						|
#        -h          Display usage information.
 | 
						|
# Exit codes:
 | 
						|
#     0 - success
 | 
						|
#     1 - fail
 | 
						|
###########################################################################
 | 
						|
 | 
						|
use strict;
 | 
						|
use Getopt::Long;
 | 
						|
 | 
						|
# Where the healthCheck log lives
$::HEALTHCHECK_PATH = "/var/log/xcat";
$::HEALTHCHECK_LOG  = "$::HEALTHCHECK_PATH/healthCheck.log";

# Return codes and run-state flags
$::OK          = 0;
$::NOK         = 1;
$::logging     = 0;    # incremented by the main section after append_logging is called
$::GLOBAL_EXIT = 0;    # the END block raises $? to this value on exit

# Command run on the management node to list nodes and their attributes
$::NODELS = "/opt/xcat/bin/nodels";

# Commands run on Linux nodes
$::LinuxProcCmd = "cat /proc/cpuinfo";
$::LinuxMemCmd  = "cat /proc/meminfo";
$::LinuxHCACmd  = "/usr/bin/ibv_devinfo";
$::IBifLLCmdL   = "/opt/ibmll/LoadL/full/bin/llstatus";

# Commands run on AIX nodes
$::AIXProcCmd = "/usr/pmapi/tools/pmcycles";
$::AIXMemCmd  = "/usr/bin/vmstat";
$::AIXHCACmd  = "/usr/bin/ibstat";
$::IBifLLCmdA = "/usr/lpp/LoadL/full/bin/llstatus";

# Command used for the RSCT interface check on either operating system
$::IBifRSCTCmd = "/usr/bin/lsrsrc";

# Nodes to be checked: everything requested, then split by operating system
my (@NodeList, @LnxNodeList, @AIXNodeList);

# ib interfaces named with --ignore, to be skipped by the HCA status check
my @IgnoreList;
 | 
						|
 | 
						|
# MAIN Main main#
# Append logging information to healthCheck.log.
# Everything below writes to $::LOG_FILE_HANDLE, so give up right away when
# the log cannot be opened (append_logging has already printed the reason).
if (&append_logging($::HEALTHCHECK_LOG) != $::OK)
{
    $::GLOBAL_EXIT = $::NOK;
    exit;
}
$::logging++;

&getArgs();

my @ReachableNodes;
my @UnreachableNodes;
if (scalar(@NodeList) > 0)
{
    # Check if xdsh is reachable
    foreach my $node (@NodeList)
    {
        if (&checkDshReachability($node) == $::OK)
        {
            push @ReachableNodes, $node;
        }
        else
        {
            push @UnreachableNodes, $node;
        }
    }

    if (scalar(@UnreachableNodes))
    {
        my $warning =
            "Warning: The xdsh is unreachable for the node(s): "
          . join(", ", @UnreachableNodes)
          . ". Run updatenode to configure xdsh for that node(s).\n";
        print $warning;
        print $::LOG_FILE_HANDLE $warning;
    }
}
else
{
    print "Error: No hosts in node list\n";
    print $::LOG_FILE_HANDLE "Error: No hosts in node list\n";
    $::GLOBAL_EXIT = $::NOK;
    exit;
}

# Divide the nodes into two groups: Linux nodes and AIX nodes.
# The OS of each node is looked up by its exact name; matching the node name
# as a pattern against whole output lines would let "n1" match "n10" and
# would let a node *name* containing "AIX" or "rhel" decide the group.
my $osQuery = "$::NODELS all nodetype.os";
print $::LOG_FILE_HANDLE "Running command: $osQuery\n";
my %osOf;
foreach my $entry (`$osQuery`)
{
    chomp $entry;

    # Each line is "<node>: <value>"; keep everything after the first colon
    if ($entry =~ /^(\S+?):\s*(.*)$/)
    {
        $osOf{$1} = $2;
    }
}

my @UnclassifiedNodes;
foreach my $node (@ReachableNodes)
{
    my $os = defined($osOf{$node}) ? $osOf{$node} : "";
    my $known = 0;
    if ($os =~ /sles|rhel/)
    {
        push @LnxNodeList, $node;
        $known = 1;
    }
    if ($os =~ /AIX/)
    {
        push @AIXNodeList, $node;
        $known = 1;
    }
    push @UnclassifiedNodes, $node unless ($known);
}

# Say so when a reachable node will not be checked at all
if (scalar(@UnclassifiedNodes))
{
    my $warning =
        "Warning: The operating system of the node(s): "
      . join(", ", @UnclassifiedNodes)
      . " is not recognized as sles, rhel or AIX. Skipping health check for that node(s).\n";
    print $warning;
    print $::LOG_FILE_HANDLE $warning;
}

# Do health check for Linux nodes and AIX nodes
if (scalar(@LnxNodeList))
{
    my $rc = &healthCheckLinux(\@LnxNodeList);
    if ($rc)
    {
        $::GLOBAL_EXIT = $rc;
        exit;
    }
}
if (scalar(@AIXNodeList))
{
    my $rc = &healthCheckAIX(\@AIXNodeList);
    if ($rc)
    {
        $::GLOBAL_EXIT = $rc;
        exit;
    }
}

# Finish up and exit
END
{
    # Only close the log when it was actually opened
    if ($::logging)
    {
        &stop_logging();
    }

    # Determine exit code: never lower an exit status that is already set
    if ($::GLOBAL_EXIT > $?)
    {
        $? = $::GLOBAL_EXIT;
    }
}
exit;    # end of Main
 | 
						|
 | 
						|
#--------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3  getArgs

        Parse the command line and check the values.

        Stores the options in the $::HELP, $::ALLMANAGED, $::NODE_LIST,
        $::MIN_CLOCK_SPEED, $::IF_CHECK_METHOD, $::MIN_MEMORY, $::MIN_FREELP,
        $::HCA, $::HCASPEED, $::IGNORE_LIST and $::WIDTH globals, expands the
        node range into @NodeList by running nodels, and splits the --ignore
        value into @IgnoreList.

        The sub does not return on -h or on any error: it sets
        $::GLOBAL_EXIT and exits. Invalid options print the usage text first.

=cut

#--------------------------------------------------------------------------------
sub getArgs()
{
    # Print an optional message followed by the usage text, then exit with
    # a failure status.
    my $rejectArgs = sub {
        my ($message) = @_;
        print $message if (defined($message));
        &usage();
        $::GLOBAL_EXIT = $::NOK;
        exit;
    };

    # Report a failed nodels run on the screen and in the log, then exit
    # with a failure status.
    my $nodelsFailed = sub {
        my ($command) = @_;
        print "Error: Failed to run command: $command\n";
        print $::LOG_FILE_HANDLE "Running command: $command fail\n";
        $::GLOBAL_EXIT = $::NOK;
        exit;
    };

    # -M and -m, -H and -h are different options
    $Getopt::Long::ignorecase = 0;
    my $rc = GetOptions(
        'h'        => \$::HELP,
        'M'        => \$::ALLMANAGED,
        'n=s'      => \$::NODE_LIST,
        'p=f'      => \$::MIN_CLOCK_SPEED,
        'i=s'      => \$::IF_CHECK_METHOD,
        'm=f'      => \$::MIN_MEMORY,
        'l=i'      => \$::MIN_FREELP,
        'H'        => \$::HCA,
        'speed=f'  => \$::HCASPEED,
        'ignore=s' => \$::IGNORE_LIST,
        'width=s'  => \$::WIDTH
    );
    if ($::HELP)
    {
        &usage();
        $::GLOBAL_EXIT = $::OK;
        exit;
    }

    # There are syntax errors in parameters
    $rejectArgs->() if (!$rc);

    if ($::IF_CHECK_METHOD && $::IF_CHECK_METHOD ne "LL" && $::IF_CHECK_METHOD ne "RSCT")
    {
        $rejectArgs->("The method to check IB interface must be \"LL\" or \"RSCT\".\n");
    }

    if (   !defined($::MIN_CLOCK_SPEED)
        && !defined($::IF_CHECK_METHOD)
        && !defined($::MIN_MEMORY)
        && !defined($::MIN_FREELP)
        && !defined($::HCA))
    {
        $rejectArgs->("There is nothing to check. Please specify what you want to check.\n");
    }

    my $haveIgnore = (defined($::IGNORE_LIST) && $::IGNORE_LIST ne "");
    my $haveWidth  = (defined($::WIDTH)       && $::WIDTH ne "");
    if (!defined($::HCA) && (defined($::HCASPEED) || $haveIgnore || $haveWidth))
    {
        $rejectArgs->("The flag --speed, --ignore or --width should be used with -H flag.\n");
    }

    if ($haveWidth && $::WIDTH !~ /^(\d+)X$/)
    {
        $rejectArgs->("The parameter of --width should be 4X, 12X, etc.\n");
    }

    #Parse the node list
    if ($::NODE_LIST)
    {
        # Single-quote the node range so the shell hands it to nodels
        # unchanged (brackets, spaces, etc. are not interpreted by the shell).
        (my $quotedRange = $::NODE_LIST) =~ s/'/'\\''/g;
        my $listCmd = "$::NODELS '$quotedRange'";
        @NodeList = `$listCmd`;
        my $status = $?;
        chomp @NodeList;
        $nodelsFailed->($listCmd) if ($status);
        print $::LOG_FILE_HANDLE "Running command: $listCmd\n";
    }

    #If -M is set, use every node that nodels knows about (overrides -n)
    if ($::ALLMANAGED)
    {
        my @templist = `$::NODELS`;
        my $status   = $?;
        chomp @templist;
        $nodelsFailed->($::NODELS) if ($status);
        print $::LOG_FILE_HANDLE "Running command: $::NODELS\n";
        @NodeList = @templist;
    }

    if ($::IGNORE_LIST)
    {
        @IgnoreList = split /,/, $::IGNORE_LIST;
    }
}
 | 
						|
 | 
						|
#--------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3  usage

        Print the command syntax and the description of every flag of
        healthCheck to standard output.

=cut

#--------------------------------------------------------------------------------
sub usage()
{
    # The text is literal: nothing in it is interpolated.
    my $helpText = <<'END_OF_USAGE';
Usage: healthCheck { [-n node_list] [-M]}
                    {[-p min_clock_speed] [-i method] [-m min_memory]
                    [-l min_freelp] [ -H [--speed speed --ignore interface_list --width width]]}
                    [ -h ]

        -M          Check status for all the Managed Nodes that are defined on this MS.
        -n node_list
                    Specifies a comma-separated list of node host names, IP addresses for health check.
        -p min_clock_speed
                    Specifies the minimal processor clock speed in MHz for processor monitor.
        -i method
                    Specifies the method to do Infiniband interface status check, the supported
                    check methods are LL and RSCT.
        -m min_memory
                    Specifies the minimal total memory in MB.
        -l min_freelp
                    Specifies the minimal free large page number.
        -H          Check the status for HCAs.
        --speed speed
                    Specifies the physical port speed in G bps, it should be used with -H flag.
        --ignore interface_list
                    Specifies a comma-separated list of interface name to ignore from HCA status check,
                    such as ib0,ib1. It should be used with -H flag.
        --width width
                    Specifies the physical port width, such as 4X or 12X. It should be used with -H flag.
        -h          Display usage information.
END_OF_USAGE
    print $helpText;
}
 | 
						|
 | 
						|
#--------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3   checkDshReachability

        Notes: Check the xdsh reachability between the Management Server
               and node by running "date" on the node through xdsh.

Arguments:
        $node - the remote node hostname.

Returns:
        $::OK - The remote node is reachable through xdsh.
        $::NOK - The remote node is unreachable through xdsh.

=cut

#--------------------------------------------------------------------------------
sub checkDshReachability
{
    # No "()" prototype here: the sub takes an argument, and an empty
    # prototype would reject any call written without a leading "&".
    my ($node) = @_;

    my $probeCmd = "xdsh $node date 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $probeCmd\n";

    # Only the exit status matters; the output of "date" is discarded
    `$probeCmd`;
    my $status = $?;

    return ($status == 0) ? $::OK : $::NOK;
}
 | 
						|
 | 
						|
#--------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3   checkCmdAvailability

        Check availability of some command on the nodes by running
        "ls <command>" on each node through xdsh, one node at a time.
        A warning naming the nodes where the command is missing is printed
        and logged.

Arguments:
        $refNodes - The reference to a group of nodes.
        $cmd - The command that will be checked.
Returns:
        @ValidNodes - A group of nodes that the command is available on these nodes.

=cut

#--------------------------------------------------------------------------------
sub checkCmdAvailability
{
    my ($refNodes, $cmd) = @_;
    my (@available, @missing);

    for my $target (@$refNodes)
    {
        # "ls" succeeds only when the command file exists on the node
        my $probe = "xdsh $target ls $cmd 2>/dev/null";
        `$probe`;
        print $::LOG_FILE_HANDLE "Running command: $probe\n";

        # $? still holds the status of the xdsh run above
        my $bucket = ($? == $::OK) ? \@available : \@missing;
        push @$bucket, $target;
    }

    if (@missing)
    {
        my $warning =
            "Warning: The command $cmd is not available on the node(s): "
          . join(", ", @missing) . ".\n";
        print $warning;
        print $::LOG_FILE_HANDLE $warning;
    }

    return @available;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3    append_logging

        Open a logfile for appending, creating the file and its directory
        when they do not exist yet, and write a "Logging started" banner
        to it. The open handle is stored in $::LOG_FILE_HANDLE.

Arguments:
        $logfile - The path of the logfile.

Returns:
        $::OK - The logfile is open and the banner has been written.
        $::NOK - The logfile could not be opened; a message has been printed.

=cut

#-------------------------------------------------------------------------------
sub append_logging
{
    # No "()" prototype here: the sub takes an argument, and an empty
    # prototype would reject any call written without a leading "&".
    my ($logfile) = @_;

    require File::Basename;
    require File::Path;

    # Create the directory of the file we were asked to open
    my $logdir = File::Basename::dirname($logfile);
    if (!-d $logdir)
    {
        # mkpath croaks on failure; the open below reports the problem
        eval { File::Path::mkpath($logdir) };
    }

    # Append mode also creates a missing file, so one open covers both cases.
    # The LOGFILE glob is kept so that the handle stays reachable under the
    # same name as before.
    my $existed = -e $logfile;
    unless (open(LOGFILE, '>>', $logfile))
    {
        if ($existed)
        {
            print "Can't update file  \"$logfile\".\n";
        }
        else
        {
            print "Can't open file \"$logfile\" for writing.\n";
        }
        return $::NOK;
    }

    $::LOG_FILE_HANDLE = \*LOGFILE;

    # Print the date to the top of the logfile
    my $sdate = `/bin/date`;
    chomp $sdate;
    print "Output log is being written to \"$logfile\".\n";

    print $::LOG_FILE_HANDLE
      "---------------------------------------------------------------------\n";
    print $::LOG_FILE_HANDLE "Logging started $sdate.\n";
    print $::LOG_FILE_HANDLE
      "---------------------------------------------------------------------\n";

    return $::OK;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3    stop_logging

        Turn off message logging: write a "Logging stopped" banner to
        $::LOG_FILE_HANDLE, close it and reset it to undef. Nothing is done
        when no log is open.

Returns:
        $::OK

=cut

#-------------------------------------------------------------------------------
sub stop_logging()
{
    # The END block may call this although the log could never be opened;
    # printing to an undefined handle would die.
    if (!defined($::LOG_FILE_HANDLE))
    {
        return $::OK;
    }

    # Print the date at the bottom of the logfile
    my $sdate = `/bin/date`;
    chomp $sdate;
    print $::LOG_FILE_HANDLE
      "---------------------------------------------------------------------\n";
    print $::LOG_FILE_HANDLE "Logging stopped $sdate.\n";
    print $::LOG_FILE_HANDLE
      "---------------------------------------------------------------------\n";

    close($::LOG_FILE_HANDLE);
    $::LOG_FILE_HANDLE = undef;

    return $::OK;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3    healthCheckLinux

          Check Linux nodes. Runs, in this order, the checks that were
          requested on the command line: processor clock speed, memory and
          large pages, IB interface status, HCA status. Stops at the first
          check that fails.

Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.

Returns:
        $::OK - Every requested check returned $::OK.
        $::NOK - A check did not return $::OK.

=cut

#-------------------------------------------------------------------------------
sub healthCheckLinux
{
    # No "()" prototype here: the sub takes an argument, and an empty
    # prototype would reject any call written without a leading "&".
    my ($refLnxNodes) = @_;
    my $allnodes = join ", ", @$refLnxNodes;
    print "Checking health for Linux nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking health for Linux nodes: $allnodes...\n";

    if (defined($::MIN_CLOCK_SPEED))
    {
        return $::NOK if (procsrCheckLinux($refLnxNodes) != $::OK);
    }
    if (defined($::MIN_MEMORY) || defined($::MIN_FREELP))
    {
        return $::NOK if (memCheckLinux($refLnxNodes) != $::OK);
    }
    if (defined($::IF_CHECK_METHOD) && $::IF_CHECK_METHOD ne "")
    {
        return $::NOK if (ifCheck($refLnxNodes, 'Linux') != $::OK);
    }
    if (defined($::HCA))
    {
        return $::NOK if (HCACheckLinux($refLnxNodes) != $::OK);
    }

    # Explicit success value: the caller treats any true value as a failure
    return $::OK;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3    healthCheckAIX

          Check AIX nodes. Runs, in this order, the checks that were
          requested on the command line: processor clock speed, memory and
          large pages, IB interface status, HCA status. Stops at the first
          check that fails.

Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.

Returns:
        $::OK - Every requested check returned $::OK.
        $::NOK - A check did not return $::OK.

=cut

#-------------------------------------------------------------------------------
sub healthCheckAIX
{
    # No "()" prototype here: the sub takes an argument, and an empty
    # prototype would reject any call written without a leading "&".
    my ($refAIXNodes) = @_;
    my $allnodes = join ", ", @$refAIXNodes;
    print "Checking health for AIX nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking health for AIX nodes: $allnodes...\n";

    if (defined($::MIN_CLOCK_SPEED))
    {
        return $::NOK if (procsrCheckAIX($refAIXNodes) != $::OK);
    }
    if (defined($::MIN_MEMORY) || defined($::MIN_FREELP))
    {
        return $::NOK if (memCheckAIX($refAIXNodes) != $::OK);
    }
    if (defined($::IF_CHECK_METHOD) && $::IF_CHECK_METHOD ne "")
    {
        return $::NOK if (ifCheck($refAIXNodes, 'AIX') != $::OK);
    }
    if (defined($::HCA))
    {
        return $::NOK if (HCACheckAIX($refAIXNodes) != $::OK);
    }

    # Explicit success value: the caller treats any true value as a failure
    return $::OK;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3    procsrCheckLinux

          Check processor clock speed for Linux nodes. Runs
          $::LinuxProcCmd on all nodes with one xdsh call and reports every
          processor whose "clock" value is below $::MIN_CLOCK_SPEED (MHz).
          "Normal" is only reported when at least one clock value was read.

Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.

Returns:
        $::OK

=cut

#-------------------------------------------------------------------------------
sub procsrCheckLinux
{
    # No "()" prototype here: the sub takes an argument, and an empty
    # prototype would reject any call written without a leading "&".
    my ($refLnxNodes) = @_;
    my $ValidLnxNodes = join(",", @$refLnxNodes);

    print "\nChecking processor clock speed for nodes: $ValidLnxNodes...\n";
    print $::LOG_FILE_HANDLE "Checking processor clock speed for nodes: $ValidLnxNodes...\n";

    my $checkCmd = "xdsh $ValidLnxNodes \"$::LinuxProcCmd\" 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        print "Command failed: $checkCmd.\n";
        print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
    }

    my %procOf;          # host => processor number last seen for that host
    my %result;          # host => report lines for processors that are too slow
    my $readings = 0;    # number of clock values that could be read
    foreach my $line (@output)
    {
        chomp $line;

        # Every line is prefixed with "<host>: ". The processor number is
        # remembered per host so that interleaved output of several nodes
        # cannot attach a clock value to the wrong processor.
        if ($line =~ /^(\S+): processor\s*: (\d+)/)
        {
            $procOf{$1} = $2;
        }
        elsif ($line =~ /^(\S+): clock\s*: (.*)MHz/)
        {
            my ($host, $clspeed) = ($1, $2);
            $readings++;
            if ($clspeed < $::MIN_CLOCK_SPEED)
            {
                my $pcrnum = defined($procOf{$host}) ? $procOf{$host} : "unknown";
                $result{$host} = "" unless (defined($result{$host}));
                $result{$host} .= "Processor $pcrnum clock speed is $clspeed MHz. UNEXPECTED LOW FREQUENCY.\n";
            }
        }
    }

    if (%result)
    {
        foreach my $h (sort keys %result)
        {
            print "$h:\n$result{$h}";
            print $::LOG_FILE_HANDLE "$h:\n$result{$h}";
        }
    }
    elsif ($readings)
    {
        print "The processor clock speed of all nodes is normal.\n";
        print $::LOG_FILE_HANDLE "The processor clock speed of all nodes is normal.\n";
    }
    else
    {
        # Nothing was read (command failed or printed no clock lines):
        # do not claim that the nodes are healthy.
        print "Warning: No processor clock speed information was returned by the nodes.\n";
        print $::LOG_FILE_HANDLE "Warning: No processor clock speed information was returned by the nodes.\n";
    }
    return $::OK;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3    procsrCheckAIX

          Check processor clock speed for AIX nodes. Runs $::AIXProcCmd,
          with one xdsh call, on the nodes where that command is available
          and reports every node whose "This machine runs at N MHz" value
          is below $::MIN_CLOCK_SPEED. "Normal" is only reported when at
          least one clock value was read.

Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.

Returns:
        $::OK - The check was run.
        $::NOK - $::AIXProcCmd is not available on any of the nodes.

=cut

#-------------------------------------------------------------------------------
sub procsrCheckAIX
{
    # No "()" prototype here: the sub takes an argument, and an empty
    # prototype would reject any call written without a leading "&".
    my ($refAIXNodes) = @_;
    my $allnodes = join ", ", @$refAIXNodes;
    print "\nChecking processor clock speed for nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking processor clock speed for nodes: $allnodes...\n";

    # Only query the nodes that have the command installed
    my @ValidAIXNodes = &checkCmdAvailability($refAIXNodes, $::AIXProcCmd);
    if (!scalar(@ValidAIXNodes))
    {
        return $::NOK;
    }

    my $ValidAIXNodes = join(",", @ValidAIXNodes);
    my $checkCmd = "xdsh $ValidAIXNodes \"$::AIXProcCmd\" 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        print "Command failed: $checkCmd.\n";
        print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
    }

    my $abnormal = 0;    # set once a node below the minimum has been found
    my $readings = 0;    # number of clock values that could be read
    foreach my $line (@output)
    {
        chomp $line;

        # Every line is prefixed with "<host>:"
        if ($line =~ /^(\S+):\s*This machine runs at (\d+) MHz/)
        {
            my ($host, $clspeed) = ($1, $2);
            $readings++;
            if ($clspeed < $::MIN_CLOCK_SPEED)
            {
                $abnormal = 1;
                print "$host processor clock speed is $clspeed MHz. UNEXPECTED LOW FREQUENCY.\n";
                print $::LOG_FILE_HANDLE "$host processor clock speed is $clspeed MHz. UNEXPECTED LOW FREQUENCY.\n";
            }
        }
    }

    if (!$abnormal)
    {
        if ($readings)
        {
            print "The processor clock speed of all nodes is normal.\n";
            print $::LOG_FILE_HANDLE "The processor clock speed of all nodes is normal.\n";
        }
        else
        {
            # Nothing was read (command failed or printed no clock line):
            # do not claim that the nodes are healthy.
            print "Warning: No processor clock speed information was returned by the nodes.\n";
            print $::LOG_FILE_HANDLE "Warning: No processor clock speed information was returned by the nodes.\n";
        }
    }
    return $::OK;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
 | 
						|
=head3    ifCheck
 | 
						|
 | 
						|
          Check ib interfaces status
 | 
						|
Arguments:
 | 
						|
        $refNodes - The reference to all nodes.
 | 
						|
        $os - Linux or AIX
 | 
						|
=cut
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------
 | 
						|
sub ifCheck()
 | 
						|
{
 | 
						|
    my ($refNodes, $os) = @_;
 | 
						|
    my @ValidNodes;
 | 
						|
    my $cmd;
 | 
						|
    my $abnormal = 0;
 | 
						|
    my $allnodes = join ", ", @$refNodes;
 | 
						|
 | 
						|
    if ($::IF_CHECK_METHOD eq "LL")
 | 
						|
    {
 | 
						|
        if ($os eq 'Linux')
 | 
						|
        {
 | 
						|
            $cmd = $::IBifLLCmdL;
 | 
						|
        }
 | 
						|
        elsif ($os eq 'AIX')
 | 
						|
        {
 | 
						|
            $cmd = $::IBifLLCmdA;
 | 
						|
        }
 | 
						|
        else
 | 
						|
        {
 | 
						|
            return $::NOK;
 | 
						|
        }
 | 
						|
    }
 | 
						|
    elsif ($::IF_CHECK_METHOD eq "RSCT")
 | 
						|
    {
 | 
						|
        $cmd = $::IBifRSCTCmd;
 | 
						|
    }
 | 
						|
    else
 | 
						|
    {
 | 
						|
        return $::NOK;
 | 
						|
    }
 | 
						|
 | 
						|
    print "\nChecking IB interface status using command $cmd for nodes: $allnodes...\n";
 | 
						|
    print $::LOG_FILE_HANDLE "Checking IB interface status using command $cmd for nodes: $allnodes...\n";
 | 
						|
 | 
						|
    @ValidNodes = &checkCmdAvailability($refNodes, $cmd);
 | 
						|
 | 
						|
    if (!scalar(@ValidNodes))
 | 
						|
    {
 | 
						|
        return $::NOK;
 | 
						|
    }
 | 
						|
 | 
						|
    my $ValidNodes = join(",", @ValidNodes);
 | 
						|
 | 
						|
    #Method is "LL"
 | 
						|
    if ($::IF_CHECK_METHOD eq "LL")
 | 
						|
    {
 | 
						|
        #Run the LL command on only one node, this is enough because it will return status of all nodes
 | 
						|
        my $checkCmd = "xdsh $ValidNodes[0] \"$cmd -a\" 2>/dev/null";
 | 
						|
        print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
 | 
						|
        my @output = `$checkCmd`;
 | 
						|
        if ($?)
 | 
						|
        {
 | 
						|
            print "Command failed: $checkCmd.\n";
 | 
						|
            print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
 | 
						|
        }
 | 
						|
 | 
						|
        #Record whether the next line is the beginning of a new section
 | 
						|
        my $armed = 0;
 | 
						|
        my $ibnum;
 | 
						|
        my $host;
 | 
						|
 | 
						|
        #Record abnormal ib interfaces of a node
 | 
						|
        my @ib_array = ();
 | 
						|
        foreach my $line (@output)
 | 
						|
        {
 | 
						|
            chomp $line;
 | 
						|
 | 
						|
            #The text returned by llstatus is seperared into several sections by "=================="
 | 
						|
            if ($line =~ /==================/)
 | 
						|
            {
 | 
						|
                #If there are abnormal interfaces found after check last section
 | 
						|
                if (@ib_array)
 | 
						|
                {
 | 
						|
                    print "$host IB interface(s) down: ";
 | 
						|
                    print $::LOG_FILE_HANDLE "$host IB interface(s) down: ";
 | 
						|
                    foreach (sort @ib_array)
 | 
						|
                    {
 | 
						|
                        print "ib$_ ";
 | 
						|
                        print $::LOG_FILE_HANDLE "ib$_ ";
 | 
						|
                    }
 | 
						|
                    print "\n";
 | 
						|
                    print $::LOG_FILE_HANDLE "\n";
 | 
						|
                }
 | 
						|
                @ib_array = ();
 | 
						|
                $armed    = 1;
 | 
						|
            }
 | 
						|
            elsif (($armed) && ($line =~ /\S+?:\s*(\S+)/))
 | 
						|
            {    #The first line of a new section
 | 
						|
                    #Get node hostname
 | 
						|
                $host  = $1;
 | 
						|
                $armed = 0;
 | 
						|
            }
 | 
						|
            elsif ($line =~ /\S+?:\s*ib(\d+).*/)
 | 
						|
            {
 | 
						|
                #Get interface number
 | 
						|
                $ibnum = $1;
 | 
						|
 | 
						|
                #Check the status
 | 
						|
                if (!($line =~ /READY/))
 | 
						|
                {
 | 
						|
                    $abnormal = 1;
 | 
						|
                    push @ib_array, $ibnum;
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        #If there are abnormal interfaces found after check the last section
 | 
						|
        if (@ib_array)
 | 
						|
        {
 | 
						|
            print "$host IB interface(s) down: ";
 | 
						|
            print $::LOG_FILE_HANDLE "$host IB interface(s) down: ";
 | 
						|
            foreach (sort @ib_array)
 | 
						|
            {
 | 
						|
                print "ib$_ ";
 | 
						|
                print $::LOG_FILE_HANDLE "ib$_ ";
 | 
						|
            }
 | 
						|
            print "\n";
 | 
						|
            print $::LOG_FILE_HANDLE "\n";
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    #Method is "RSCT"
 | 
						|
    elsif ($::IF_CHECK_METHOD eq "RSCT")
 | 
						|
    {
 | 
						|
        my $checkCmd = "xdsh $ValidNodes \"$::IBifRSCTCmd IBM.NetworkInterface Name OpState\" 2>/dev/null";
 | 
						|
        print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
 | 
						|
        my @output = `$checkCmd`;
 | 
						|
        if ($?)
 | 
						|
        {
 | 
						|
            print "Command failed: $checkCmd.\n";
 | 
						|
            print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
 | 
						|
        }
 | 
						|
        my $host;
 | 
						|
        my $prevhost;
 | 
						|
        my $ifname;
 | 
						|
        my $opstate;
 | 
						|
        my @ib_array = ();
 | 
						|
        foreach my $line (@output)
 | 
						|
        {
 | 
						|
            chomp $line;
 | 
						|
            if ($line =~ /(\S*):.*/)
 | 
						|
            {
 | 
						|
                #Get node hostname
 | 
						|
                $host = $1;
 | 
						|
 | 
						|
                #If the host is not equal prevhost, it indicates this is beginning of a section for another node.
 | 
						|
                #If there are abnormal interfaces found after check last node, print them.
 | 
						|
                if (($host ne $prevhost) && @ib_array)
 | 
						|
                {
 | 
						|
                    print "$prevhost IB interface(s) down: ";
 | 
						|
                    print $::LOG_FILE_HANDLE "$prevhost IB interface(s) down: ";
 | 
						|
                    foreach (sort @ib_array)
 | 
						|
                    {
 | 
						|
                        print "$_ ";
 | 
						|
                        print $::LOG_FILE_HANDLE "$_ ";
 | 
						|
                    }
 | 
						|
                    print "\n";
 | 
						|
                    print $::LOG_FILE_HANDLE "\n";
 | 
						|
                    @ib_array = ();
 | 
						|
                }
 | 
						|
                $prevhost = $host;
 | 
						|
            }
 | 
						|
            if ($line =~ /\S*:\s*Name\s*=\s*\"(.*)\"/)
 | 
						|
            {
 | 
						|
                #Get interface name
 | 
						|
                $ifname = $1;
 | 
						|
            }
 | 
						|
            if ($line =~ /\S+:\s*OpState\s*=\s*(\d+)/)
 | 
						|
            {
 | 
						|
                #Check the status
 | 
						|
                $opstate = $1;
 | 
						|
                if ($opstate != 1 && $ifname =~ /ib/)
 | 
						|
                {
 | 
						|
                    $abnormal = 1;
 | 
						|
                    push @ib_array, $ifname;
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        #If there are abnormal interfaces found after check the last node, print them.
 | 
						|
        if (@ib_array)
 | 
						|
        {
 | 
						|
            print "$host IB interface(s) down: ";
 | 
						|
            print $::LOG_FILE_HANDLE "$host IB interface(s) down: ";
 | 
						|
            foreach (sort @ib_array)
 | 
						|
            {
 | 
						|
                print "$_ ";
 | 
						|
                print $::LOG_FILE_HANDLE "$_ ";
 | 
						|
            }
 | 
						|
            print "\n";
 | 
						|
            print $::LOG_FILE_HANDLE "\n";
 | 
						|
        }
 | 
						|
    }
 | 
						|
    if (!$abnormal)
 | 
						|
    {
 | 
						|
        print "IB interfaces of all nodes are normal.\n";
 | 
						|
        print $::LOG_FILE_HANDLE "IB interfaces of all nodes are normal.\n";
 | 
						|
    }
 | 
						|
    return $::OK;
 | 
						|
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------

=head3    memCheckLinux

          Check total memory and free large pages on Linux nodes.

          Runs $::LinuxMemCmd on every node through a single xdsh call and
          scans the "MemTotal:" and "HugePages_Free:" lines of the output.
          Each check is performed only when its threshold global is defined.

Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.
Globals read:
        $::LinuxMemCmd     - command executed on the nodes
        $::MIN_MEMORY      - minimal memory; compared as MIN_MEMORY * 1000
                             against the kB value reported by the node
        $::MIN_FREELP      - minimal number of free large pages
        $::LOG_FILE_HANDLE - log file handle every message is copied to
Returns:
        $::OK in all cases (findings are only printed and logged)

=cut

#-------------------------------------------------------------------------------
sub memCheckLinux()
{
    my ($refLnxNodes) = @_;
    my $node_csv = join(",", @$refLnxNodes);

    # Send one message to both the console and the log file.
    my $tee = sub {
        my ($msg) = @_;
        print $msg;
        print {$::LOG_FILE_HANDLE} $msg;
    };

    # The console banner carries a leading blank line, the log entry does not.
    print "\nChecking memory for nodes: $node_csv...\n";
    print {$::LOG_FILE_HANDLE} "Checking memory for nodes: $node_csv...\n";

    my $checkCmd = "xdsh $node_csv \"$::LinuxMemCmd\" 2>/dev/null";
    print {$::LOG_FILE_HANDLE} "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;

    # A failing xdsh is reported, but whatever output was produced is still parsed.
    $tee->("Command failed: $checkCmd.\n") if ($?);

    # Each check is active only when its threshold was supplied.
    my $want_mem = defined($::MIN_MEMORY);
    my $want_lp  = defined($::MIN_FREELP);
    my $mem_bad  = 0;
    my $lp_bad   = 0;

    for my $line (@output)
    {
        chomp $line;

        # xdsh prefixes every output line with "<node>: ".
        if ($want_mem && $line =~ /(\S*): MemTotal:\s*(\d+) kB/)
        {
            my ($node, $kb) = ($1, $2);
            if ($kb < ($::MIN_MEMORY * 1000))
            {
                $mem_bad = 1;
                my $mb = $kb / 1000;

                # NOTE(review): the console reports MB while the log records the
                # raw kB value - confirm whether this difference is intended.
                print "$node total memory is $mb MB. UNEXPECTED small size.\n";
                print {$::LOG_FILE_HANDLE} "$node total memory is $kb kB. UNEXPECTED small size.\n";
            }
        }

        if ($want_lp && $line =~ /(\S*): HugePages_Free:\s*(\d+)/)
        {
            my ($node, $free_pages) = ($1, $2);
            if ($free_pages < $::MIN_FREELP)
            {
                $lp_bad = 1;
                $tee->("$node Free large page is $free_pages. UNEXPECTED small size.\n");
            }
        }
    }

    # Summary lines are printed only for the checks that were requested.
    $tee->("Memory size of all nodes are normal.\n")
      if ($want_mem && !$mem_bad);
    $tee->("Free large page number of all nodes are normal.\n")
      if ($want_lp && !$lp_bad);

    return $::OK;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------

=head3    memCheckAIX

          Check total memory and free large pages on AIX nodes.

          Runs "$::AIXMemCmd -l" through xdsh on the nodes returned by
          checkCmdAvailability() and scans the output for the
          "lcpu=... mem=...MB" configuration line and for the numeric
          statistics row, whose 19th column is taken as the number of free
          large pages. Each check is performed only when its threshold
          global is defined.

Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.
Globals read:
        $::AIXMemCmd       - command executed on the nodes
        $::MIN_MEMORY      - minimal memory size in MB
        $::MIN_FREELP      - minimal number of free large pages
        $::LOG_FILE_HANDLE - log file handle every message is copied to
Returns:
        $::NOK - checkCmdAvailability() returned no node
        $::OK  - otherwise (findings are only printed and logged)

=cut

#-------------------------------------------------------------------------------
sub memCheckAIX()
{
    my ($refAIXNodes) = @_;
    my $node_list = join ", ", @$refAIXNodes;

    # Send one message to both the console and the log file.
    my $tee = sub {
        my ($msg) = @_;
        print $msg;
        print {$::LOG_FILE_HANDLE} $msg;
    };

    # The console banner carries a leading blank line, the log entry does not.
    print "\nChecking memory for nodes $node_list...\n";
    print {$::LOG_FILE_HANDLE} "Checking memory for nodes $node_list...\n";

    # Presumably the subset of nodes on which the command can be run;
    # see checkCmdAvailability() for the exact contract.
    my @usable_nodes = &checkCmdAvailability($refAIXNodes, $::AIXMemCmd);
    return $::NOK unless (scalar(@usable_nodes));

    my $targets  = join(",", @usable_nodes);
    my $checkCmd = "xdsh $targets \"$::AIXMemCmd -l\" 2>/dev/null";

    print {$::LOG_FILE_HANDLE} "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;

    # A failing xdsh is reported, but whatever output was produced is still parsed.
    $tee->("Command failed: $checkCmd.\n") if ($?);

    # Each check is active only when its threshold was supplied.
    my $want_mem = defined($::MIN_MEMORY);
    my $want_lp  = defined($::MIN_FREELP);
    my $mem_bad  = 0;
    my $lp_bad   = 0;

    # Shared by both memory line formats: flag and report a node whose
    # memory size (MB) is below the requested minimum.
    my $check_mem = sub {
        my ($node, $mb) = @_;
        return unless ($mb < $::MIN_MEMORY);
        $mem_bad = 1;
        $tee->("$node total memory is $mb MB. UNEXPECTED small size\n");
    };

    for my $line (@output)
    {
        chomp $line;

        if ($want_mem
            && $line =~ /(\S*):\s+System Configuration: lcpu=(\d+)\s+mem=(\d+)\s*MB/)
        {
            # "System Configuration: lcpu=N mem=N MB" form
            my ($node, $mb) = ($1, $3);
            $check_mem->($node, $mb);
        }
        elsif ($want_lp
            && $line =~ /(\S*):\s*\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)/)
        {
            # Statistics row: the captured column is the free large page count.
            my ($node, $free_pages) = ($1, $2);
            if ($free_pages < $::MIN_FREELP)
            {
                $lp_bad = 1;
                $tee->("$node Free large page is $free_pages. UNEXPECTED small size\n");
            }
        }
        elsif ($want_mem && $line =~ /(\S*):.*lcpu=(\d+)\s+mem=(\d+)MB/)
        {
            # Any other line carrying "lcpu=N mem=NMB"
            my ($node, $mb) = ($1, $3);
            $check_mem->($node, $mb);
        }
    }

    # Summary lines are printed only for the checks that were requested.
    $tee->("Memory size of all nodes are normal.\n")
      if ($want_mem && !$mem_bad);
    $tee->("Free large page number of all nodes is normal.\n")
      if ($want_lp && !$lp_bad);

    return $::OK;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------

=head3    HCACheckAIX

          Check HCA status for AIX nodes.

          Runs "$::AIXHCACmd -v" through xdsh on the nodes returned by
          checkCmdAvailability(), filtered down to the port header, port
          state and "Physical Port" lines. For every port that is not in
          @IgnoreList the logical state, physical state, physical link
          state, speed and width are printed; a value that differs from the
          expected one is marked as UNEXPECTED.

Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.
Globals read:
        $::AIXHCACmd       - command executed on the nodes
        $::HCASPEED        - expected speed; speed is only reported when undefined
        $::WIDTH           - expected width; width is only reported when empty
        @IgnoreList        - interface names ("ibN") to skip
        $::LOG_FILE_HANDLE - log file handle every message is copied to
Returns:
        $::NOK - checkCmdAvailability() returned no node
        $::OK  - otherwise, also when unexpected values were reported

=cut

#-------------------------------------------------------------------------------
sub HCACheckAIX()
{
    my ($refAIXNodes) = @_;
    my $abnormal = 0;
    my $allnodes = join ", ", @$refAIXNodes;

    # Send a message to the console and to the log file. The optional second
    # argument is used where the log text differs from the console text.
    my $tee = sub {
        my ($console_msg, $log_msg) = @_;
        $log_msg = $console_msg unless defined($log_msg);
        print $console_msg;
        print {$::LOG_FILE_HANDLE} $log_msg;
    };

    # The console banner carries a leading blank line, the log entry does not.
    $tee->(
        "\nChecking HCA status for nodes $allnodes...\n",
        "Checking HCA status for nodes $allnodes...\n"
    );

    # Presumably the subset of nodes on which the command can be run;
    # see checkCmdAvailability() for the exact contract.
    my @ValidAIXNodes = &checkCmdAvailability($refAIXNodes, $::AIXHCACmd);
    if (!scalar(@ValidAIXNodes))
    {
        return $::NOK;
    }

    my $ValidAIXNodes = join(",", @ValidAIXNodes);
    my $checkCmd =
      "xdsh $ValidAIXNodes \"$::AIXHCACmd -v | egrep 'IB PORT.*INFO|Port State:|Physical Port'\" 2>/dev/null";
    print {$::LOG_FILE_HANDLE} "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;

    # A failing xdsh is reported, but whatever output was produced is still parsed.
    if ($?)
    {
        $tee->("Command failed: $checkCmd.\n");
    }

    # State attributes: [ label used in messages, line pattern, expected value ].
    # The physical link state may consist of several words ("Link Up"), so it
    # captures the rest of the line without trailing blanks. The previous
    # pattern captured '\.+' (literal dots) and therefore never matched.
    my @state_checks = (
        [
            'Logical Port State',
            qr/\S*:\s*Logical Port State:\s*(\w+)/,
            'Active'
        ],
        [
            'Physical Port State',
            qr/\S*:\s*Physical Port State:\s*(\w+)/,
            'Active'
        ],
        [
            'Physical Port Physical State',
            qr/\S*:\s*Physical Port Physical State:\s*(.+?)\s*$/,
            'Link Up'
        ],
    );

    # A width is only enforced when a non-empty expected width was given.
    my $expect_width = (defined($::WIDTH) && $::WIDTH ne "") ? 1 : 0;

    my $host;
    my $if_id;
    my $ignore = 0;

  LINE:
    foreach my $line (@output)
    {
        chomp $line;

        # xdsh prefixes every output line with "<node>: ".
        if ($line =~ /(\S*): (.*)/)
        {
            $host = $1;
        }

        # Port header: remember which interface the following lines describe.
        if ($line =~ /\S*:\s*IB PORT (\d+) INFORMATION \(iba(\d+)\)/)
        {
            my ($port, $hca_id) = ($1, $2);

            # Calculate interface number according to HCA ID and port number
            $if_id = $hca_id * 2 + $port - 1;

            # Attribute lines of an ignored interface are skipped below.
            $ignore = (grep { $_ eq "ib$if_id" } @IgnoreList) ? 1 : 0;
            next LINE;
        }

        next LINE if ($ignore);

        # Check logical state, physical state and physical link state
        foreach my $check (@state_checks)
        {
            my ($label, $pattern, $expected) = @$check;
            next unless ($line =~ $pattern);

            my $state = $1;
            if ($state ne $expected)
            {
                $abnormal = 1;
                $tee->("$host: ib$if_id $label is $state. UNEXPECTED STATE.\n");
            }
            else
            {
                $tee->("$host: ib$if_id $label is $state.\n");
            }
            next LINE;
        }

        # Check speed
        if ($line =~ /\S*:\s*Physical Port Speed:\s*(.+)\s*G/)
        {
            my $speed = $1;
            if (defined($::HCASPEED) && $speed != $::HCASPEED)
            {
                $abnormal = 1;
                $tee->("$host: ib$if_id speed is ${speed}Gbps. UNEXPECTED SPEED.\n");
            }
            else
            {
                # Console text keeps its historical trailing blank.
                $tee->(
                    "$host: ib$if_id speed is ${speed}Gbps. \n",
                    "$host: ib$if_id speed is ${speed}Gbps.\n"
                );
            }
            next LINE;
        }

        # Check width
        if ($line =~ /\S*:\s*Physical Port Width:\s*(\w+)/)
        {
            my $width = $1;
            if ($expect_width && $width ne $::WIDTH)
            {
                $abnormal = 1;
                $tee->("$host: ib$if_id Physical Port Width is $width. UNEXPECTED WIDTH.\n");
            }
            else
            {
                $tee->("$host: ib$if_id Physical Port Width is $width.\n");
            }
            next LINE;
        }
    }

    # Overall summary
    if (!$abnormal)
    {
        $tee->("HCA status of all nodes is normal.\n");
    }
    else
    {
        $tee->("HCA status of all nodes contains unexpected result.\n");
    }
    return $::OK;
}
 | 
						|
 | 
						|
#-------------------------------------------------------------------------------

=head3    HCACheckLinux

          Check HCA status for Linux nodes.

          Runs "$::LinuxHCACmd -v" through xdsh on the nodes returned by
          checkCmdAvailability(), filtered down to the ehca, port, state,
          width and speed lines. For every port that is not in @IgnoreList
          the state, active width, active speed and physical state are
          printed; a value that differs from the expected one is marked as
          UNEXPECTED.

Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.
Globals read:
        $::LinuxHCACmd     - command executed on the nodes
        $::HCASPEED        - expected speed; speed is only reported when undefined
        $::WIDTH           - expected width; width is only reported when empty
        @IgnoreList        - interface names ("ibN") to skip
        $::LOG_FILE_HANDLE - log file handle every message is copied to
Returns:
        $::NOK - checkCmdAvailability() returned no node
        $::OK  - otherwise, also when unexpected values were reported

=cut

#-------------------------------------------------------------------------------
sub HCACheckLinux()
{
    my ($refLnxNodes) = @_;
    my $abnormal = 0;
    my $allnodes = join ", ", @$refLnxNodes;

    # Send a message to the console and to the log file. The optional second
    # argument is used where the log text differs from the console text.
    my $tee = sub {
        my ($console_msg, $log_msg) = @_;
        $log_msg = $console_msg unless defined($log_msg);
        print $console_msg;
        print {$::LOG_FILE_HANDLE} $log_msg;
    };

    # The console banner carries a leading blank line, the log entry does not.
    $tee->(
        "\nChecking HCA status for nodes: $allnodes...\n",
        "Checking HCA status for nodes: $allnodes...\n"
    );

    # Presumably the subset of nodes on which the command can be run;
    # see checkCmdAvailability() for the exact contract.
    my @ValidLnxNodes = &checkCmdAvailability($refLnxNodes, $::LinuxHCACmd);
    if (!scalar(@ValidLnxNodes))
    {
        return $::NOK;
    }

    my $ValidLnxNodes = join(",", @ValidLnxNodes);
    my $checkCmd =
      "xdsh $ValidLnxNodes \"$::LinuxHCACmd -v | egrep 'ehca|port:|state:|width:|speed:'\" 2>/dev/null";
    print {$::LOG_FILE_HANDLE} "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;

    # A failing xdsh is reported, but whatever output was produced is still parsed.
    if ($?)
    {
        $tee->("Command failed: $checkCmd.\n");
    }

    # A width is only enforced when a non-empty expected width was given.
    my $expect_width = (defined($::WIDTH) && $::WIDTH ne "") ? 1 : 0;

    my $host;
    my $hca_id;
    my $if_id;
    my $ignore = 0;

    foreach my $line (@output)
    {
        chomp $line;

        # xdsh prefixes every output line with "<node>: ".
        if ($line =~ /(\S*): (.*)/)
        {
            $host = $1;
        }

        # Get HCA ID; it stays in effect for the port lines that follow.
        if ($line =~ /\S*:\s*hca_id:\s*ehca(\d+)/)
        {
            $hca_id = $1;
            next;
        }

        # Get port number
        if ($line =~ /\S*:\s*port:\s*(\d+)/)
        {
            my $port = $1;

            # Calculate interface number according to HCA ID and port number
            $if_id = $hca_id * 2 + $port - 1;

            # Attribute lines of an ignored interface are skipped below.
            $ignore = (grep { $_ eq "ib$if_id" } @IgnoreList) ? 1 : 0;
            next;
        }

        next if ($ignore);

        # Check state
        if ($line =~ /\S*:\s*state:\s*(\w+)\s*\(.*/)
        {
            my $state = $1;
            if ($state ne "PORT_ACTIVE" && $state ne "active")
            {
                $abnormal = 1;
                $tee->("$host: ib$if_id state is $state. UNEXPECTED STATE.\n");
            }
            else
            {
                $tee->("$host: ib$if_id state is $state.\n");
            }
            next;
        }

        # Check width
        if ($line =~ /\S*:\s*active_width:\s*(\w+)\s*\(.*/)
        {
            my $width = $1;
            if ($expect_width && $width ne $::WIDTH)
            {
                $abnormal = 1;
                $tee->("$host: ib$if_id active_width is $width. UNEXPECTED WIDTH.\n");
            }
            else
            {
                $tee->("$host: ib$if_id active_width is $width.\n");
            }
            next;
        }

        # Check speed
        if ($line =~ /\S*:\s*active_speed:\s*(.+)\s*Gbps.*/)
        {
            my $speed = $1;
            if (defined($::HCASPEED) && $speed != $::HCASPEED)
            {
                $abnormal = 1;
                $tee->("$host: ib$if_id active_speed is ${speed}Gbps. UNEXPECTED SPEED.\n");
            }
            else
            {
                $tee->("$host: ib$if_id active_speed is ${speed}Gbps.\n");
            }
            next;
        }

        # Check physical state
        if ($line =~ /\S*:\s*phys_state:\s*(\w+)\s*\(.*/)
        {
            my $phys_state = $1;
            if ($phys_state ne "LINK_UP")
            {
                $abnormal = 1;
                $tee->("$host: ib$if_id phys_state is $phys_state. UNEXPECTED STATE.\n");
            }
            else
            {
                # Log text keeps its historical trailing blank.
                $tee->(
                    "$host: ib$if_id phys_state is $phys_state.\n",
                    "$host: ib$if_id phys_state is $phys_state. \n"
                );
            }
            next;
        }
    }

    # Overall summary. The log message used to be split across two source
    # lines in the middle of the word "result", which wrote a broken line
    # to the log; it now matches the console text.
    if (!$abnormal)
    {
        $tee->("HCA status of all nodes is normal.\n");
    }
    else
    {
        $tee->("HCA status of all nodes contains unexpected result.\n");
    }
    return $::OK;
}
 |