mirror of
				https://github.com/xcat2/xcat-core.git
				synced 2025-10-26 00:45:38 +00:00 
			
		
		
		
	git-svn-id: https://svn.code.sf.net/p/xcat/code/xcat-core/trunk@3232 8638fb3e-16cb-4fca-ae20-7b5d299a9bcd
		
			
				
	
	
		
			1363 lines
		
	
	
		
			41 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
			
		
		
	
	
			1363 lines
		
	
	
		
			41 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
| #!/usr/bin/perl
 | |
| # IBM(c) 2008 EPL license http://www.eclipse.org/legal/epl-v10.html
 | |
| 
 | |
| ###########################################################################
 | |
| #                                                                         #
 | |
| # Command: healthCheck                                                    #
 | |
| #                                                                         #
 | |
| #-------------------------------------------------------------------------#
 | |
| #    This script is used to check the system health for both AIX and 
 | |
| #    Linux Managed Nodes on Power6 platforms. It will use xdsh to access
 | |
| #    the target nodes, and check the status for processor clock speed,
 | |
| #    IB interfaces, memory, large page configuration and HCA status.
 | |
| #    If xdsh is unreachable, an error message will be given. 
 | |
| # Command Syntax:
 | |
| #    healthCheck { [-n node_list] [-M]}
 | |
| #                {[-p min_clock_speed] [-i method] [-m min_memory]
 | |
| #                [-l min_freelp] [ -H [--speed speed --ignore interface_list --width width]]} 
 | |
| #                [ -h ]
 | |
| #
 | |
| #        -M          Check status for all the Managed Nodes that are defined on this MN.
 | |
| #        -n node_list
 | |
| #                    Specifies a comma-separated list of node host names, IP addresses for health check.
 | |
| #        -p min_clock_speed
 | |
| #                    Specifies the minimal processor clock speed in MHz for processor monitor.
 | |
| #        -i method
 | |
| #                    Specifies the method to do Infiniband interface status check, the supported 
 | |
| #                    check methods are LL and RSCT.
 | |
| #        -m min_memory
 | |
| #                    Specifies the minimal total memory in MB.
 | |
| #        -l min_freelp
 | |
| #                    Specifies the minimal free large page number.
 | |
| #        -H          Check the status for HCAs.
 | |
| #        --speed speed
 | |
| #                    Specifies the physical port speed in G bps, it should be used with -H flag.
 | |
| #        --ignore interface_list
 | |
| #                    Specifies a comma-separated list of interface name to ignore from HCA status check, 
 | |
| #                    such as ib0,ib1. It should be used with -H flag.
 | |
| #        --width width
 | |
| #                    Specifies the physical port width, such as 4X or 12X. It should be used with -H flag.
 | |
| #        -h          Display usage information.
 | |
| # Exit codes:
 | |
| #     0 - success
 | |
| #     1 - fail
 | |
| ###########################################################################
 | |
| 
 | |
| use strict;
 | |
| use Getopt::Long;
 | |
| 
 | |
# Location of the health check log
$::HEALTHCHECK_PATH = '/var/log/xcat';
$::HEALTHCHECK_LOG  = $::HEALTHCHECK_PATH . '/healthCheck.log';

# Return codes and run state
$::OK          = 0;
$::NOK         = 1;
$::logging     = 0;    # incremented by the main section, tested by the END block
$::GLOBAL_EXIT = 0;    # exit code that the END block applies to $?

# Command run on the management node
$::NODELS = '/opt/xcat/bin/nodels';

# Commands run on the target nodes through xdsh
$::LinuxProcCmd = 'cat /proc/cpuinfo';
$::AIXProcCmd   = '/usr/pmapi/tools/pmcycles';
$::IBifLLCmdL   = '/opt/ibmll/LoadL/full/bin/llstatus';
$::IBifLLCmdA   = '/usr/lpp/LoadL/full/bin/llstatus';
$::IBifRSCTCmd  = '/usr/bin/lsrsrc';
$::AIXMemCmd    = '/usr/bin/vmstat';
$::LinuxMemCmd  = 'cat /proc/meminfo';
$::AIXHCACmd    = '/usr/bin/ibstat';
$::LinuxHCACmd  = '/usr/bin/ibv_devinfo';

# Nodes to be checked: the full list, then the Linux and AIX subsets
my (@NodeList, @LnxNodeList, @AIXNodeList);

# ib interfaces to ignore when checking HCA status
my @IgnoreList;
| 
 | |
# MAIN Main main#
# Append logging information to healthCheck.log.  Every later step prints to
# $::LOG_FILE_HANDLE, so when the log file cannot be opened the messages are
# sent to /dev/null instead of dying on an undefined file handle.
if (&append_logging($::HEALTHCHECK_LOG) == $::OK)
{
    # Only a log that was really opened has to be closed by the END block.
    $::logging++;
}
else
{
    my $nullSink;
    unless (open($nullSink, '>', '/dev/null'))
    {
        print "Error: Cannot open any log destination.\n";
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }
    $::LOG_FILE_HANDLE = $nullSink;
}

# Explicit parentheses: "&getArgs;" would hand the current @_ to getArgs.
&getArgs();

my @ReachableNodes;
my @UnreachableNodes;
if (scalar(@NodeList) > 0)
{
    # Check if xdsh is reachable
    foreach my $node (@NodeList)
    {
        if (&checkDshReachability($node) == $::OK)
        {    # xdsh is ok
            push @ReachableNodes, $node;
        }
        else
        {
            push @UnreachableNodes, $node;
        }
    }

    if (scalar(@UnreachableNodes))
    {
        my $UnreachableNodes = join(", ", @UnreachableNodes);
        my $warning =
          "Warning: The xdsh is unreachable for the node(s): $UnreachableNodes. Run updatenode to configure xdsh for that node(s).\n";
        print $warning;
        print $::LOG_FILE_HANDLE $warning;
    }
}
else
{
    print "Error: No hosts in node list\n";
    print $::LOG_FILE_HANDLE "Error: No hosts in node list\n";
    $::GLOBAL_EXIT = $::NOK;
    exit;
}

#Divide the nodes into two groups: Linux nodes and AIX nodes
my @AllLnxNodes = `$::NODELS all nodetype.os | grep -E "sles|rhel"`;
print $::LOG_FILE_HANDLE "Running command: $::NODELS all nodetype.os | grep -E sles|rhel\n";
chomp @AllLnxNodes;
my @AllAIXNodes = `$::NODELS all nodetype.os | grep "AIX"`;
print $::LOG_FILE_HANDLE "Running command: $::NODELS all nodetype.os | grep AIX\n";
chomp @AllAIXNodes;

foreach my $node (@ReachableNodes)
{
    # Each listing line is expected to look like "<node>: <os>".  Anchor the
    # match on "<node>:" and quote the name, otherwise node "n1" would also
    # match the line of "n10" and dots in a host name would act as wildcards.
    my $ownLine = qr/^\Q$node\E:/;
    if (grep { $_ =~ $ownLine } @AllLnxNodes)
    {
        push @LnxNodeList, $node;
    }
    if (grep { $_ =~ $ownLine } @AllAIXNodes)
    {
        push @AIXNodeList, $node;
    }
}

#Do health check for Linux nodes and AIX nodes
if (scalar(@LnxNodeList))
{
    my $rc = &healthCheckLinux(\@LnxNodeList);
    if ($rc)
    {
        $::GLOBAL_EXIT = $rc;
        exit;
    }
}
if (scalar(@AIXNodeList))
{
    my $rc = &healthCheckAIX(\@AIXNodeList);
    if ($rc)
    {
        $::GLOBAL_EXIT = $rc;
        exit;
    }
}

# Finish up and exit
END
{
    if ($::logging)
    {
        &stop_logging();
    }

    #Determine exit code: never lower an exit status that is already set
    if ($::GLOBAL_EXIT > $?)
    {
        $? = $::GLOBAL_EXIT;
    }
}
exit;    # end of Main
 | |
| 
 | |
#--------------------------------------------------------------------------------

=head3  getArgs

        Parse the command line and check the values.

        The option values are stored in the $::HELP, $::ALLMANAGED,
        $::NODE_LIST, $::MIN_CLOCK_SPEED, $::IF_CHECK_METHOD, $::MIN_MEMORY,
        $::MIN_FREELP, $::HCA, $::HCASPEED, $::IGNORE_LIST and $::WIDTH
        globals.  @NodeList is filled from the nodels output (-n or -M, -M
        wins when both are given) and @IgnoreList from --ignore.

        On -h, on a syntax error, on an invalid option value, or when nodels
        fails, the sub sets $::GLOBAL_EXIT and exits instead of returning.

Arguments:
        none (reads @ARGV through Getopt::Long)

=cut

#--------------------------------------------------------------------------------
sub getArgs
{
    $Getopt::Long::ignorecase = 0;
    my $rc = GetOptions(
        'h'        => \$::HELP,
        'M'        => \$::ALLMANAGED,
        'n=s'      => \$::NODE_LIST,
        'p=f'      => \$::MIN_CLOCK_SPEED,
        'i=s'      => \$::IF_CHECK_METHOD,
        'm=f'      => \$::MIN_MEMORY,
        'l=i'      => \$::MIN_FREELP,
        'H'        => \$::HCA,
        'speed=f'  => \$::HCASPEED,
        'ignore=s' => \$::IGNORE_LIST,
        'width=s'  => \$::WIDTH
    );
    if ($::HELP)
    {
        &usage();
        $::GLOBAL_EXIT = $::OK;
        exit;
    }
    if (!$rc)
    {    #There are syntax errors in parameters
        &usage();
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }
    if ($::IF_CHECK_METHOD)
    {
        if ($::IF_CHECK_METHOD ne "LL" && $::IF_CHECK_METHOD ne "RSCT")
        {
            print "The method to check IB interface must be \"LL\" or \"RSCT\".\n";
            &usage();
            $::GLOBAL_EXIT = $::NOK;
            exit;
        }
    }
    if (   !defined($::MIN_CLOCK_SPEED)
        && !defined($::IF_CHECK_METHOD)
        && !defined($::MIN_MEMORY)
        && !defined($::MIN_FREELP)
        && !defined($::HCA))
    {
        print "There is nothing to check. Please specify what you want to check.\n";
        &usage();
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }

    # An option that was not given leaves its variable undefined, so test
    # definedness before comparing against the empty string.
    my $ignoreGiven = (defined($::IGNORE_LIST) && $::IGNORE_LIST ne "");
    my $widthGiven  = (defined($::WIDTH)       && $::WIDTH ne "");

    if (!defined($::HCA) && (defined($::HCASPEED) || $ignoreGiven || $widthGiven))
    {
        print "The flag --speed, --ignore or --width should be used with -H flag.\n";
        &usage();
        $::GLOBAL_EXIT = $::NOK;
        exit;
    }

    if ($widthGiven)
    {
        if (!($::WIDTH =~ /^(\d+)X$/))
        {
            print "The parameter of --width should be 4X, 12X, etc.\n";
            &usage();
            $::GLOBAL_EXIT = $::NOK;
            exit;
        }
    }

    #Parse the node list
    if ($::NODE_LIST)
    {
        # The node range comes straight from the command line.  Run nodels
        # through a list-form pipe so that no shell ever interprets it.
        my $failed = 1;
        my $listing;
        if (open($listing, '-|', $::NODELS, $::NODE_LIST))
        {
            @NodeList = <$listing>;
            chomp @NodeList;

            # close() on a pipe returns false when the command exited non-zero
            $failed = close($listing) ? 0 : 1;
        }
        if ($failed)
        {
            print "Error: Failed to run command: $::NODELS $::NODE_LIST\n";
            print $::LOG_FILE_HANDLE "Running command: $::NODELS $::NODE_LIST fail\n";

            # Without this the script would report success after a failure.
            $::GLOBAL_EXIT = $::NOK;
            exit;
        }
        else
        {
            print $::LOG_FILE_HANDLE "Running command: $::NODELS $::NODE_LIST\n";
        }
    }

    #If -M is set, use all the nodes that nodels reports
    if ($::ALLMANAGED)
    {
        my @templist = `$::NODELS`;
        my $status   = $?;
        chomp @templist;
        print $::LOG_FILE_HANDLE "Running command: $::NODELS\n";
        if ($status)
        {
            print "Error: Failed to run command: $::NODELS\n";
            print $::LOG_FILE_HANDLE "Running command: $::NODELS fail\n";
            $::GLOBAL_EXIT = $::NOK;
            exit;
        }
        @NodeList = @templist;
    }
    if ($::IGNORE_LIST)
    {
        @IgnoreList = split /,/, $::IGNORE_LIST;
    }
}
 | |
| 
 | |
#--------------------------------------------------------------------------------

=head3  usage

        Print the usage text for healthCheck on the currently selected
        output handle.

=cut

#--------------------------------------------------------------------------------
sub usage()
{
    my $flagCol = ' ' x 8;     # column where an option name starts
    my $textCol = ' ' x 20;    # column where a description starts

    # One entry per output line; trailing blanks inside the quotes are part
    # of the text that has always been printed.
    my @text = (
        'Usage: healthCheck { [-n node_list] [-M]}',
        $textCol . '{[-p min_clock_speed] [-i method] [-m min_memory]',
        $textCol . '[-l min_freelp] [ -H [--speed speed --ignore interface_list --width width]]} ',
        $textCol . '[ -h ]',
        '',
        $flagCol . sprintf('%-12s', '-M') . 'Check status for all the Managed Nodes that are defined on this MS.',
        $flagCol . '-n node_list',
        $textCol . 'Specifies a comma-separated list of node host names, IP addresses for health check.',
        $flagCol . '-p min_clock_speed',
        $textCol . 'Specifies the minimal processor clock speed in MHz for processor monitor.',
        $flagCol . '-i method',
        $textCol . 'Specifies the method to do Infiniband interface status check, the supported ',
        $textCol . 'check methods are LL and RSCT.',
        $flagCol . '-m min_memory',
        $textCol . 'Specifies the minimal total memory in MB.',
        $flagCol . '-l min_freelp',
        $textCol . 'Specifies the minimal free large page number.',
        $flagCol . sprintf('%-12s', '-H') . 'Check the status for HCAs.',
        $flagCol . '--speed speed',
        $textCol . 'Specifies the physical port speed in G bps, it should be used with -H flag.',
        $flagCol . '--ignore interface_list',
        $textCol . 'Specifies a comma-separated list of interface name to ignore from HCA status check, ',
        $textCol . 'such as ib0,ib1. It should be used with -H flag.',
        $flagCol . '--width width',
        $textCol . 'Specifies the physical port width, such as 4X or 12X. It should be used with -H flag.',
        $flagCol . sprintf('%-12s', '-h') . 'Display usage information.',
    );
    print join("\n", @text), "\n";
}
 | |
| 
 | |
#--------------------------------------------------------------------------------

=head3   checkDshReachability

        Notes: Check the xdsh reachability between the Management Server
               and node by running "date" on the node through xdsh.
               The command that was run is written to the log.

Arguments:
        $node - the remote node hostname.

Returns:
        $::OK - The remote node is reachable through xdsh.
        $::NOK - The remote node is unreachable through xdsh.

=cut

#--------------------------------------------------------------------------------
sub checkDshReachability()
{
    my ($node) = @_;

    my $probe  = "xdsh $node date 2>/dev/null";
    my $output = `$probe`;
    print $::LOG_FILE_HANDLE "Running command: $probe\n";

    # $? still holds the status of the xdsh run at this point
    return ($? == $::OK) ? $::OK : $::NOK;
}
 | |
| 
 | |
#--------------------------------------------------------------------------------

=head3   checkCmdAvailability

        Check availability of some command on the nodes by running
        "ls <command>" on every node through xdsh.  A warning that names the
        nodes where the command is missing is printed and logged.

Arguments:
        $refNodes - The reference to a group of nodes.
        $cmd - The command that will be checked.
Returns:
        @ValidNodes - The nodes on which the command is available.

=cut

#--------------------------------------------------------------------------------
sub checkCmdAvailability
{
    my ($refNodes, $cmd) = @_;
    my (@ValidNodes, @missingOn);

    for my $target (@$refNodes)
    {
        #Use xdsh to "ls" the command; only the exit status matters
        my $probe = "xdsh $target ls $cmd 2>/dev/null";
        `$probe`;
        print $::LOG_FILE_HANDLE "Running command: $probe\n";

        if ($? == $::OK)
        {
            push @ValidNodes, $target;
        }
        else
        {
            push @missingOn, $target;
        }
    }

    if (@missingOn)
    {
        my $warning = "Warning: The command $cmd is not available on the node(s): "
          . join(", ", @missingOn) . ".\n";
        print $warning;
        print $::LOG_FILE_HANDLE $warning;
    }
    return @ValidNodes;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    append_logging

	Open the log file for appending, creating it (and the directory
	$::HEALTHCHECK_PATH) when it does not exist yet, point
	$::LOG_FILE_HANDLE at it and write a "Logging started" header.

Arguments:
        $logfile - The path of the log file.

Returns:
        $::OK - The log file is open and $::LOG_FILE_HANDLE is set.
        $::NOK - The log file could not be opened; $::LOG_FILE_HANDLE is
                 left untouched.

=cut

#-------------------------------------------------------------------------------
sub append_logging
{
    my ($logfile) = @_;

    #  get log file ready
    my $existed = -e $logfile;
    if (!$existed)
    {
        # Create the log directory without going through a shell.  A failure
        # here is reported by the open() below, so it is ignored on purpose.
        require File::Path;
        eval { File::Path::mkpath($::HEALTHCHECK_PATH); };
    }

    # Append mode creates a missing file and never truncates an existing one.
    # The LOGFILE glob is kept so that code using that name keeps working.
    unless (open(LOGFILE, '>>', $logfile))
    {
        if ($existed)
        {
            print "Can't update file  \"$logfile\".\n";
        }
        else
        {
            print "Can't open file \"$logfile\" for writing.\n";
        }
        return $::NOK;
    }

    $::LOG_FILE_HANDLE = \*LOGFILE;

    # Print the date to the top of the logfile
    my $sdate = `/bin/date`;
    chomp $sdate;
    print "Output log is being written to \"$logfile\".\n";

    print $::LOG_FILE_HANDLE
        "---------------------------------------------------------------------\n";
    print $::LOG_FILE_HANDLE "Logging started $sdate.\n";
    print $::LOG_FILE_HANDLE
        "---------------------------------------------------------------------\n";

    return $::OK;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    stop_logging

	Turn off message logging: write a "Logging stopped" footer with the
	current date to $::LOG_FILE_HANDLE, close the handle and reset it to
	undef.

Returns:
        $::OK

=cut

#-------------------------------------------------------------------------------
sub stop_logging()
{
    my $rule =
        "---------------------------------------------------------------------\n";

    # Print the date at the bottom of the logfile
    my $stamp = `/bin/date`;
    chomp $stamp;
    print $::LOG_FILE_HANDLE $rule;
    print $::LOG_FILE_HANDLE "Logging stopped $stamp.\n";
    print $::LOG_FILE_HANDLE $rule;

    close($::LOG_FILE_HANDLE);
    $::LOG_FILE_HANDLE = undef;

    return $::OK;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    healthCheckLinux

          Check Linux nodes.  Runs, in this order and only when the matching
          option was given: the processor clock speed check, the memory /
          large page check, the IB interface check and the HCA check.  The
          first check that fails ends the run.

Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.

Returns:
        $::OK - Every requested check returned $::OK.
        $::NOK - A requested check did not return $::OK.

=cut

#-------------------------------------------------------------------------------
sub healthCheckLinux
{
    my ($refLnxNodes) = @_;
    my $allnodes = join ", ", @$refLnxNodes;
    print "Checking health for Linux nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking health for Linux nodes: $allnodes...\n";

    if (defined($::MIN_CLOCK_SPEED))
    {
        if (procsrCheckLinux($refLnxNodes) != $::OK)
        {
            return $::NOK;
        }
    }
    if (defined($::MIN_MEMORY) || defined($::MIN_FREELP))
    {
        if (memCheckLinux($refLnxNodes) != $::OK)
        {
            return $::NOK;
        }
    }

    # -i is optional, so the method may be undefined here
    if (defined($::IF_CHECK_METHOD) && $::IF_CHECK_METHOD ne "")
    {
        if (ifCheck($refLnxNodes, 'Linux') != $::OK)
        {
            return $::NOK;
        }
    }
    if (defined($::HCA))
    {
        if (HCACheckLinux($refLnxNodes) != $::OK)
        {
            return $::NOK;
        }
    }

    # Report success explicitly instead of leaking the value of whichever
    # condition happened to be evaluated last.
    return $::OK;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    healthCheckAIX

          Check AIX nodes.  Runs, in this order and only when the matching
          option was given: the processor clock speed check, the memory /
          large page check, the IB interface check and the HCA check.  The
          first check that fails ends the run.

Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.

Returns:
        $::OK - Every requested check returned $::OK.
        $::NOK - A requested check did not return $::OK.

=cut

#-------------------------------------------------------------------------------
sub healthCheckAIX
{
    my ($refAIXNodes) = @_;
    my $allnodes = join ", ", @$refAIXNodes;
    print "Checking health for AIX nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking health for AIX nodes: $allnodes...\n";

    if (defined($::MIN_CLOCK_SPEED))
    {
        if (procsrCheckAIX($refAIXNodes) != $::OK)
        {
            return $::NOK;
        }
    }
    if (defined($::MIN_MEMORY) || defined($::MIN_FREELP))
    {
        if (memCheckAIX($refAIXNodes) != $::OK)
        {
            return $::NOK;
        }
    }

    # -i is optional, so the method may be undefined here
    if (defined($::IF_CHECK_METHOD) && $::IF_CHECK_METHOD ne "")
    {
        if (ifCheck($refAIXNodes, 'AIX') != $::OK)
        {
            return $::NOK;
        }
    }
    if (defined($::HCA))
    {
        if (HCACheckAIX($refAIXNodes) != $::OK)
        {
            return $::NOK;
        }
    }

    # Report success explicitly instead of leaking the value of whichever
    # condition happened to be evaluated last.
    return $::OK;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    procsrCheckLinux

          Check processor clock speed for Linux nodes.  Runs
          $::LinuxProcCmd on all nodes with one xdsh call and reports, per
          node, every processor whose clock speed is below
          $::MIN_CLOCK_SPEED.

Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.

Returns:
        $::OK - The xdsh command ran; slow processors, if any, were reported.
        $::NOK - The xdsh command failed.

=cut

#-------------------------------------------------------------------------------
sub procsrCheckLinux()
{
    my ($refLnxNodes) = @_;
    my $ValidLnxNodes = join(",", @$refLnxNodes);

    my $banner = "Checking processor clock speed for nodes: $ValidLnxNodes...\n";
    print "\n$banner";
    print $::LOG_FILE_HANDLE $banner;

    my $checkCmd = "xdsh $ValidLnxNodes \"$::LinuxProcCmd\" 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        my $failure = "Command failed: $checkCmd.\n";
        print $failure;
        print $::LOG_FILE_HANDLE $failure;
        return $::NOK;
    }

    my ($host, $pcrnum);
    my %slowOn;    # host => accumulated report lines
    for my $line (@output)
    {
        chomp $line;

        # Every line starts with the node hostname
        $host = $1 if ($line =~ /(\S*):.*/);

        # Remember the processor number for the clock line that follows
        $pcrnum = $1 if ($line =~ /\S+: processor\s*: (\d+)/);

        next unless ($line =~ /\S+: clock\s*: (.*)MHz/);
        my $clspeed = $1;
        next unless ($clspeed < $::MIN_CLOCK_SPEED);

        $slowOn{$host} .= "Processor $pcrnum clock speed is $clspeed MHz. UNEXPECTED LOW FREQUENCY.\n";
    }

    unless (%slowOn)
    {
        my $allGood = "The processor clock speed of all nodes is normal.\n";
        print $allGood;
        print $::LOG_FILE_HANDLE $allGood;
        return $::OK;
    }

    for my $h (sort keys %slowOn)
    {
        my $report = "$h:\n$slowOn{$h}";
        print $report;
        print $::LOG_FILE_HANDLE $report;
    }
    return $::OK;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    procsrCheckAIX

	      Check processor clock speed for AIX nodes.  Nodes on which
	      $::AIXProcCmd is not available are left out; on the others the
	      command is run with -M through one xdsh call and every node whose
	      reported speed is below $::MIN_CLOCK_SPEED is printed and logged.

Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.

Returns:
        $::OK - The xdsh command ran; slow nodes, if any, were reported.
        $::NOK - The command is available on no node, or xdsh failed.

=cut

#-------------------------------------------------------------------------------
sub procsrCheckAIX()
{
    my ($refAIXNodes) = @_;
    my $slowFound = 0;

    my $banner = "Checking processor clock speed for nodes: "
      . join(", ", @$refAIXNodes) . "...\n";
    print "\n$banner";
    print $::LOG_FILE_HANDLE $banner;

    my @ValidAIXNodes = &checkCmdAvailability($refAIXNodes, $::AIXProcCmd);
    return $::NOK unless (scalar(@ValidAIXNodes));

    my $ValidAIXNodes = join(",", @ValidAIXNodes);
    my $checkCmd = "xdsh $ValidAIXNodes \"$::AIXProcCmd -M\" 2>/dev/null";
    print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?)
    {
        my $failure = "Command failed: $checkCmd.\n";
        print $failure;
        print $::LOG_FILE_HANDLE $failure;
        return $::NOK;
    }

    my $host;
    for my $line (@output)
    {
        chomp $line;

        # Every line starts with the node hostname
        $host = $1 if ($line =~ /(\S*):.*/);

        next unless ($line =~ /\S+:\s*This machine runs at (\d+) MHz/);
        my $clspeed = $1;
        next unless ($clspeed < $::MIN_CLOCK_SPEED);

        $slowFound = 1;
        my $report = "$host processor clock speed is $clspeed MHz. UNEXPECTED LOW FREQUENCY.\n";
        print $report;
        print $::LOG_FILE_HANDLE $report;
    }

    unless ($slowFound)
    {
        my $allGood = "The processor clock speed of all nodes is normal.\n";
        print $allGood;
        print $::LOG_FILE_HANDLE $allGood;
    }
    return $::OK;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    ifCheck

          Check ib interfaces status with the method in $::IF_CHECK_METHOD.

          LL:   run the llstatus command for the given OS with -a on the
                first node where it is available (one node is enough, it
                returns the status of all nodes) and report every ib
                interface whose line does not contain READY.
          RSCT: run lsrsrc for IBM.NetworkInterface on every node where it is
                available and report every interface whose name contains
                "ib" and whose OpState is not 1.

Arguments:
        $refNodes - The reference to all nodes.
        $os - Linux or AIX

Returns:
        $::OK - The check ran; interfaces that are down were reported.
        $::NOK - Unknown method or OS, the command is available on no node,
                 or the xdsh command failed.

=cut

#-------------------------------------------------------------------------------
sub ifCheck
{
    my ($refNodes, $os) = @_;
    my $cmd;
    my $abnormal = 0;
    my $allnodes = join ", ", @$refNodes;
    my $method = defined($::IF_CHECK_METHOD) ? $::IF_CHECK_METHOD : "";

    if ($method eq "LL")
    {
        if ($os eq 'Linux')
        {
            $cmd = $::IBifLLCmdL;
        }
        elsif ($os eq 'AIX')
        {
            $cmd = $::IBifLLCmdA;
        }
        else
        {
            return $::NOK;
        }
    }
    elsif ($method eq "RSCT")
    {
        $cmd = $::IBifRSCTCmd;
    }
    else
    {
        return $::NOK;
    }

    print "\nChecking IB interface status using command $cmd for nodes: $allnodes...\n";
    print $::LOG_FILE_HANDLE "Checking IB interface status using command $cmd for nodes: $allnodes...\n";

    my @ValidNodes = &checkCmdAvailability($refNodes, $cmd);
    if (!scalar(@ValidNodes))
    {
        return $::NOK;
    }

    #Method is "LL"
    if ($method eq "LL")
    {
        #Run the LL command on only one node, this is enough because it will return status of all nodes
        my $checkCmd = "xdsh $ValidNodes[0] \"$cmd -a\" 2>/dev/null";
        print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
        my @output = `$checkCmd`;
        if ($?)
        {
            print "Command failed: $checkCmd.\n";
            print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
            return $::NOK;
        }

        #Record whether the next line is the beginning of a new section
        my $armed = 0;
        my $host;

        #Record abnormal ib interface numbers of a node
        my @ib_array = ();
        foreach my $line (@output)
        {
            chomp $line;

            #The text returned by llstatus is separated into several sections by "=================="
            if ($line =~ /==================/)
            {
                #Report what was found in the section that just ended.
                #The numbers are sorted numerically so ib2 precedes ib10.
                &_ifCheckReportDown($host, map { "ib$_" } sort { $a <=> $b } @ib_array);
                @ib_array = ();
                $armed    = 1;
            }
            elsif (($armed) && ($line =~ /\S+?:\s*(\S+)/))
            {    #The first line of a new section carries the node hostname
                $host  = $1;
                $armed = 0;
            }
            elsif ($line =~ /\S+?:\s*ib(\d+).*/)
            {
                #Save the interface number before the next match runs
                my $ibnum = $1;
                if ($line !~ /READY/)
                {
                    $abnormal = 1;
                    push @ib_array, $ibnum;
                }
            }
        }

        #Report what was found in the last section
        &_ifCheckReportDown($host, map { "ib$_" } sort { $a <=> $b } @ib_array);
    }

    #Method is "RSCT"
    elsif ($method eq "RSCT")
    {
        my $ValidNodes = join(",", @ValidNodes);
        my $checkCmd = "xdsh $ValidNodes \"$::IBifRSCTCmd IBM.NetworkInterface Name OpState\" 2>/dev/null";
        print $::LOG_FILE_HANDLE "Running command: $checkCmd.\n";
        my @output = `$checkCmd`;
        if ($?)
        {
            print "Command failed: $checkCmd.\n";
            print $::LOG_FILE_HANDLE "Command failed: $checkCmd.\n";
            return $::NOK;
        }

        my $host;
        my $prevhost;
        my $ifname;
        my @ib_array = ();
        foreach my $line (@output)
        {
            chomp $line;
            if ($line =~ /(\S*):.*/)
            {
                #Get node hostname
                $host = $1;

                #A different host marks the beginning of the section for
                #another node: report what was found for the previous one.
                if (defined($prevhost) && $host ne $prevhost)
                {
                    &_ifCheckReportDown($prevhost, sort @ib_array);
                    @ib_array = ();
                }
                $prevhost = $host;
            }
            if ($line =~ /\S*:\s*Name\s*=\s*\"(.*)\"/)
            {
                #Get interface name
                $ifname = $1;
            }
            if ($line =~ /\S+:\s*OpState\s*=\s*(\d+)/)
            {
                #Check the status of the interface named last
                my $opstate = $1;
                if ($opstate != 1 && defined($ifname) && $ifname =~ /ib/)
                {
                    $abnormal = 1;
                    push @ib_array, $ifname;
                }
            }
        }

        #Report what was found for the last node
        &_ifCheckReportDown($host, sort @ib_array);
    }

    if (!$abnormal)
    {
        print "IB interfaces of all nodes are normal.\n";
        print $::LOG_FILE_HANDLE "IB interfaces of all nodes are normal.\n";
    }
    return $::OK;
}

# Print and log the "IB interface(s) down" line for one host.
# Does nothing when no interface names are passed.
sub _ifCheckReportDown
{
    my ($host, @interfaces) = @_;
    return if (!@interfaces);

    my $report = "$host IB interface(s) down: "
      . join('', map { "$_ " } @interfaces) . "\n";
    print $report;
    print $::LOG_FILE_HANDLE $report;
    return;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    memCheckLinux

          Check total memory and free large pages for Linux nodes.

          Runs $::LinuxMemCmd on the given nodes through xdsh and scans the
          "MemTotal:" and "HugePages_Free:" lines of the output. A node is
          reported when its MemTotal (in kB) is below $::MIN_MEMORY * 1000,
          or when its HugePages_Free value is below $::MIN_FREELP. A check
          whose threshold is undefined is skipped entirely.

Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.
Returns:
        $::NOK - the xdsh command exited with a non-zero status.
        $::OK  - the output was processed (whether or not nodes were reported).

=cut

#-------------------------------------------------------------------------------
sub memCheckLinux() {
    my ($refLnxNodes) = @_;
    my $nodeList = join(",", @$refLnxNodes);

    # Emit one message on the screen and in the log file.
    my $tell = sub {
        my ($text) = @_;
        print $text;
        print {$::LOG_FILE_HANDLE} $text;
    };

    print "\nChecking memory for nodes: $nodeList...\n";
    print {$::LOG_FILE_HANDLE} "Checking memory for nodes: $nodeList...\n";

    my $xdshCmd = "xdsh $nodeList \"$::LinuxMemCmd\" 2>/dev/null";
    print {$::LOG_FILE_HANDLE} "Running command: $xdshCmd.\n";
    my @replies = `$xdshCmd`;
    if ($? != 0) {
        $tell->("Command failed: $xdshCmd.\n");
        return $::NOK;
    }

    # An undefined threshold switches the corresponding check off.
    my $checkMem       = defined($::MIN_MEMORY);
    my $checkFreeLp    = defined($::MIN_FREELP);
    my $memTooSmall    = 0;
    my $freeLpTooSmall = 0;

    foreach my $reply (@replies) {
        chomp $reply;

        # "<node>: MemTotal: <n> kB"
        if ($checkMem && $reply =~ /(\S*): MemTotal:\s*(\d+) kB/) {
            my ($node, $totalKB) = ($1, $2);
            if ($totalKB < ($::MIN_MEMORY * 1000)) {
                $memTooSmall = 1;
                my $totalMB = $totalKB / 1000;

                # The screen shows the size in MB, the log keeps the raw kB value.
                print "$node total memory is $totalMB MB. UNEXPECTED small size.\n";
                print {$::LOG_FILE_HANDLE} "$node total memory is $totalKB kB. UNEXPECTED small size.\n";
            }
        }

        # "<node>: HugePages_Free: <n>"
        if ($checkFreeLp && $reply =~ /(\S*): HugePages_Free:\s*(\d+)/) {
            my ($node, $freePages) = ($1, $2);
            if ($freePages < $::MIN_FREELP) {
                $freeLpTooSmall = 1;
                $tell->("$node Free large page is $freePages. UNEXPECTED small size.\n");
            }
        }
    }

    # Summary lines are printed only for the checks that were actually enabled.
    if ($checkMem && !$memTooSmall) {
        $tell->("Memory size of all nodes are normal.\n");
    }
    if ($checkFreeLp && !$freeLpTooSmall) {
        $tell->("Free large page number of all nodes are normal.\n");
    }
    return $::OK;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    memCheckAIX

          Check total memory and free large pages for AIX nodes.

          The node list is first filtered through checkCmdAvailability()
          with $::AIXMemCmd; "$::AIXMemCmd -l" is then run on the remaining
          nodes through xdsh. The "System Configuration: lcpu=.. mem=..MB"
          line supplies the memory size, which is compared with
          $::MIN_MEMORY. The row of numeric columns supplies the free large
          page count (the captured trailing column), which is compared with
          $::MIN_FREELP. A check whose threshold is undefined is skipped.

Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.
Returns:
        $::NOK - checkCmdAvailability() returned no nodes, or the xdsh
                 command exited with a non-zero status.
        $::OK  - the output was processed (whether or not nodes were reported).

=cut

#-------------------------------------------------------------------------------
sub memCheckAIX() {
    my ($refAIXNodes) = @_;

    # Emit one message on the screen and in the log file.
    my $tell = sub {
        my ($text) = @_;
        print $text;
        print {$::LOG_FILE_HANDLE} $text;
    };

    my $requested = join ", ", @$refAIXNodes;
    print "\nChecking memory for nodes $requested...\n";
    print {$::LOG_FILE_HANDLE} "Checking memory for nodes $requested...\n";

    # checkCmdAvailability is defined elsewhere in this file; only the nodes
    # it returns are queried. Nothing left means nothing to check.
    my @usableNodes = &checkCmdAvailability($refAIXNodes, $::AIXMemCmd);
    return $::NOK unless @usableNodes;

    my $nodeList = join(",", @usableNodes);
    my $xdshCmd  = "xdsh $nodeList \"$::AIXMemCmd -l\" 2>/dev/null";

    print {$::LOG_FILE_HANDLE} "Running command: $xdshCmd.\n";
    my @replies = `$xdshCmd`;
    if ($? != 0) {
        $tell->("Command failed: $xdshCmd.\n");
        return $::NOK;
    }

    # An undefined threshold switches the corresponding check off.
    my $checkMem       = defined($::MIN_MEMORY);
    my $checkFreeLp    = defined($::MIN_FREELP);
    my $memTooSmall    = 0;
    my $freeLpTooSmall = 0;

    foreach my $reply (@replies) {
        chomp $reply;

        # Header line: node name in $1, memory size in MB in $3.
        if ($checkMem && $reply =~ /(\S*):\s+System Configuration: lcpu=(\d+)\s+mem=(\d+)\s*MB/) {
            my ($node, $totalMB) = ($1, $3);
            if ($totalMB < $::MIN_MEMORY) {
                $memTooSmall = 1;
                $tell->("$node total memory is $totalMB MB. UNEXPECTED small size\n");
            }
        }

        # Statistics row: a run of numeric columns; the captured column is
        # taken as the number of free large pages.
        if ($checkFreeLp && $reply =~ /(\S*):\s*\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)/) {
            my ($node, $freePages) = ($1, $2);
            if ($freePages < $::MIN_FREELP) {
                $freeLpTooSmall = 1;
                $tell->("$node Free large page is $freePages. UNEXPECTED small size\n");
            }
        }
    }

    # Summary lines are printed only for the checks that were actually enabled.
    if ($checkMem && !$memTooSmall) {
        $tell->("Memory size of all nodes are normal.\n");
    }
    if ($checkFreeLp && !$freeLpTooSmall) {
        $tell->("Free large page number of all nodes is normal.\n");
    }
    return $::OK;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    HCACheckAIX

          Check HCA status for AIX nodes.

          The node list is first filtered through checkCmdAvailability()
          with $::AIXHCACmd; "$::AIXHCACmd -v" is then run on the remaining
          nodes through xdsh and its output is reduced by egrep to the port
          header, port state, and "Physical Port ..." lines.

          For every "IB PORT <port> INFORMATION (iba<hca>)" section the
          interface number is computed as hca * 2 + port - 1. Interfaces
          named in @IgnoreList are skipped. For the others the following
          is verified:
              Logical Port State            must be "Active"
              Physical Port State           must be "Active"
              Physical Port Physical State  must be "Link Up"
              Physical Port Speed           must equal $::HCASPEED
              Physical Port Width           must equal $::WIDTH
          When $::HCASPEED is undefined (or $::WIDTH is the empty string)
          the speed (or width) is not verified but listed for every
          interface after the scan.

Arguments:
        $refAIXNodes - The reference to the group of AIX nodes.
Returns:
        $::NOK - checkCmdAvailability() returned no nodes, or the xdsh
                 command exited with a non-zero status.
        $::OK  - the output was processed (whether or not problems were
                 reported).

=cut

#-------------------------------------------------------------------------------
sub HCACheckAIX() {
    my ($refAIXNodes) = @_;
    my $abnormal = 0;

    # Emit one message on the screen and in the log file.
    my $report = sub {
        my ($msg) = @_;
        print $msg;
        print {$::LOG_FILE_HANDLE} $msg;
    };

    my $allnodes = join ", ", @$refAIXNodes;
    print "\nChecking HCA status for nodes $allnodes...\n";
    print {$::LOG_FILE_HANDLE} "Checking HCA status for nodes $allnodes...\n";

    # checkCmdAvailability is defined elsewhere in this file; only the nodes
    # it returns are queried.
    my @ValidAIXNodes = &checkCmdAvailability($refAIXNodes, $::AIXHCACmd);
    if (!scalar(@ValidAIXNodes)) {
        return $::NOK;
    }

    my $ValidAIXNodes = join(",", @ValidAIXNodes);
    my $checkCmd = "xdsh $ValidAIXNodes \"$::AIXHCACmd -v | egrep 'IB PORT.*INFO|Port State:|Physical Port'\" 2>/dev/null";
    print {$::LOG_FILE_HANDLE} "Running command: $checkCmd.\n";
    my @output = `$checkCmd`;
    if ($?) {
        $report->("Command failed: $checkCmd.\n");
        return $::NOK;
    }

    # Speed/width per "<host>: ib<N>", filled only when the matching expected
    # value ($::HCASPEED / $::WIDTH) was not supplied.
    my %speedhash;
    my %widthhash;

    # Scan state: xdsh prefixes every line with the node name, and the
    # attribute lines belong to the most recent port header seen.
    my $host;
    my $if_id;
    my $ignore = 0;

    foreach my $line (@output) {
        chomp $line;

        # Get host name
        if ($line =~ /(\S*): (.*)/) {
            $host = $1;
        }

        # Port header: get port number and HCA ID, derive the interface number.
        if ($line =~ /\S*:\s*IB PORT (\d+) INFORMATION \(iba(\d+)\)/) {
            my ($port, $hca_id) = ($1, $2);
            $if_id = $hca_id * 2 + $port - 1;

            # Remember whether this interface is in the ignore list; the flag
            # stays in force until the next port header.
            my $if_name = "ib$if_id";
            $ignore = (grep { $_ eq $if_name } @IgnoreList) ? 1 : 0;
            next;
        }

        # Attribute lines of an ignored interface are not examined.
        next if $ignore;

        # Check Logical Port State
        if ($line =~ /\S*:\s*Logical Port State:\s*(\w+)/) {
            my $state = $1;
            if ($state ne "Active") {
                $abnormal = 1;
                $report->("$host: ib$if_id Logical Port State is $state. UNEXPECTED STATE.\n");
            }
            next;
        }

        # Check Physical Port State
        if ($line =~ /\S*:\s*Physical Port State:\s*(\w+)/) {
            my $state = $1;
            if ($state ne "Active") {
                $abnormal = 1;
                $report->("$host: ib$if_id Physical Port State is $state. UNEXPECTED STATE.\n");
            }
            next;
        }

        # Check Physical Port Physical State.
        # The value is free text that may contain a blank ("Link Up"), so
        # capture any characters up to the end of the line, without trailing
        # whitespace, rather than a single word.
        if ($line =~ /\S*:\s*Physical Port Physical State:\s*(.+?)\s*$/) {
            my $state = $1;
            if ($state ne "Link Up") {
                $abnormal = 1;
                $report->("$host: ib$if_id Physical Port Physical State is $state. UNEXPECTED STATE.\n");
            }
            next;
        }

        # Check speed
        if ($line =~ /\S*:\s*Physical Port Speed:\s*(.+)\s*G/) {
            my $speed = $1;
            if (!defined($::HCASPEED)) {
                $speedhash{"$host: ib$if_id"} = $speed;
            }
            elsif ($speed != $::HCASPEED) {
                $abnormal = 1;
                $report->("$host: ib$if_id speed is ${speed}Gbps. UNEXPECTED SPEED.\n");
            }
            next;
        }

        # Check width
        if ($line =~ /\S*:\s*Physical Port Width:\s*(\w+)/) {
            my $width = $1;
            if ($::WIDTH eq "") {
                $widthhash{"$host: ib$if_id"} = $width;
            }
            elsif ($width ne $::WIDTH) {
                $abnormal = 1;
                $report->("$host: ib$if_id Physical Port Width is $width. UNEXPECTED WIDTH.\n");
            }
            next;
        }
    }

    # List the recorded speeds and widths (empty when the expected values
    # were supplied and therefore verified above).
    foreach my $key (sort keys %speedhash) {
        $report->("$key: Physical Port Speed: $speedhash{$key}G\n");
    }
    foreach my $key (sort keys %widthhash) {
        $report->("$key: Physical Port Width: $widthhash{$key}\n");
    }

    # All are normal
    if (!$abnormal) {
        $report->("HCA status of all nodes is normal.\n");
    }
    return $::OK;
}
 | |
| 
 | |
#-------------------------------------------------------------------------------

=head3    HCACheckLinux

          Check HCA status for Linux nodes.

          The node list is first filtered through checkCmdAvailability()
          with $::LinuxHCACmd; "$::LinuxHCACmd -v" is then run on the
          remaining nodes through xdsh and its output is reduced by egrep
          to the hca_id, port, state, width and speed lines.

          The interface number is computed from the latest "hca_id: ehca<N>"
          and "port: <P>" lines as N * 2 + P - 1. Interfaces named in
          @IgnoreList are skipped. For the others the following is verified:
              state         must be "PORT_ACTIVE"
              phys_state    must be "LINK_UP"
              active_width  must equal $::WIDTH
              active_speed  must equal $::HCASPEED
          When $::HCASPEED is undefined (or $::WIDTH is the empty string)
          the speed (or width) is not verified but listed for every
          interface after the scan.

Arguments:
        $refLnxNodes - The reference to the group of Linux nodes.
Returns:
        $::NOK - checkCmdAvailability() returned no nodes, or the xdsh
                 command exited with a non-zero status.
        $::OK  - the output was processed (whether or not problems were
                 reported).

=cut

#-------------------------------------------------------------------------------
sub HCACheckLinux() {
    my ($refLnxNodes) = @_;
    my $problemSeen = 0;

    # Emit one message on the screen and in the log file.
    my $tell = sub {
        my ($text) = @_;
        print $text;
        print {$::LOG_FILE_HANDLE} $text;
    };

    my $requested = join ", ", @$refLnxNodes;
    print "\nChecking HCA status for nodes: $requested...\n";
    print {$::LOG_FILE_HANDLE} "Checking HCA status for nodes: $requested...\n";

    # checkCmdAvailability is defined elsewhere in this file; only the nodes
    # it returns are queried. Nothing left means nothing to check.
    my @usableNodes = &checkCmdAvailability($refLnxNodes, $::LinuxHCACmd);
    return $::NOK unless @usableNodes;

    my $nodeList = join(",", @usableNodes);
    my $xdshCmd = "xdsh $nodeList \"$::LinuxHCACmd -v | egrep 'ehca|port:|state:|width:|speed:'\" 2>/dev/null";
    print {$::LOG_FILE_HANDLE} "Running command: $xdshCmd.\n";
    my @replies = `$xdshCmd`;
    if ($? != 0) {
        $tell->("Command failed: $xdshCmd.\n");
        return $::NOK;
    }

    # Speed/width per "<node>: ib<N>", filled only when the matching expected
    # value ($::HCASPEED / $::WIDTH) was not supplied.
    my %seenSpeed;
    my %seenWidth;

    # Scan state: attribute lines belong to the most recent hca_id/port lines.
    my $node;
    my $hcaNum;
    my $ifNum;
    my $skip = 0;

    foreach my $reply (@replies) {
        chomp $reply;

        # xdsh prefixes every line with the node name.
        if ($reply =~ /(\S*): (.*)/) {
            $node = $1;
        }

        # HCA ID
        if ($reply =~ /\S*:\s*hca_id:\s*ehca(\d+)/) {
            $hcaNum = $1;
            next;
        }

        # Port number: derive the interface number and look it up in the
        # ignore list; the result stays in force until the next port line.
        if ($reply =~ /\S*:\s*port:\s*(\d+)/) {
            my $portNum = $1;
            $ifNum = $hcaNum * 2 + $portNum - 1;
            my $ifName = "ib$ifNum";
            $skip = (grep { $_ eq $ifName } @IgnoreList) ? 1 : 0;
            next;
        }

        # Attribute lines of an ignored interface are not examined.
        next if $skip;

        # Port state
        if ($reply =~ /\S*:\s*state:\s*(\w+)\s*\(.*/) {
            my $state = $1;
            if ($state ne "PORT_ACTIVE") {
                $problemSeen = 1;
                $tell->("$node: ib$ifNum state is $state. UNEXPECTED STATE.\n");
            }
            next;
        }

        # Active width
        if ($reply =~ /\S*:\s*active_width:\s*(\w+)\s*\(.*/) {
            my $width = $1;
            if ($::WIDTH eq "") {
                $seenWidth{"$node: ib$ifNum"} = $width;
            }
            elsif ($width ne $::WIDTH) {
                $problemSeen = 1;
                $tell->("$node: ib$ifNum active_width is $width. UNEXPECTED WIDTH.\n");
            }
            next;
        }

        # Active speed
        if ($reply =~ /\S*:\s*active_speed:\s*(.+)\s*Gbps.*/) {
            my $speed = $1;
            if (!defined($::HCASPEED)) {
                $seenSpeed{"$node: ib$ifNum"} = $speed;
            }
            elsif ($speed != $::HCASPEED) {
                $problemSeen = 1;
                $tell->("$node: ib$ifNum active_speed is ${speed}Gbps. UNEXPECTED SPEED.\n");
            }
            next;
        }

        # Physical state
        if ($reply =~ /\S*:\s*phys_state:\s*(\w+)\s*\(.*/) {
            my $state = $1;
            if ($state ne "LINK_UP") {
                $problemSeen = 1;
                $tell->("$node: ib$ifNum phys_state is $state. UNEXPECTED STATE.\n");
            }
            next;
        }
    }

    # List the recorded speeds and widths (empty when the expected values
    # were supplied and therefore verified above).
    for my $ifKey (sort keys %seenSpeed) {
        $tell->("$ifKey: Physical Port Speed: $seenSpeed{$ifKey}G\n");
    }
    for my $ifKey (sort keys %seenWidth) {
        $tell->("$ifKey: Physical Port Width: $seenWidth{$ifKey}\n");
    }

    # Nothing was reported: say so explicitly.
    unless ($problemSeen) {
        $tell->("HCA status of all nodes is normal.\n");
    }
    return $::OK;
}
 |