mirror of
				https://github.com/xcat2/xcat-core.git
				synced 2025-10-31 11:22:27 +00:00 
			
		
		
		
	added node status monitoring using RMC
git-svn-id: https://svn.code.sf.net/p/xcat/code/xcat-core/trunk@1731 8638fb3e-16cb-4fca-ae20-7b5d299a9bcd
This commit is contained in:
		| @@ -12,6 +12,7 @@ use Socket; | ||||
| use xCAT::Utils; | ||||
| use xCAT::GlobalDef; | ||||
| use xCAT_monitoring::monitorctrl; | ||||
| use xCAT_monitoring::xcatmon; | ||||
| use xCAT::MsgUtils; | ||||
|  | ||||
| #print "xCAT_monitoring::rmcmon loaded\n"; | ||||
| @@ -60,12 +61,13 @@ sub start { | ||||
|   if (! -e "/usr/bin/lsrsrc") { | ||||
|     return (1, "RSCT is not installed.\n"); | ||||
|   } | ||||
|   my $result=`/usr/bin/lssrc -s ctrmc`;   #TODO: change this | ||||
|   if ($result !~ /active/) { | ||||
|  | ||||
|   chomp(my $pid= `/bin/ps -ef | /bin/grep rmcd | /bin/grep -v grep | /bin/awk '{print \$2}'`); | ||||
|   unless($pid){ | ||||
|     #restart rmc daemon | ||||
|     $result=`startsrc -s ctrmc`; | ||||
|     if ($?) { | ||||
|       return (1, "rmc deamon cannot be started\n"); | ||||
|       return (1, "RMC deamon cannot be started\n"); | ||||
|     } | ||||
|   } | ||||
|  | ||||
| @@ -255,10 +257,135 @@ sub supportNodeStatusMon { | ||||
| #-------------------------------------------------------------------------------- | ||||
| sub startNodeStatusMon { | ||||
|   #print "rmcmon::startNodeStatusMon called\n"; | ||||
|   return (0, "started"); | ||||
|   my $retcode=0; | ||||
|   my $retmsg="started"; | ||||
|   my $isSV=xCAT::Utils->isServiceNode(); | ||||
|   if ($isSV) { return  ($retcode, $retmsg); }  | ||||
|  | ||||
|   #get all the nodes status from IBM.MngNode class of local host and  | ||||
|   #the identification of this node | ||||
|   my $noderef=xCAT_monitoring::monitorctrl->getMonHierarchy(); | ||||
|   my @hostinfo=xCAT::Utils->determinehostname(); | ||||
|   %iphash=(); | ||||
|   foreach(@hostinfo) {$iphash{$_}=1;} | ||||
|   if (!$isSV) { $iphash{'noservicenode'}=1;} | ||||
|  | ||||
|   my @servicenodes=(); | ||||
|   my %status_hash=(); | ||||
|   foreach my $key (keys (%$noderef)) { | ||||
|     my @key_a=split(',', $key); | ||||
|     if (! $iphash{$key_a[0]}) { push @servicenodes, $key_a[0]; }  | ||||
|     my $mon_nodes=$monservers->{$key}; | ||||
|     foreach(@$mon_nodes) { | ||||
|       my $node_info=$_; | ||||
|       $status_hash{$node_info->[0]}=$node_info->[2]; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   #get nodestatus from RMC and update the xCAT DB | ||||
|   ($retcode, $retmsg) = saveRMCNodeStatusToxCAT(\%status_hash); | ||||
|   if ($retcode != 0) { | ||||
|     $retmsg="Error occurred while updating xCAT node status from RMC data.:$retmsg"; | ||||
|     xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n"); | ||||
|   } | ||||
|   foreach (@servicenodes) { | ||||
|     ($retcode, $retmsg) = saveRMCNodeStatusToxCAT(\%status_hash, $_); | ||||
|     if ($retcode != 0) { | ||||
|       $retmsg="Error occurred while updating xCAT node status from RMC data from $_.:$retmsg"; | ||||
|       xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n"); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   #start monitoring the status of mn's immediate children | ||||
|   my $result=`startcondresp NodeReachability UpdatexCATNodeStatus 2>&1`; | ||||
|   if ($?) { | ||||
|     $retcode=$?; | ||||
|     $retmsg="Error start node status monitoring: $result"; | ||||
|     xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n"); | ||||
|   } | ||||
|  | ||||
|   #start monitoring the status of mn's grandchildren via their service nodes | ||||
|   $result=`startcondresp NodeReachability_H UpdatexCATNodeStatus 2>&1`; | ||||
|   if ($?) { | ||||
|     $retcode=$?; | ||||
|     $retmsg="Error start node status monitoring: $result"; | ||||
|     xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n"); | ||||
|   } | ||||
|   | ||||
|   return ($retcode, $retmsg); | ||||
| } | ||||
|  | ||||
|  | ||||
| #-------------------------------------------------------------------------------- | ||||
| =head3   saveRMCNodeStatusToxCAT | ||||
|     This function gets RMC node status and save them to xCAT database | ||||
|  | ||||
|     Arguments: | ||||
|         $oldstatus a pointer to a hash table that has the current node status | ||||
|         $node  the name of the service node to run RMC command from. If null, get from local host.  | ||||
|     Returns: | ||||
|         (return code, message) | ||||
| =cut | ||||
| #-------------------------------------------------------------------------------- | ||||
| sub saveRMCNodeStatusToxCAT { | ||||
|   #print "rmcmon::saveRMCNodeStatusToxCAT called\n"; | ||||
|   my $retcode=0; | ||||
|   my $retmsg="started"; | ||||
|   my $statusref=shift; | ||||
|   if ($statusref =~ /xCAT_monitoring::rmcmon/) { | ||||
|     $statusref=shift; | ||||
|   } | ||||
|   my $node=shift; | ||||
|  | ||||
|   %status_hash=%$statusref; | ||||
|  | ||||
|   #get all the node status from mn's children | ||||
|   my $result; | ||||
|   if ($node) { | ||||
|     $result=`lsrsrc-api -s IBM.MngNode::::Name::Status 2>&1`; | ||||
|   } else { | ||||
|     $result=`lsrsrc-api -s IBM.MngNode::::Name::Status 2>&1`; | ||||
|   } | ||||
|   if ($?) { | ||||
|     $retcode=$?; | ||||
|     $retmsg=$result; | ||||
|     xCAT::MsgUtils->message('SI', "[mon]: Error getting node status from RMC: $result\n"); | ||||
|     return ($retcode, $retmsg); | ||||
|   } else { | ||||
|     my @active_nodes=(); | ||||
|     my @inactive_nodes=(); | ||||
|     if ($result) { | ||||
|       my @lines=split('\n', $result); | ||||
|       #only save the ones that needs to change | ||||
|       foreach (@lines) { | ||||
| 	@pairs=split('::', $_); | ||||
|         if ($pairs[1]==1) {  | ||||
|           if ($status_hash{$pairs[0]} ne $::STATUS_ACTIVE) { push @active_nodes,$pairs[0];}  | ||||
|         } | ||||
|         else {  | ||||
|           if ($status_hash{$pairs[0]} ne $::STATUS_INACTIVE) { push @inactive_nodes, $pairs[0];} | ||||
|         }   | ||||
|       }  | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   my %new_node_status=(); | ||||
|   if (@active_nodes>0) { | ||||
|     $new_node_status{$::STATUS_ACTIVE}=\@active_nodes; | ||||
|   }  | ||||
|   if (@inactive_nodes>0) { | ||||
|     $new_node_status{$::STATUS_INACTIVE}=\@inactive_nodes; | ||||
|   } | ||||
|   #only set the node status for the changed ones | ||||
|   if (keys(%new_node_status) > 0) { | ||||
|     xCAT_monitoring::xcatmon::processNodeStatusChanges(\%new_node_status); | ||||
|   }   | ||||
|   return ($retcode, $retmsg); | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| #-------------------------------------------------------------------------------- | ||||
| =head3   stopNodeStatusMon | ||||
|     This function is called by the monitorctrl module to tell | ||||
| @@ -273,7 +400,27 @@ sub startNodeStatusMon { | ||||
| #-------------------------------------------------------------------------------- | ||||
| sub stopNodeStatusMon { | ||||
|   #print "rmcmon::stopNodeStatusMon called\n"; | ||||
|   return (0, "stopped"); | ||||
|   my $retcode=0; | ||||
|   my $retmsg="stopped"; | ||||
|   my $isSV=xCAT::Utils->isServiceNode(); | ||||
|   if ($isSV) { return  ($retcode, $retmsg); } | ||||
|   | ||||
|   #stop monitoring the status of mn's immediate children | ||||
|   my $result=`stopcondresp NodeReachability UpdatexCATNodeStatus 2>&1`; | ||||
|   if ($?) { | ||||
|     $retcode=$?; | ||||
|     $retmsg="Error stop node status monitoring: $result"; | ||||
|     xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n"); | ||||
|   } | ||||
|  | ||||
|   #stop monitoring the status of mn's grandchildren via their service nodes | ||||
|   $result=`stopcondresp NodeReachability_H UpdatexCATNodeStatus 2>&1`; | ||||
|   if ($?) { | ||||
|     $retcode=$?; | ||||
|     $retmsg="Error stop node status monitoring: $result"; | ||||
|     xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n"); | ||||
|   } | ||||
|   return ($retcode, $retmsg); | ||||
| } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -0,0 +1,8 @@ | ||||
| #!/usr/bin/perl | ||||
|  | ||||
| $RES::EventResponse{'UpdatexCATNodeStatus'} = { | ||||
| 	Name => q(UpdatexCATNodeStatus), | ||||
| 	Locked => q(0), | ||||
| 	Actions => q({[updatexCAT,{127},{0},{86400},/opt/xcat/sbin/rmcmon/updatexcatnodestatus,3,0,0,0,{},0]}), | ||||
| }; | ||||
| 1; | ||||
							
								
								
									
										67
									
								
								xCAT-rmc/scripts/updatexcatnodestatus
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										67
									
								
								xCAT-rmc/scripts/updatexcatnodestatus
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,67 @@ | ||||
| #!/usr/bin/env perl | ||||
| # IBM(c) 2007 EPL license http://www.eclipse.org/legal/epl-v10.html | ||||
|  | ||||
| # This script is used by UpdatexCATNodeStatus event response to handle the node | ||||
| # status changes from the condition NodeReachability and NodeReachability_H | ||||
|  | ||||
|  | ||||
| use strict; | ||||
| use Getopt::Std; | ||||
| use POSIX qw(strftime); | ||||
|  | ||||
| my $cond_name=$ENV{ERRM_COND_NAME}; | ||||
| my $node; | ||||
| my $status; | ||||
|  | ||||
| if ($cond_name eq "NodeReachability") { | ||||
|   $node=$ENV{ERRM_RSRC_NAME};   | ||||
|   $status=$ENV{ERRM_VALUE}; | ||||
| } elsif ($cond_name eq "NodeReachability_H")  { | ||||
|   # Parse the ERRM_VALUE attribute, which will contain the | ||||
|   # LastEvent structured data variable from the Condition class | ||||
|   # The fields in this structured data variable are documented below where we parse them out. | ||||
|   my $event = $ENV{ERRM_VALUE}; | ||||
|   $event =~ s/^\[(.*)\]$/$1/;              # SD variables have square brackets around them | ||||
|  | ||||
|   # This parse the LastEvent | ||||
|   my (                     # split the SD into the following fields: | ||||
| 	$Occurred,           # One if the condition has been triggered | ||||
| 	$ErrNum,             # Non-zero if there was in error in the event registration | ||||
| 	$ErrMsg,             # The string msg related to ErrNum | ||||
| 	$EventFlags,         # Bit mask giving some additional info about the event | ||||
| 	$EventTime,          # Time of event expressed in seconds since 1/1/1970 | ||||
| 	$EventTimeMicros,    # Number of microseconds past EventTime | ||||
| 	$ResourceHandle,     # Binary address of the RMC resource that caused the condition to be triggered | ||||
| 	$NodeName,           # The node on which the event occurred.  For conditions that use the management domain scope (4), | ||||
| 	                     # this will be the leaf node.  For conditions that use the local scope (e.g. NodeReachability), | ||||
| 	                     # this will be the FMS. | ||||
| 	$NumAttrs,           # Number of attr values from the resource returned in this event | ||||
| 	$NumAttrsInExpr,     # How many of the above were attributes in the event expression | ||||
| 	$IndexForAttrs,      # The starting index of the array of values.  Until new fixed fields are added | ||||
| 	                     # to LastEvent, this will be the element right after this one. | ||||
| 	$AttrArray           # This list of attribute names, types, and values | ||||
|      ) = split(/,/, $event, 12); | ||||
|  | ||||
|   my @attrArray = split(/,/, $AttrArray);      # Note: parsing this way does not support SDs or SD Arrays that may be in this list | ||||
|  | ||||
|   my $j = 0;      # index into attrArray | ||||
|   for (my $i=0; $i<$NumAttrs; $i++) { | ||||
|     my $attrName = $attrArray[$j++]; | ||||
|     my $attrType = $attrArray[$j++];          # Types <= 8 are "simple" types. Types > 8 are SDs and arrays. | ||||
|     my $attrValue = $attrArray[$j++]; | ||||
|     if ($attrName eq '"Name"') { $node = $attrValue; } | ||||
|     if ($attrName eq '"Status"') { $status = $attrValue; } | ||||
|   } | ||||
| } else { | ||||
|   print "this script does not handle condition $cond_name\n"; | ||||
|   exit 1; | ||||
| } | ||||
|  | ||||
|  | ||||
| open(CMD, "| wall") || die "Error:  can not start wall command.\n"; | ||||
| print CMD "node=$node status=$status\n"; | ||||
| close(CMD); | ||||
|  | ||||
| exit 0 | ||||
|  | ||||
|  | ||||
		Reference in New Issue
	
	Block a user