diff --git a/xCAT-rmc/plugin/rmcmon.pm b/xCAT-rmc/plugin/rmcmon.pm
index ced5c1140..fc1ab3a83 100644
--- a/xCAT-rmc/plugin/rmcmon.pm
+++ b/xCAT-rmc/plugin/rmcmon.pm
@@ -12,6 +12,7 @@ use Socket;
 use xCAT::Utils;
 use xCAT::GlobalDef;
 use xCAT_monitoring::monitorctrl;
+use xCAT_monitoring::xcatmon;
 use xCAT::MsgUtils;
 
 #print "xCAT_monitoring::rmcmon loaded\n";
@@ -60,12 +61,13 @@ sub start {
   if (! -e "/usr/bin/lsrsrc") {
     return (1, "RSCT is not installed.\n");
   }
-  my $result=`/usr/bin/lssrc -s ctrmc`; #TODO: change this
-  if ($result !~ /active/) {
+
+  chomp(my $pid= `/bin/ps -ef | /bin/grep rmcd | /bin/grep -v grep | /bin/awk '{print \$2}'`);
+  unless($pid){
     #restart rmc daemon
     $result=`startsrc -s ctrmc`;
     if ($?) {
-      return (1, "rmc deamon cannot be started\n");
+      return (1, "RMC deamon cannot be started\n");
     }
   }
 
@@ -255,10 +257,143 @@ sub supportNodeStatusMon {
 #--------------------------------------------------------------------------------
 sub startNodeStatusMon {
   #print "rmcmon::startNodeStatusMon called\n";
-  return (0, "started");
+  my $retcode=0;
+  my $retmsg="started";
+  #nothing to do here when running on a service node
+  my $isSV=xCAT::Utils->isServiceNode();
+  if ($isSV) { return ($retcode, $retmsg); }
+
+  #get all the nodes status from IBM.MngNode class of local host and
+  #the identification of this node
+  my $noderef=xCAT_monitoring::monitorctrl->getMonHierarchy();
+  my @hostinfo=xCAT::Utils->determinehostname();
+  my %iphash=();
+  foreach my $host (@hostinfo) { $iphash{$host}=1; }
+  #this point is only reached when not on a service node, so always set it
+  $iphash{'noservicenode'}=1;
+
+  my @servicenodes=();
+  my %status_hash=();
+  foreach my $key (keys (%$noderef)) {
+    #the first comma separated field of the key is compared with the local names; anything else is kept as a service node
+    my @key_a=split(',', $key);
+    if (! $iphash{$key_a[0]}) { push @servicenodes, $key_a[0]; }
+    #element 0 of each entry is used as the node name, element 2 as its current status
+    my $mon_nodes=$noderef->{$key};
+    foreach my $node_info (@$mon_nodes) {
+      $status_hash{$node_info->[0]}=$node_info->[2];
+    }
+  }
+
+  #get nodestatus from RMC and update the xCAT DB
+  ($retcode, $retmsg) = saveRMCNodeStatusToxCAT(\%status_hash);
+  if ($retcode != 0) {
+    $retmsg="Error occurred while updating xCAT node status from RMC data.:$retmsg";
+    xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n");
+  }
+  foreach my $sn (@servicenodes) {
+    ($retcode, $retmsg) = saveRMCNodeStatusToxCAT(\%status_hash, $sn);
+    if ($retcode != 0) {
+      $retmsg="Error occurred while updating xCAT node status from RMC data from $sn.:$retmsg";
+      xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n");
+    }
+  }
+
+  #start monitoring the status of mn's immediate children
+  my $result=`startcondresp NodeReachability UpdatexCATNodeStatus 2>&1`;
+  if ($?) {
+    $retcode=$?;
+    $retmsg="Error start node status monitoring: $result";
+    xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n");
+  }
+
+  #start monitoring the status of mn's grandchildren via their service nodes
+  $result=`startcondresp NodeReachability_H UpdatexCATNodeStatus 2>&1`;
+  if ($?) {
+    $retcode=$?;
+    $retmsg="Error start node status monitoring: $result";
+    xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n");
+  }
+
+  return ($retcode, $retmsg);
 }
+#--------------------------------------------------------------------------------
+=head3 saveRMCNodeStatusToxCAT
+    This function gets the node status from RMC and saves the ones that
+    differ from the given current status to the xCAT database.
+
+    Arguments:
+        $oldstatus a pointer to a hash table that has the current node status
+        $node  the name of the service node to run RMC command from. If null, get from local host.
+    Returns:
+        (return code, message)
+=cut
+#--------------------------------------------------------------------------------
+sub saveRMCNodeStatusToxCAT {
+  #print "rmcmon::saveRMCNodeStatusToxCAT called\n";
+  my $retcode=0;
+  my $retmsg="started";
+  my $statusref=shift;
+  #skip the package name when this is called as a class method
+  if ($statusref =~ /xCAT_monitoring::rmcmon/) {
+    $statusref=shift;
+  }
+  my $node=shift;
+
+  my %status_hash=%$statusref;
+
+  #get all the node status from mn's children
+  #TODO: both branches still run the same local query; the service node
+  #named by $node is not contacted yet
+  my $result;
+  if ($node) {
+    $result=`lsrsrc-api -s IBM.MngNode::::Name::Status 2>&1`;
+  } else {
+    $result=`lsrsrc-api -s IBM.MngNode::::Name::Status 2>&1`;
+  }
+  if ($?) {
+    $retcode=$?;
+    $retmsg=$result;
+    xCAT::MsgUtils->message('SI', "[mon]: Error getting node status from RMC: $result\n");
+    return ($retcode, $retmsg);
+  }
+
+  #these must live at sub scope because they are used after the parsing loop
+  my @active_nodes=();
+  my @inactive_nodes=();
+  if ($result) {
+    #only save the ones that needs to change
+    foreach my $line (split('\n', $result)) {
+      my @pairs=split('::', $line);
+      if (@pairs < 2) { next; } #not a name::status line
+      if ($pairs[1]==1) {
+        if ($status_hash{$pairs[0]} ne $::STATUS_ACTIVE) { push @active_nodes,$pairs[0];}
+      }
+      else {
+        if ($status_hash{$pairs[0]} ne $::STATUS_INACTIVE) { push @inactive_nodes, $pairs[0];}
+      }
+    }
+  }
+
+  my %new_node_status=();
+  if (@active_nodes>0) {
+    $new_node_status{$::STATUS_ACTIVE}=\@active_nodes;
+  }
+  if (@inactive_nodes>0) {
+    $new_node_status{$::STATUS_INACTIVE}=\@inactive_nodes;
+  }
+  #only set the node status for the changed ones
+  if (keys(%new_node_status) > 0) {
+    xCAT_monitoring::xcatmon::processNodeStatusChanges(\%new_node_status);
+  }
+  return ($retcode, $retmsg);
+}
+
+
+
+
 
 
 #--------------------------------------------------------------------------------
 =head3 stopNodeStatusMon
     This function is called by the monitorctrl module to tell
@@ -273,7 +408,27 @@ sub startNodeStatusMon {
 #--------------------------------------------------------------------------------
 sub stopNodeStatusMon {
   #print "rmcmon::stopNodeStatusMon called\n";
-  return (0, "stopped");
+  my $retcode=0;
+  my $retmsg="stopped";
+  my $isSV=xCAT::Utils->isServiceNode();
+  if ($isSV) { return ($retcode, $retmsg); }
+
+  #stop monitoring the status of mn's immediate children
+  my $result=`stopcondresp NodeReachability UpdatexCATNodeStatus 2>&1`;
+  if ($?) {
+    $retcode=$?;
+    $retmsg="Error stop node status monitoring: $result";
+    xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n");
+  }
+
+  #stop monitoring the status of mn's grandchildren via their service nodes
+  $result=`stopcondresp NodeReachability_H UpdatexCATNodeStatus 2>&1`;
+  if ($?) {
+    $retcode=$?;
+    $retmsg="Error stop node status monitoring: $result";
+    xCAT::MsgUtils->message('SI', "[mon]: $retmsg\n");
+  }
+  return ($retcode, $retmsg);
 }
 
 
diff --git a/xCAT-rmc/resources/mn/IBM.EventResponse/UpdatexCATNodeStatus.pm b/xCAT-rmc/resources/mn/IBM.EventResponse/UpdatexCATNodeStatus.pm
new file mode 100644
index 000000000..a6fddf56f
--- /dev/null
+++ b/xCAT-rmc/resources/mn/IBM.EventResponse/UpdatexCATNodeStatus.pm
@@ -0,0 +1,8 @@
+#!/usr/bin/perl
+
+$RES::EventResponse{'UpdatexCATNodeStatus'} = {
+        Name => q(UpdatexCATNodeStatus),
+        Locked => q(0),
+        Actions => q({[updatexCAT,{127},{0},{86400},/opt/xcat/sbin/rmcmon/updatexcatnodestatus,3,0,0,0,{},0]}),
+};
+1;
diff --git a/xCAT-rmc/scripts/updatexcatnodestatus b/xCAT-rmc/scripts/updatexcatnodestatus
new file mode 100755
index 000000000..f5c89badf
--- /dev/null
+++ b/xCAT-rmc/scripts/updatexcatnodestatus
@@ -0,0 +1,68 @@
+#!/usr/bin/env perl
+# IBM(c) 2007 EPL license http://www.eclipse.org/legal/epl-v10.html
+
+# This script is used by UpdatexCATNodeStatus event response to handle the node
+# status changes from the condition NodeReachability and NodeReachability_H
+
+
+use strict;
+use Getopt::Std;
+use POSIX qw(strftime);
+
+my $cond_name=$ENV{ERRM_COND_NAME};
+my $node;
+my $status;
+
+if ($cond_name eq "NodeReachability") {
+  $node=$ENV{ERRM_RSRC_NAME};
+  $status=$ENV{ERRM_VALUE};
+} elsif ($cond_name eq "NodeReachability_H") {
+  # Parse the ERRM_VALUE attribute, which will contain the
+  # LastEvent structured data variable from the Condition class
+  # The fields in this structured data variable are documented below where we parse them out.
+  my $event = $ENV{ERRM_VALUE};
+  $event =~ s/^\[(.*)\]$/$1/;  # SD variables have square brackets around them
+
+  # This parses the LastEvent
+  my (  # split the SD into the following fields:
+    $Occurred,         # One if the condition has been triggered
+    $ErrNum,           # Non-zero if there was an error in the event registration
+    $ErrMsg,           # The string msg related to ErrNum
+    $EventFlags,       # Bit mask giving some additional info about the event
+    $EventTime,        # Time of event expressed in seconds since 1/1/1970
+    $EventTimeMicros,  # Number of microseconds past EventTime
+    $ResourceHandle,   # Binary address of the RMC resource that caused the condition to be triggered
+    $NodeName,         # The node on which the event occurred. For conditions that use the management domain scope (4),
+                       # this will be the leaf node. For conditions that use the local scope (e.g. NodeReachability),
+                       # this will be the FMS.
+    $NumAttrs,         # Number of attr values from the resource returned in this event
+    $NumAttrsInExpr,   # How many of the above were attributes in the event expression
+    $IndexForAttrs,    # The starting index of the array of values. Until new fixed fields are added
+                       # to LastEvent, this will be the element right after this one.
+    $AttrArray         # This list of attribute names, types, and values
+  ) = split(/,/, $event, 12);
+
+  my @attrArray = split(/,/, $AttrArray);  # Note: parsing this way does not support SDs or SD Arrays that may be in this list
+
+  my $j = 0;  # index into attrArray
+  for (my $i=0; $i<$NumAttrs; $i++) {
+    my $attrName = $attrArray[$j++];
+    my $attrType = $attrArray[$j++];  # Types <= 8 are "simple" types. Types > 8 are SDs and arrays.
+    my $attrValue = $attrArray[$j++];
+    if ($attrName eq '"Name"') { $node = $attrValue; }
+    if ($attrName eq '"Status"') { $status = $attrValue; }
+  }
+} else {
+  print "this script does not handle condition $cond_name\n";
+  exit 1;
+}
+
+
+# pipe the node name and status to wall through a lexical handle
+open(my $wall, '|-', 'wall') or die "Error: can not start wall command.\n";
+print $wall "node=$node status=$status\n";
+close($wall);
+
+exit 0;
+
+