Added support for hierarchy in RMC monitoring

git-svn-id: https://svn.code.sf.net/p/xcat/code/xcat-core/trunk@1714 8638fb3e-16cb-4fca-ae20-7b5d299a9bcd
This commit is contained in:
linggao 2008-06-20 14:23:40 +00:00
parent 90ba172438
commit 6a76562fdd
28 changed files with 376 additions and 4 deletions

View File

@ -118,7 +118,7 @@ sub start {
elsif ($summary{$_}==-1) {push(@nodes_to_remove, $_);}
}
#add new nodes to the RMC cluster
#add new nodes to the RMC cluster
#print "all nodes to add: @nodes_to_add\nall nodes to remove: @nodes_to_remove\n";
if (@nodes_to_add>0) {
my %nodes_status=xCAT_monitoring::rmcmon->pingNodeStatus(@nodes_to_add);
@ -139,8 +139,16 @@ sub start {
removeNodes_noChecking(@nodes_to_remove);
}
#start condition-response assosciations
my $result=`$::XCATROOT/sbin/rmcmon/mkrmcresources $::XCATROOT/lib/perl/xCAT_monitoring/rmc/resources/mn 2>&1`;
#create conditions/responses/sensors on the service node or mn
my $result=`$::XCATROOT/sbin/rmcmon/mkrmcresources $::XCATROOT/lib/perl/xCAT_monitoring/rmc/resources/sn 2>&1`;
if ($?) {
xCAT::MsgUtils->message('SI', "[mon]: Error when creating predefined resources on $localhostname:\n$result\n");
}
if ($isSV) {
$result=`$::XCATROOT/sbin/rmcmon/mkrmcresources $::XCATROOT/lib/perl/xCAT_monitoring/rmc/resources/node 2>&1`;
} else {
$result=`$::XCATROOT/sbin/rmcmon/mkrmcresources $::XCATROOT/lib/perl/xCAT_monitoring/rmc/resources/mn 2>&1`;
}
if ($?) {
xCAT::MsgUtils->message('SI', "[mon]: Error when creating predefined resources on $localhostname:\n$result\n");
}

View File

@ -0,0 +1,14 @@
#!/usr/bin/perl
# Registers the AIXNodeCoreDump_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AIXNodeCoreDump; per its description it collects those events from
# the service nodes. All attribute values are stored as strings.
$RES::Condition{AIXNodeCoreDump_H} = {
    'Name'             => 'AIXNodeCoreDump_H',
    'ResourceClass'    => 'IBM.Condition',
    'EventExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0',
    'RearmExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1',
    'EventDescription' => 'This condition collects all the AIXNodeCoreDump events from the service nodes. An event will be generated when a core dump is logged in the AIX Error log of a node in the cluster.',
    'SelectionString'  => 'Name="AIXNodeCoreDump"',
    'ManagementScope'  => '4',
    'Severity'         => '0',
    'NoToggleExprFlag' => '1',
};
1;

View File

@ -0,0 +1,14 @@
#!/usr/bin/perl
# Registers the AllServiceableEvents_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AllServiceableEvents; per its description it collects those events
# from the service nodes. All attribute values are stored as strings.
# The EventDescription text previously contained the typo "outpout".
$RES::Condition{'AllServiceableEvents_H'} = {
    Name             => q(AllServiceableEvents_H),
    ResourceClass    => q(IBM.Condition),
    EventExpression  => q(LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0),
    RearmExpression  => q(LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1),
    EventDescription => q(This condition collects all the AllServiceableEvents events from the service nodes. An event will be generated whenever there is output from running sensor related to any serviceable events.),
    SelectionString  => q(Name="AllServiceableEvents"),
    ManagementScope  => q(4),
    Severity         => q(0),
    NoToggleExprFlag => q(1),
};
1;

View File

@ -0,0 +1,14 @@
#!/usr/bin/perl
# Registers the AnyNodeAnyLoggedError_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AnyNodeAnyLoggedError; per its description it collects those events
# from the service nodes. All attribute values are stored as strings.
$RES::Condition{AnyNodeAnyLoggedError_H} = {
    'Name'             => 'AnyNodeAnyLoggedError_H',
    'ResourceClass'    => 'IBM.Condition',
    'EventExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0',
    'RearmExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1',
    'EventDescription' => 'This condition collects all the AnyNodeAnyLoggedError events from the service nodes. An event will be generated when an error is logged to either the AIX Error Log or the Linux Syslog of a node in the cluster.',
    'SelectionString'  => 'Name="AnyNodeAnyLoggedError"',
    'ManagementScope'  => '4',
    'Severity'         => '0',
    'NoToggleExprFlag' => '1',
};
1;

View File

@ -0,0 +1,15 @@
#!/usr/bin/perl
# Registers the AnyNodeFileSystemInodesUsed_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AnyNodeFileSystemInodesUsed; per its description it collects those
# events from the service nodes. All attribute values are stored as strings.
# The RearmDescription previously ended with a stray double quote that became
# part of the stored text; it has been removed.
$RES::Condition{'AnyNodeFileSystemInodesUsed_H'} = {
    Name             => q(AnyNodeFileSystemInodesUsed_H),
    ResourceClass    => q(IBM.Condition),
    EventExpression  => q(LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0),
    RearmExpression  => q(LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1),
    EventDescription => q(This condition collects all the AnyNodeFileSystemInodesUsed events from the service nodes. An event will be generated when more than 90 percent of the total inodes in the file system is in use.),
    RearmDescription => q(A rearm event will be generated when the percentage of the inodes used in the file system falls below 75 percent.),
    SelectionString  => q(Name="AnyNodeFileSystemInodesUsed"),
    ManagementScope  => q(4),
    Severity         => q(0),
    NoToggleExprFlag => q(1),
};
1;

View File

@ -0,0 +1,15 @@
#!/usr/bin/perl
# Registers the AnyNodeFileSystemSpaceUsed_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AnyNodeFileSystemSpaceUsed; per its description it collects those
# events from the service nodes. All attribute values are stored as strings.
$RES::Condition{AnyNodeFileSystemSpaceUsed_H} = {
    'Name'             => 'AnyNodeFileSystemSpaceUsed_H',
    'ResourceClass'    => 'IBM.Condition',
    'EventExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0',
    'RearmExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1',
    'EventDescription' => 'This condition collects all the AnyNodeFileSystemSpaceUsed events from the service nodes. An event will be generated when more than 90 percent of the total space in the file system is in use.',
    'RearmDescription' => 'A rearm event will be generated when the percentage of the space used in the file system falls below 75 percent.',
    'SelectionString'  => 'Name="AnyNodeFileSystemSpaceUsed"',
    'ManagementScope'  => '4',
    'Severity'         => '0',
    'NoToggleExprFlag' => '1',
};
1;

View File

@ -0,0 +1,15 @@
#!/usr/bin/perl
# Registers the AnyNodeNetworkInterfaceStatus_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AnyNodeNetworkInterfaceStatus; per its description it collects those
# events from the service nodes. All attribute values are stored as strings.
$RES::Condition{AnyNodeNetworkInterfaceStatus_H} = {
    'Name'             => 'AnyNodeNetworkInterfaceStatus_H',
    'ResourceClass'    => 'IBM.Condition',
    'EventExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0',
    'RearmExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1',
    'EventDescription' => 'This condition collects all the AnyNodeNetworkInterfaceStatus events from the service nodes. An event will be generated whenever any network interface on the node is not online.',
    'RearmDescription' => 'A rearm event will be generated when the network interface on the node becomes online again.',
    'SelectionString'  => 'Name="AnyNodeNetworkInterfaceStatus"',
    'ManagementScope'  => '4',
    'Severity'         => '0',
    'NoToggleExprFlag' => '1',
};
1;

View File

@ -0,0 +1,15 @@
#!/usr/bin/perl
# Registers the AnyNodePagingPercentSpaceFree_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AnyNodePagingPercentSpaceFree; per its description it collects those
# events from the service nodes. All attribute values are stored as strings.
$RES::Condition{AnyNodePagingPercentSpaceFree_H} = {
    'Name'             => 'AnyNodePagingPercentSpaceFree_H',
    'ResourceClass'    => 'IBM.Condition',
    'EventExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0',
    'RearmExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1',
    'EventDescription' => 'This condition collects all the AnyNodePagingPercentSpaceFree events from the service nodes. An event will be generated when the total amount of free paging space falls below 10 percent.',
    'RearmDescription' => 'A rearm event will be generated when the free paging space increases to 15 percent.',
    'SelectionString'  => 'Name="AnyNodePagingPercentSpaceFree"',
    'ManagementScope'  => '4',
    'Severity'         => '0',
    'NoToggleExprFlag' => '1',
};
1;

View File

@ -0,0 +1,15 @@
#!/usr/bin/perl
# Registers the AnyNodeProcessorsIdleTime_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AnyNodeProcessorsIdleTime; per its description it collects those
# events from the service nodes. All attribute values are stored as strings.
$RES::Condition{AnyNodeProcessorsIdleTime_H} = {
    'Name'             => 'AnyNodeProcessorsIdleTime_H',
    'ResourceClass'    => 'IBM.Condition',
    'EventExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0',
    'RearmExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1',
    'EventDescription' => 'This condition collects all the AnyNodeProcessorsIdleTime events from the service nodes. An event will be generated when the average time all processors are idle at least 70 percent of the time.',
    'RearmDescription' => 'A rearm event will be generated when the idle time decreases below 10 percent.',
    'SelectionString'  => 'Name="AnyNodeProcessorsIdleTime"',
    'ManagementScope'  => '4',
    'Severity'         => '0',
    'NoToggleExprFlag' => '1',
};
1;

View File

@ -0,0 +1,15 @@
#!/usr/bin/perl
# Registers the AnyNodeTmpSpaceUsed_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AnyNodeTmpSpaceUsed; per its description it collects those events
# from the service nodes. All attribute values are stored as strings.
$RES::Condition{AnyNodeTmpSpaceUsed_H} = {
    'Name'             => 'AnyNodeTmpSpaceUsed_H',
    'ResourceClass'    => 'IBM.Condition',
    'EventExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0',
    'RearmExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1',
    'EventDescription' => 'This condition collects all the AnyNodeTmpSpaceUsed events from the service nodes. An event will be generated when more than 90 percent of the total space in the /tmp file system is in use.',
    'RearmDescription' => 'A rearm event will be generated when the percentage of the space used in the /tmp file system falls below 75 percent.',
    'SelectionString'  => 'Name="AnyNodeTmpSpaceUsed"',
    'ManagementScope'  => '4',
    'Severity'         => '0',
    'NoToggleExprFlag' => '1',
};
1;

View File

@ -0,0 +1,15 @@
#!/usr/bin/perl
# Registers the AnyNodeVarSpaceUsed_H entry in %RES::Condition.
# It is an IBM.Condition resource whose SelectionString picks the condition
# named AnyNodeVarSpaceUsed; per its description it collects those events
# from the service nodes. All attribute values are stored as strings.
$RES::Condition{AnyNodeVarSpaceUsed_H} = {
    'Name'             => 'AnyNodeVarSpaceUsed_H',
    'ResourceClass'    => 'IBM.Condition',
    'EventExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 0x0233) == 0',
    'RearmExpression'  => 'LastEvent.Occurred==1 && LastEvent.ErrNum==0 && (LastEvent.EventFlags & 3) ==1',
    'EventDescription' => 'This condition collects all the AnyNodeVarSpaceUsed events from the service nodes. An event will be generated when more than 90 percent of the total space in the /var file system is in use.',
    'RearmDescription' => 'A rearm event will be generated when the percentage of the space used in the /var file system falls below 75 percent.',
    'SelectionString'  => 'Name="AnyNodeVarSpaceUsed"',
    'ManagementScope'  => '4',
    'Severity'         => '0',
    'NoToggleExprFlag' => '1',
};
1;

View File

@ -3,6 +3,6 @@
$RES::EventResponse{'BroadcastEventsAnyTime'} = {
Name => q(BroadcastEventsAnyTime),
Locked => q(0),
Actions => q({[wallEvent,{127},{0},{86400},/usr/sbin/rsct/bin/walleven,3,0,0,0,{},0]}),
Actions => q({[wallEvent,{127},{0},{86400},/usr/sbin/rsct/bin/wallevent,3,0,0,0,{},0]}),
};
1;

View File

@ -0,0 +1,8 @@
#!/usr/bin/perl
# Registers the EmailRootHierarchicalEvents entry in %RES::EventResponse.
# Its single action, emailEvent, names the email-hierarchical-event script
# followed by "root" as the command to run; the other fields of the Actions
# string are stored verbatim.
$RES::EventResponse{EmailRootHierarchicalEvents} = {
    'Name'    => 'EmailRootHierarchicalEvents',
    'Locked'  => '0',
    'Actions' => '{[emailEvent,{127},{0},{86400},/opt/xcat/sbin/rmcmon/email-hierarchical-event root,3,0,0,0,{},0]}',
};
1;

View File

@ -0,0 +1,8 @@
#!/usr/bin/perl
# Registers the MsgHierarchicalEvents entry in %RES::EventResponse.
# Its single action, msgEvent, names the msg-hierarchical-event script as the
# command to run; the other fields of the Actions string are stored verbatim.
$RES::EventResponse{MsgHierarchicalEvents} = {
    'Name'    => 'MsgHierarchicalEvents',
    'Locked'  => '0',
    'Actions' => '{[msgEvent,{127},{0},{86400},/opt/xcat/sbin/rmcmon/msg-hierarchical-event,3,0,0,0,{},0]}',
};
1;

View File

@ -0,0 +1,100 @@
#!/usr/bin/env perl
# IBM(c) 2007 EPL license http://www.eclipse.org/legal/epl-v10.html
# Script to demonstrate how to parse out the event info that comes from a
# Condition that is watching another Condition in a hierarchical cluster environment.
# To use this script, create a Response that invokes this script with an email
# address as its only argument. Then associate the Response with a Condition that
# is watching Conditions on other nodes. When the Conditions occur, this script
# will email the relevant event information to that address.
# This info that is passed into this sample is structured as follows:
#   EMS Condition/Event:
#     $ENV{ERRM_COND_NAME}
#     $ENV{ERRM_TYPEID}
#     ...
#     $ENV{ERRM_NODE_NAME} --> SN
#     $ENV{ERRM_RSRC_NAME} --> SN Condition
#     $ENV{ERRM_VALUE}     --> SN Event:
#                                $Occurred
#                                $ErrNum
#                                ...
#                                $NodeName     --> Leaf Node:
#                                $ResourceName --> Resource
#                                $Attribute 1  --> Attr 1
#                                $Attribute 2  --> Attr 2
use strict;
use Getopt::Std;
use POSIX qw(strftime);

# The recipient is the only command line argument; without it there is nobody
# to send the mail to, so stop early with a usage message.
my $user=shift;
if (!defined($user) || $user eq '') {
    die "Usage: $0 <email-address>\n";
}

# Convert Severity and Type environment variables from integers to strings
my @COND_SEVERITY = qw(Informational Warning Critical);
my @TYPE = ('event', 'rearm event');
my $severity=$COND_SEVERITY[$ENV{ERRM_COND_SEVERITYID}];
my $type=$TYPE[$ENV{ERRM_TYPEID}];

# Parse the ERRM_VALUE attribute, which will contain the EventOccurred structured data variable from the Condition class
# The fields in this structured data variable are documented below where we parse them out.
my $event = defined($ENV{ERRM_VALUE}) ? $ENV{ERRM_VALUE} : '';
$event =~ s/^\[(.*)\]$/$1/;     # SD variables have square brackets around them

# This parse the LastEvent
my (                    # split the SD into the following fields:
    $Occurred,          # One if the condition has been triggered
    $ErrNum,            # Non-zero if there was in error in the event registration
    $ErrMsg,            # The string msg related to ErrNum
    $EventFlags,        # Bit mask giving some additional info about the event
    $EventTime,         # Time of event expressed in seconds since 1/1/1970
    $EventTimeMicros,   # Number of microseconds past EventTime
    $ResourceHandle,    # Binary address of the RMC resource that caused the condition to be triggered
    $NodeName,          # The node on which the event occurred. For conditions that use the management domain scope (4),
                        # this will be the leaf node. For conditions that use the local scope (e.g. NodeReachability),
                        # this will be the FMS.
    $NumAttrs,          # Number of attr values from the resource returned in this event
    $NumAttrsInExpr,    # How many of the above were attributes in the event expression
    $IndexForAttrs,     # The starting index of the array of values. Until new fixed fields are added
                        # to LastEvent, this will be the element right after this one.
    $AttrArray          # This list of attribute names, types, and values
   ) = split(/,/, $event, 12);

my @attrArray = split(/,/, $AttrArray);   # Note: parsing this way does not support SDs or SD Arrays that may be in this list

# Walk the flat (name, type, value) triples and build the per-attribute lines.
my ($ResourceName, $valuesMsg);
my $j = 0;     # index into attrArray
for (my $i=0; $i<$NumAttrs; $i++) {
    my $attrName = $attrArray[$j++];
    my $attrType = $attrArray[$j++];    # Types <= 8 are "simple" types. Types > 8 are SDs and arrays.
    my $attrValue = $attrArray[$j++];
    # Attribute names arrive wrapped in double quotes, hence the quoted literal.
    if ($attrName eq '"Name"') { $ResourceName = $attrValue; }
    $valuesMsg .= "  Attribute Value $i: $attrName = $attrValue\n";
}
if (!length($ResourceName)) { $ResourceName = '(unknown)'; }

my $msg;
$msg .= "The following $type occurred:\n";
$msg .= "  MN Condition: $ENV{ERRM_COND_NAME}\n";
$msg .= "  SN: $ENV{ERRM_NODE_NAME}\n";     # do we have to use $ERRM_NODE_NAMELIST here?
$msg .= "  SN Condition: $ENV{ERRM_RSRC_NAME}\n";
$msg .= "  Node: $NodeName\n";
$msg .= "  Resource Name: $ResourceName\n";
$msg .= "  Event Time: " . convertTime($EventTime) . "\n";
if (length($valuesMsg)) {
    $msg .= "  Attributes that came in the event for condition $ENV{ERRM_RSRC_NAME} from node $NodeName for resource $ResourceName:\n";
    $msg .= $valuesMsg;
}
# Skipped the following: $ERRM_EXPR $ERRM_RSRC_CLASS_PNAME $ERRM_DATA_TYPE $ERRM_NODE_NAMELIST $ERRM_RSRC_TYPE

# Hand the text to mail through a list-form pipe so that no shell is involved.
# The message contains event data (including double-quoted attribute names);
# passing it through `echo "$msg"` would let the shell strip the quotes and
# expand any $ or backtick sequences found in the event data.
my $subject = "$severity $type: $ENV{ERRM_COND_NAME}";
open(my $mail, '|-', 'mail', '-s', $subject, $user)
    or die "Error: can not start mail command: $!\n";
print $mail $msg;
close($mail) or warn "Error: mail command did not complete successfully.\n";
exit;
# Turn an event timestamp into a printable local-time string.
# The argument may be "seconds" or "seconds.fraction"; only the text before
# the first "." is used. The result is formatted as "%A %D %T".
sub convertTime {
    my $stamp = shift;
    my ($epochSecs) = split(/\./, $stamp);
    my @when = localtime($epochSecs);
    return strftime("%A %D %T", @when);
}

View File

@ -0,0 +1,101 @@
#!/usr/bin/env perl
# IBM(c) 2007 EPL license http://www.eclipse.org/legal/epl-v10.html
# Script to demonstrate how to parse out the event info that comes from a
# Condition that is watching another Condition in a hierarchical cluster environment.
# To use this script, create a Response that invokes this script (it takes no
# arguments). Then associate the Response with a Condition that is watching
# Conditions on other nodes. When the Conditions occur, this script will wall
# the relevant event information.
# This info that is passed into this sample is structured as follows:
#   EMS Condition/Event:
#     $ENV{ERRM_COND_NAME}
#     $ENV{ERRM_TYPEID}
#     ...
#     $ENV{ERRM_NODE_NAME} --> SN
#     $ENV{ERRM_RSRC_NAME} --> SN Condition
#     $ENV{ERRM_VALUE}     --> SN Event:
#                                $Occurred
#                                $ErrNum
#                                ...
#                                $NodeName     --> Leaf Node:
#                                $ResourceName --> Resource
#                                $Attribute 1  --> Attr 1
#                                $Attribute 2  --> Attr 2
use strict;
use Getopt::Std;
use POSIX qw(strftime);

# Convert Severity and Type environment variables from integers to strings
my @COND_SEVERITY = qw(Informational Warning Critical);
my @TYPE = ('Event', 'Rearm event');
my $severity=$COND_SEVERITY[$ENV{ERRM_COND_SEVERITYID}];
my $type=$TYPE[$ENV{ERRM_TYPEID}];

# Parse the ERRM_VALUE attribute, which will contain the EventOccurred structured data variable from the Condition class
# The fields in this structured data variable are documented below where we parse them out.
my $event = defined($ENV{ERRM_VALUE}) ? $ENV{ERRM_VALUE} : '';
$event =~ s/^\[(.*)\]$/$1/;     # SD variables have square brackets around them

# This parse the LastEvent
my (                    # split the SD into the following fields:
    $Occurred,          # One if the condition has been triggered
    $ErrNum,            # Non-zero if there was in error in the event registration
    $ErrMsg,            # The string msg related to ErrNum
    $EventFlags,        # Bit mask giving some additional info about the event
    $EventTime,         # Time of event expressed in seconds since 1/1/1970
    $EventTimeMicros,   # Number of microseconds past EventTime
    $ResourceHandle,    # Binary address of the RMC resource that caused the condition to be triggered
    $NodeName,          # The node on which the event occurred. For conditions that use the management domain scope (4),
                        # this will be the leaf node. For conditions that use the local scope (e.g. NodeReachability),
                        # this will be the FMS.
    $NumAttrs,          # Number of attr values from the resource returned in this event
    $NumAttrsInExpr,    # How many of the above were attributes in the event expression
    $IndexForAttrs,     # The starting index of the array of values. Until new fixed fields are added
                        # to LastEvent, this will be the element right after this one.
    $AttrArray          # This list of attribute names, types, and values
   ) = split(/,/, $event, 12);

my @attrArray = split(/,/, $AttrArray);   # Note: parsing this way does not support SDs or SD Arrays that may be in this list

# Walk the flat (name, type, value) triples and build the per-attribute lines.
my ($ResourceName, $valuesMsg);
my $j = 0;     # index into attrArray
for (my $i=0; $i<$NumAttrs; $i++) {
    my $attrName = $attrArray[$j++];
    my $attrType = $attrArray[$j++];    # Types <= 8 are "simple" types. Types > 8 are SDs and arrays.
    my $attrValue = $attrArray[$j++];
    # Attribute names arrive wrapped in double quotes, hence the quoted literal.
    if ($attrName eq '"Name"') { $ResourceName = $attrValue; }
    $valuesMsg .= "  Attribute Value $i: $attrName = $attrValue\n";
}
if (!length($ResourceName)) { $ResourceName = '(unknown)'; }

my $msg;
$msg .= "$severity $type occurred at " . convertTime($EventTime) . ":\n";
$msg .= "  MN Condition: $ENV{ERRM_COND_NAME}\n";
$msg .= "  SN: $ENV{ERRM_NODE_NAME}\n";     # do we have to use $ERRM_NODE_NAMELIST here?
$msg .= "  SN Condition: $ENV{ERRM_RSRC_NAME}\n";
$msg .= "  Node: $NodeName\n";
$msg .= "  Resource Name: $ResourceName\n";
if (length($valuesMsg)) {
    $msg .= "  Attributes:\n";
    $msg .= $valuesMsg;
}
# Skipped the following: $ERRM_EXPR $ERRM_RSRC_CLASS_PNAME $ERRM_DATA_TYPE $ERRM_NODE_NAMELIST $ERRM_RSRC_TYPE

# Broadcast the text by piping it into wall. A lexical handle and the
# list-form pipe open avoid the bareword handle and the shell.
open(my $wall, '|-', 'wall') || die "Error: can not start wall command.\n";
print $wall $msg;
close($wall) or warn "Error: wall command did not complete successfully.\n";
exit;
# Format an event time for the broadcast message.
# Accepts "seconds" or "seconds.fraction"; everything from the first "." on
# is dropped. Returns the local time rendered with the pattern "%A %D %T".
sub convertTime {
    my $wholeSeconds = (split(/\./, $_[0]))[0];
    return strftime("%A %D %T", localtime($wholeSeconds));
}