-Produce more precise output when a strange child-reaping problem occurs; so far this has been seen only on the RHEL4 series.

git-svn-id: https://svn.code.sf.net/p/xcat/code/xcat-core/trunk@2318 8638fb3e-16cb-4fca-ae20-7b5d299a9bcd
This commit is contained in:
jbjohnso 2008-10-10 18:48:11 +00:00
parent 5d97efc398
commit 28e489ccc5

View File

@ -86,20 +86,23 @@ else {
my $children = 0;
my $inputs = new IO::Select;
my %pids; # pid => node
my %errored; # Keep a list of children with non-zero exit codes
my %exitcodes; # Keep a list of children with known exit codes
my %foundcodes;
# This happens whenever a child dies; keeps track of non-zero exit codes.
# This happens whenever a child dies; keeps track of exit codes.
sub handle_chld {
my $pid;
do {
my $pid = waitpid(-1,WNOHANG);
my $node = $pids{$pid};
my $exitc = $? >> 8;
if ($exitc) {
$errored{$node} = $exitc;
}
my $lasterr=$?>>8;
my $foundproc=0;
while (($pid = waitpid(-1,WNOHANG)) > 0) {
$foundproc=1;
$exitcodes{$pids{$pid}} = $? >> 8;
$children--;
} until ($pid <= 0);
}
unless ($foundproc) { #Above loop mysteriously had no children, store previous $? as a candidate for omitted return codes
$foundcodes{$lasterr}=1;
$children--; #Free a slot from the child that must have sent this signal
}
}
$SIG{CHLD} = \&handle_chld;
@ -122,12 +125,24 @@ while ($inputs->count and $children) {
}
while (processoutput($inputs)) {};
wait;
my $exitcode;
if (keys %errored) {
$exitcode = 2;
while (my ($node, $exitc) = each(%errored)) {
print stderr "$node: *** ssh exited with error code $exitc.\n";
}
my $exitcode=0;
foreach (values %pids) {
my $possible_codes = join ",",keys %foundcodes;
unless (defined $exitcodes{$_}) {
print stderr "$_: *** psh missed exit code, probably one of the following: $possible_codes\n";
}
}
foreach (keys %exitcodes) {
if ($exitcodes{$_}) {
print stderr "$_: *** ssh exited with error code ".$exitcodes{$_}.".\n";
$exitcode++;
}
}
if ($exitcode) { #Exit code reflects number of failed nodes
$exitcode=$exitcode%256; #keep from overflowing valid values
unless ($exitcode) { #if number of failed nodes happened to be evenly divisible by 256, make it non-zero again
$exitcode++;
}
}
exit($exitcode);