mirror of
https://github.com/xcat2/xcat-core.git
synced 2025-05-29 09:13:08 +00:00
Add resource check in xcatmn (#5202)
* Add more health checks for the xCAT management node
This commit is contained in:
parent
699486a15c
commit
eb346fb752
@ -57,10 +57,12 @@ sub do_main_job {
|
||||
my $checkpoint;
|
||||
my $rc = 0;
|
||||
my $installnicip;
|
||||
my $flag = "w";
|
||||
|
||||
#check if all xcat deamons are running
|
||||
$rst = check_all_xcat_deamons(\$checkpoint, \@error);
|
||||
print_check_result($checkpoint, "f", $rst, \@error);
|
||||
#check if all xcat daemons are running
|
||||
($rst, $flag) = check_all_xcat_daemons(\$checkpoint, \@error);
|
||||
print_check_result($checkpoint, $flag, $rst, \@error);
|
||||
$rst = 0 if ($flag == "w");
|
||||
return $rst if ($rst);
|
||||
|
||||
#check if xcatd can receive request
|
||||
@ -136,10 +138,24 @@ sub do_main_job {
|
||||
print_check_result($checkpoint, "w", $rst, \@error);
|
||||
$rc |= $rst;
|
||||
|
||||
#check linux ulimits configuration
|
||||
($rst, $flag) = check_ulimits(\$checkpoint, \@error);
|
||||
print_check_result($checkpoint, $flag, $rst, \@error);
|
||||
$rc |= $rst;
|
||||
|
||||
#check network parameter configuration
|
||||
($rst, $flag) = check_network_parameter(\$checkpoint, \@error);
|
||||
print_check_result($checkpoint, $flag, $rst, \@error);
|
||||
$rc |= $rst;
|
||||
|
||||
#some specific check points in MN
|
||||
if (!$is_sn) {
|
||||
|
||||
#check xCAT daemon attributes configuration
|
||||
($rst, $flag) = check_daemon_attributes(\$checkpoint, \@error);
|
||||
print_check_result($checkpoint, $flag, $rst, \@error);
|
||||
$rc |= $rst;
|
||||
|
||||
#check if log can be recorded in log file
|
||||
$rst = check_log_record(\$checkpoint, \@error);
|
||||
print_check_result($checkpoint, "w", $rst, \@error);
|
||||
@ -267,15 +283,16 @@ sub print_check_result {
|
||||
|
||||
}
|
||||
|
||||
sub check_all_xcat_deamons {
|
||||
sub check_all_xcat_daemons {
|
||||
my $checkpoint_ref = shift;
|
||||
my $error_ref = shift;
|
||||
my $rst = 0;
|
||||
my $rst_type = "f";
|
||||
|
||||
$$checkpoint_ref = "Checking all xCAT deamons are running...";
|
||||
$$checkpoint_ref = "Checking all xCAT daemons are running...";
|
||||
@$error_ref = ();
|
||||
|
||||
my @deamon_list = ("SSL listener",
|
||||
my @daemon_list = ("SSL listener",
|
||||
"DB Access",
|
||||
"UDP listener",
|
||||
"install monitor",
|
||||
@ -283,14 +300,28 @@ sub check_all_xcat_deamons {
|
||||
"Command log writer");
|
||||
|
||||
my $output = `ps aux 2>&1|grep -v grep|grep xcatd`;
|
||||
foreach my $deamon (@deamon_list) {
|
||||
if ($output !~ /$deamon/) {
|
||||
push @$error_ref, "Deamon '$deamon' isn't running";
|
||||
foreach my $daemon (@daemon_list) {
|
||||
my $counter = $output =~ s/$daemon/$daemon/g;
|
||||
if ($counter > 1) {
|
||||
if ($daemon == "SSL listener") {
|
||||
my $cur_pid = `cat /var/run/xcatd.pid`;
|
||||
my @ssl_pids = `ps aux 2>&1|grep -v grep|grep "xcatd: $daemon"|awk -F' ' '{print \$2}'`;
|
||||
foreach my $ssl_pid (@ssl_pids) {
|
||||
next if ($cur_pid == $ssl_pid);
|
||||
my $child_pid = `ps --ppid $ssl_pid 2>&1 | grep "xcatd SSL:" | wc -l`;
|
||||
chomp($child_pid);
|
||||
$rst_type = "w" if ($child_pid);
|
||||
}
|
||||
}
|
||||
push @$error_ref, "More Daemon '$daemon' is running";
|
||||
$rst = 1;
|
||||
} elsif ($counter == 0) {
|
||||
push @$error_ref, "Daemon '$daemon' isn't running";
|
||||
$rst = 1;
|
||||
}
|
||||
}
|
||||
|
||||
return $rst;
|
||||
return ($rst, $rst_type);
|
||||
}
|
||||
|
||||
sub check_xcatd_receive_request {
|
||||
@ -917,7 +948,7 @@ sub check_dhcp_service {
|
||||
# on sn, just check whether the dhcpd service is running
|
||||
my $dhcpoutput = `ps aux 2>&1| grep dhcpd |grep -v grep`;
|
||||
if (!$dhcpoutput) {
|
||||
push @$error_ref, "There isn't 'dhcpd' deamon in current server";
|
||||
push @$error_ref, "There isn't 'dhcpd' daemon in current server";
|
||||
$rst = 1;
|
||||
}
|
||||
} else {
|
||||
@ -1042,6 +1073,104 @@ sub check_dhcp_leases {
|
||||
return $rst;
|
||||
}
|
||||
|
||||
# Check how close the running xcatd process is to the open-file limit.
#
# Arguments:
#   $checkpoint_ref - scalar reference; receives the description of this check
#   $error_ref      - array reference; emptied, then filled with any problem
#                     messages found by this check
# Returns:
#   ($rst, $rst_type)
#     $rst      - 1 when the number of open files of xcatd is at least 80% of
#                 the limit, 0 otherwise (also 0 when nothing can be measured)
#     $rst_type - "f" when the number of open files has reached the limit,
#                 "w" in every other case
sub check_ulimits {
    my $checkpoint_ref = shift;
    my $error_ref      = shift;
    my $rst            = 0;
    my $rst_type       = "w";

    $$checkpoint_ref = "Checking Linux ulimits configuration...";
    @$error_ref      = ();

    # The limit is the one reported by a shell started from this script; it is
    # not read from the xcatd process itself.
    my $nofile_num = `sh -c 'ulimit -n' 2>&1`;
    chomp($nofile_num);

    # "unlimited" or an error message cannot be used as a divisor; there is
    # nothing to compare against in that case.
    return ($rst, $rst_type) unless ($nofile_num =~ /^\s*(\d+)\s*$/ and $1 > 0);
    $nofile_num = $1;

    # Read the pid of xcatd; a missing or malformed pid file means there is no
    # process to inspect.
    my $process_id;
    if (open(my $pid_fh, '<', "/var/run/xcatd.pid")) {
        $process_id = <$pid_fh>;
        close($pid_fh);
    }
    return ($rst, $rst_type) unless (defined($process_id) and $process_id =~ /^\s*(\d+)\s*$/);
    $process_id = $1;

    # Every entry of /proc/<pid>/fd stands for one open file descriptor.
    my $process_folder = "/proc/$process_id/fd/";
    opendir(my $fd_dh, $process_folder) or return ($rst, $rst_type);
    my $open_num = grep { $_ ne '.' and $_ ne '..' } readdir($fd_dh);
    closedir($fd_dh);
    return ($rst, $rst_type) unless ($open_num);

    my $percent = $open_num / $nofile_num;
    if ($percent >= 0.8) {
        push @$error_ref, "The number of open files is not enough for xcatd service, increase the limits for it according to xCAT document";
        $rst = 1;

        # Reaching the limit is reported as a failure, getting close to it
        # only as a warning.
        $rst_type = "f" if ($percent >= 1);
    }

    return ($rst, $rst_type);
}
|
||||
|
||||
# Compare the number of ARP entries against the kernel neighbour table
# threshold net.ipv4.neigh.default.gc_thresh2.
#
# Arguments:
#   $checkpoint_ref - scalar reference; receives the description of this check
#   $error_ref      - array reference; emptied, then filled with any problem
#                     messages found by this check
# Returns:
#   ($rst, $rst_type)
#     $rst      - 1 when the number of lines printed by "arp -a" is at least
#                 80% of the threshold, 0 otherwise
#     $rst_type - "f" when that number has reached the threshold, "w" otherwise
sub check_network_parameter {
    my $checkpoint_ref = shift;
    my $error_ref      = shift;
    my $rst            = 0;
    my $rst_type       = "w";

    $$checkpoint_ref = "Checking network kernel parameter configuration...";
    @$error_ref      = ();

    # 512 is used when sysctl does not print a usable positive number.
    my $net_gc_thresh = 512;
    my $net_set_value = `sysctl -n net.ipv4.neigh.default.gc_thresh2`;
    chomp($net_set_value);
    if ($net_set_value =~ /^\s*(\d+)\s*$/ and $1 > 0) {
        $net_gc_thresh = $1;
    }

    # One line of "arp -a" output is counted as one ARP entry.
    my $arp_num = `arp -a | wc -l`;
    chomp($arp_num);
    $arp_num = ($arp_num =~ /(\d+)/) ? $1 : 0;

    my $percent = $arp_num / $net_gc_thresh;
    if ($percent >= 0.8) {
        push @$error_ref, "Most ARP has been used, please tuning network parameter as document";
        $rst = 1;

        # Reaching the threshold is a failure, getting close to it a warning.
        $rst_type = "f" if ($percent >= 1);
    }

    return ($rst, $rst_type);
}
|
||||
|
||||
# Check the site attributes xcatmaxconnections and xcatmaxbatchconnections.
#
# Arguments:
#   $checkpoint_ref - scalar reference; receives the description of this check
#   $error_ref      - array reference; emptied, then filled with any problem
#                     messages found by this check
# Returns:
#   ($rst, $rst_type)
#     $rst      - 1 when a problem was found, 0 otherwise
#     $rst_type - "f" when xcatmaxbatchconnections is not less than
#                 xcatmaxconnections, "w" otherwise
sub check_daemon_attributes {
    my $checkpoint_ref = shift;
    my $error_ref      = shift;
    my $rst            = 0;
    my $rst_type       = "w";

    $$checkpoint_ref = "Checking xCAT daemon attributes configuration...";
    @$error_ref      = ();

    # Number of lines printed by nodels, taken as the number of defined nodes.
    my $node_num = `nodels 2>&1 | wc -l`;
    chomp($node_num);

    # Reference values the site settings are compared against.
    my $xcatmaxconnections      = 64;
    my $xcatmaxbatchconnections = 50;

    # Start from the reference values so that an attribute lsdef does not
    # report is not compared as 0.
    # NOTE(review): assumes 64/50 are what xcatd uses when the site attribute
    # is unset - confirm against the xCAT site table documentation.
    my $xcatmaxconnections_site      = $xcatmaxconnections;
    my $xcatmaxbatchconnections_site = $xcatmaxbatchconnections;

    my @site_max_info = `lsdef -t site -i xcatmaxconnections,xcatmaxbatchconnections -c 2>&1`;
    foreach my $site_max (@site_max_info) {
        if ($site_max =~ /xcatmaxconnections=(\d+)/) {
            $xcatmaxconnections_site = $1;
        }
        if ($site_max =~ /xcatmaxbatchconnections=(\d+)/) {
            $xcatmaxbatchconnections_site = $1;
        }
    }

    if ($xcatmaxconnections_site <= $xcatmaxbatchconnections_site) {
        push @$error_ref, "Attribute xcatmaxbatchconnections must be less than xcatmaxconnections.";
        $rst      = 1;
        $rst_type = "f";
    } elsif ($node_num >= 500
        and ($xcatmaxconnections_site < $xcatmaxconnections
            or $xcatmaxbatchconnections_site < $xcatmaxbatchconnections)) {

        # The tuning advice only applies to large clusters, so the node count
        # must gate both attribute comparisons ("and" binds tighter than "or",
        # hence the explicit parentheses).
        push @$error_ref, "Management nodes are more than 500, please tuning xCAT daemon attributes as document";
        $rst = 1;
    }

    return ($rst, $rst_type);
}
|
||||
|
||||
sub returncmdoutput {
|
||||
my $rst = shift;
|
||||
my $error_ref = shift;
|
||||
@ -1135,7 +1264,7 @@ while ($hierarchy_instance->read_reply(\%reply_cache)) {
|
||||
|
||||
#print ">>>$reply_cache{$servers}->[$_]<<<\n";
|
||||
#For cases like below:
|
||||
#c910f02c04p04: [ok] :All xCAT deamons are running
|
||||
#c910f02c04p04: [ok] :All xCAT daemons are running
|
||||
if ($reply_cache{$servers}->[$_] =~ /^(\w+)\s*:\s*(\[\w+\]\s*):\s*(.*)/) {
|
||||
if ("$1" eq "$server") {
|
||||
$logmsg = "$2: $3";
|
||||
|
Loading…
x
Reference in New Issue
Block a user