2
0
mirror of https://github.com/xcat2/xcat-core.git synced 2025-05-29 09:13:08 +00:00

Add resource check in xcatmn (#5202)

* Add more checking for xcat mn healthy
This commit is contained in:
xuweibj 2018-05-25 16:29:22 +08:00 committed by Bin Xu
parent 699486a15c
commit eb346fb752

View File

@ -57,10 +57,12 @@ sub do_main_job {
my $checkpoint;
my $rc = 0;
my $installnicip;
my $flag = "w";
#check if all xcat deamons are running
$rst = check_all_xcat_deamons(\$checkpoint, \@error);
print_check_result($checkpoint, "f", $rst, \@error);
#check if all xcat daemons are running
($rst, $flag) = check_all_xcat_daemons(\$checkpoint, \@error);
print_check_result($checkpoint, $flag, $rst, \@error);
$rst = 0 if ($flag == "w");
return $rst if ($rst);
#check if xcatd can receive request
@ -136,10 +138,24 @@ sub do_main_job {
print_check_result($checkpoint, "w", $rst, \@error);
$rc |= $rst;
#check linux ulimits configuration
($rst, $flag) = check_ulimits(\$checkpoint, \@error);
print_check_result($checkpoint, $flag, $rst, \@error);
$rc |= $rst;
#check network parameter configuration
($rst, $flag) = check_network_parameter(\$checkpoint, \@error);
print_check_result($checkpoint, $flag, $rst, \@error);
$rc |= $rst;
#some specific check points in MN
if (!$is_sn) {
#check xCAT daemon attributes configuration
($rst, $flag) = check_daemon_attributes(\$checkpoint, \@error);
print_check_result($checkpoint, $flag, $rst, \@error);
$rc |= $rst;
#check if log can be recorded in log file
$rst = check_log_record(\$checkpoint, \@error);
print_check_result($checkpoint, "w", $rst, \@error);
@ -267,15 +283,16 @@ sub print_check_result {
}
sub check_all_xcat_deamons {
sub check_all_xcat_daemons {
my $checkpoint_ref = shift;
my $error_ref = shift;
my $rst = 0;
my $rst_type = "f";
$$checkpoint_ref = "Checking all xCAT deamons are running...";
$$checkpoint_ref = "Checking all xCAT daemons are running...";
@$error_ref = ();
my @deamon_list = ("SSL listener",
my @daemon_list = ("SSL listener",
"DB Access",
"UDP listener",
"install monitor",
@ -283,14 +300,28 @@ sub check_all_xcat_deamons {
"Command log writer");
my $output = `ps aux 2>&1|grep -v grep|grep xcatd`;
foreach my $deamon (@deamon_list) {
if ($output !~ /$deamon/) {
push @$error_ref, "Deamon '$deamon' isn't running";
foreach my $daemon (@daemon_list) {
my $counter = $output =~ s/$daemon/$daemon/g;
if ($counter > 1) {
if ($daemon == "SSL listener") {
my $cur_pid = `cat /var/run/xcatd.pid`;
my @ssl_pids = `ps aux 2>&1|grep -v grep|grep "xcatd: $daemon"|awk -F' ' '{print \$2}'`;
foreach my $ssl_pid (@ssl_pids) {
next if ($cur_pid == $ssl_pid);
my $child_pid = `ps --ppid $ssl_pid 2>&1 | grep "xcatd SSL:" | wc -l`;
chomp($child_pid);
$rst_type = "w" if ($child_pid);
}
}
push @$error_ref, "More Daemon '$daemon' is running";
$rst = 1;
} elsif ($counter == 0) {
push @$error_ref, "Daemon '$daemon' isn't running";
$rst = 1;
}
}
return $rst;
return ($rst, $rst_type);
}
sub check_xcatd_receive_request {
@ -917,7 +948,7 @@ sub check_dhcp_service {
# on sn, just check dhcpd service whether running
my $dhcpoutput = `ps aux 2>&1| grep dhcpd |grep -v grep`;
if (!$dhcpoutput) {
push @$error_ref, "There isn't 'dhcpd' deamon in current server";
push @$error_ref, "There isn't 'dhcpd' daemon in current server";
$rst = 1;
}
} else {
@ -1042,6 +1073,104 @@ sub check_dhcp_leases {
return $rst;
}
# Check how close the xcatd process is to the open-file limit.
#
# Arguments:
#   $checkpoint_ref - scalar reference; set to the checkpoint description
#   $error_ref      - array reference; emptied, then filled with problem messages
# Returns:
#   ($rst, $rst_type)
#     $rst      - 1 when the open-file usage is at or above 80% of the limit, else 0
#     $rst_type - "f" when usage is at or above 100% of the limit, otherwise "w"
#   (0, "w") is also returned whenever the limit, the xcatd pid or its open
#   file count cannot be determined.
sub check_ulimits {
    my $checkpoint_ref = shift;
    my $error_ref      = shift;

    my $rst      = 0;
    my $rst_type = "w";

    $$checkpoint_ref = "Checking Linux ulimits configuration...";
    @$error_ref = ();

    # NOTE(review): this is the limit seen by a shell spawned from this probe;
    # it may differ from the limit the running xcatd was started with - confirm.
    my $nofile_num = `sh -c 'ulimit -n' 2>&1`;
    chomp($nofile_num);

    # "unlimited", an error text or 0 must never be used as a divisor below.
    return ($rst, $rst_type) unless ($nofile_num =~ /^\d+$/ and $nofile_num > 0);

    # Read the pid straight from the pid file; a missing file simply means
    # there is nothing to measure.
    my $pid_file = "/var/run/xcatd.pid";
    open(my $pid_fh, '<', $pid_file) or return ($rst, $rst_type);
    my $process_id = <$pid_fh>;
    close($pid_fh);
    return ($rst, $rst_type) unless (defined($process_id) and $process_id =~ /^\s*(\d+)\s*$/);
    $process_id = $1;

    # Every entry below /proc/<pid>/fd/ is one open file descriptor of xcatd.
    my $process_folder = "/proc/$process_id/fd/";
    opendir(my $fd_dh, $process_folder) or return ($rst, $rst_type);
    my $open_num = grep { !/^\.\.?$/ } readdir($fd_dh);
    closedir($fd_dh);
    return ($rst, $rst_type) unless ($open_num);

    my $percent = $open_num / $nofile_num;
    unless ($percent < 0.8) {
        push @$error_ref, "The number of open files is not enough for xcatd service, increase the limits for it according to xCAT document";
        $rst = 1;

        # The limit is already reached: escalate from warning to failure.
        if ($percent >= 1) {
            $rst_type = "f";
        }
    }

    return ($rst, $rst_type);
}
# Check how full the ARP table is compared with the kernel threshold
# net.ipv4.neigh.default.gc_thresh2.
#
# Arguments:
#   $checkpoint_ref - scalar reference; set to the checkpoint description
#   $error_ref      - array reference; emptied, then filled with problem messages
# Returns:
#   ($rst, $rst_type)
#     $rst      - 1 when the ARP entry count is at or above 80% of the threshold, else 0
#     $rst_type - "f" when the count is at or above 100% of the threshold, otherwise "w"
sub check_network_parameter {
    my $checkpoint_ref = shift;
    my $error_ref      = shift;

    my $rst      = 0;
    my $rst_type = "w";

    $$checkpoint_ref = "Checking network kernel parameter configuration...";
    @$error_ref = ();

    # Fallback threshold, used when sysctl gives no usable (positive integer) value.
    my $net_gc_thresh = 512;
    my $net_set_value = `sysctl -n net.ipv4.neigh.default.gc_thresh2 2>/dev/null`;
    chomp($net_set_value);
    if ($net_set_value =~ /^\d+$/ and $net_set_value > 0) {
        $net_gc_thresh = $net_set_value;
    }

    # -n skips the reverse name lookup for every entry; only the number of
    # lines matters here, and that is the same with or without name resolution.
    my $arp_num = `arp -an 2>/dev/null | wc -l`;
    chomp($arp_num);
    $arp_num = 0 unless ($arp_num =~ /^\s*\d+\s*$/);

    my $percent = $arp_num / $net_gc_thresh;
    unless ($percent < 0.8) {
        push @$error_ref, "Most ARP has been used, please tuning network parameter as document";
        $rst = 1;

        # The threshold is already reached: escalate from warning to failure.
        if ($percent >= 1) {
            $rst_type = "f";
        }
    }

    return ($rst, $rst_type);
}
# Check the site attributes xcatmaxconnections and xcatmaxbatchconnections.
#
# Arguments:
#   $checkpoint_ref - scalar reference; set to the checkpoint description
#   $error_ref      - array reference; emptied, then filled with problem messages
# Returns:
#   ($rst, $rst_type)
#     (1, "f") when xcatmaxbatchconnections is not less than xcatmaxconnections
#     (1, "w") when there are 500 or more nodes and either attribute is set
#              below its baseline value
#     (0, "w") otherwise
sub check_daemon_attributes {
    my $checkpoint_ref = shift;
    my $error_ref      = shift;

    my $rst      = 0;
    my $rst_type = "w";

    $$checkpoint_ref = "Checking xCAT daemon attributes configuration...";
    @$error_ref = ();

    # Number of lines printed by nodels, taken as the number of defined nodes.
    my $node_num = `nodels 2>&1 | wc -l`;
    chomp($node_num);

    # Baseline values the site settings are compared against
    # (presumably xCAT's built-in defaults - confirm against the site table doc).
    my $xcatmaxconnections      = 64;
    my $xcatmaxbatchconnections = 50;

    # Start from the baseline so that an attribute missing from the site table
    # is not treated as 0 (which would report a bogus fatal error).
    my $xcatmaxconnections_site      = $xcatmaxconnections;
    my $xcatmaxbatchconnections_site = $xcatmaxbatchconnections;

    my @site_max_info = `lsdef -t site -i xcatmaxconnections,xcatmaxbatchconnections -c 2>&1`;
    foreach my $site_max (@site_max_info) {
        if ($site_max =~ /xcatmaxconnections=(\d+)/) {
            $xcatmaxconnections_site = $1;
        }
        if ($site_max =~ /xcatmaxbatchconnections=(\d+)/) {
            $xcatmaxbatchconnections_site = $1;
        }
    }

    if ($xcatmaxconnections_site <= $xcatmaxbatchconnections_site) {
        push @$error_ref, "Attribute xcatmaxbatchconnections must be less than xcatmaxconnections.";
        $rst      = 1;
        $rst_type = "f";
    } elsif ($node_num >= 500
        and ($xcatmaxconnections_site < $xcatmaxconnections
            or $xcatmaxbatchconnections_site < $xcatmaxbatchconnections)) {

        # Explicit grouping: "and" binds tighter than "or", so without the
        # parentheses the node count would only guard the second comparison.
        push @$error_ref, "Management nodes are more than 500, please tuning xCAT daemon attributes as document";
        $rst = 1;
    }

    return ($rst, $rst_type);
}
sub returncmdoutput {
my $rst = shift;
my $error_ref = shift;
@ -1135,7 +1264,7 @@ while ($hierarchy_instance->read_reply(\%reply_cache)) {
#print ">>>$reply_cache{$servers}->[$_]<<<\n";
#For cases like below:
#c910f02c04p04: [ok] :All xCAT deamons are running
#c910f02c04p04: [ok] :All xCAT daemons are running
if ($reply_cache{$servers}->[$_] =~ /^(\w+)\s*:\s*(\[\w+\]\s*):\s*(.*)/) {
if ("$1" eq "$server") {
$logmsg = "$2: $3";