From ec02097b524e2430748359e300abd72c3df81062 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 19 May 2016 14:44:28 -0400 Subject: [PATCH] Explicitly check IPMI console health In theory, pyghmi should be doing a self-health check. It has been discovered at scale that this self-health check may encounter issues. For now, try to work around this by having another health check at the confluent level, deferred by console activity. It's also spaced far apart so it should not significantly add to idle load (one check every ~5 minutes, spread out). --- confluent_server/confluent/consoleserver.py | 2 ++ .../plugins/hardwaremanagement/ipmi.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/confluent_server/confluent/consoleserver.py b/confluent_server/confluent/consoleserver.py index c93b1c50..b809926f 100644 --- a/confluent_server/confluent/consoleserver.py +++ b/confluent_server/confluent/consoleserver.py @@ -306,6 +306,8 @@ class ConsoleHandler(object): self.reconnect = None self.connectstate = 'connecting' eventlet.spawn(self._connect) + else: + self._console.ping() diff --git a/confluent_server/confluent/plugins/hardwaremanagement/ipmi.py b/confluent_server/confluent/plugins/hardwaremanagement/ipmi.py index 9d687c79..e564fbdf 100644 --- a/confluent_server/confluent/plugins/hardwaremanagement/ipmi.py +++ b/confluent_server/confluent/plugins/hardwaremanagement/ipmi.py @@ -25,6 +25,7 @@ import eventlet.greenpool as greenpool import eventlet.queue as queue import pyghmi.constants as pygconstants import pyghmi.exceptions as pygexc +import random console = eventlet.import_patched('pyghmi.ipmi.console') ipmicommand = eventlet.import_patched('pyghmi.ipmi.command') import socket @@ -203,12 +204,18 @@ class IpmiConsole(conapi.Console): self.bmc = connparams['bmc'] self.port = connparams['port'] self.connected = False + self.checker = None # Cannot actually create console until 'connect', when we get callback def __del__(self): self.solconnection = None
def handle_data(self, data): + if self.checker: + # if we got data from remote, delay the ping check + self.checker.cancel() + self.checker = eventlet.spawn_after(290 + random.random() * 30, + self.ping) if type(data) == dict: if 'error' in data: self.solconnection = None @@ -245,8 +252,14 @@ class IpmiConsole(conapi.Console): self.connected = True except socket.gaierror as err: raise exc.TargetEndpointUnreachable(str(err)) + if self.checker: + self.checker.cancel() + self.checker = eventlet.spawn_after(290 + random.random() * 30, + self.ping) def close(self): + if self.checker: + self.checker.cancel() if self.solconnection is not None: # break the circular reference here self.solconnection.out_handler = _donothing @@ -261,6 +274,12 @@ class IpmiConsole(conapi.Console): def send_break(self): self.solconnection.send_break() + def ping(self): + rsp = self.solconnection.ipmi_session.raw_command(netfn=6, + command=0x4b, + data=(1, 1)) + self.solconnection._got_payload_instance_info(rsp) + def perform_requests(operator, nodes, element, cfg, inputdata): cryptit = cfg.decrypt