From 2379f6f90f994f62a5235d6aed30cf9aa561004a Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Wed, 22 Feb 2023 08:41:46 -0500 Subject: [PATCH 1/7] Change nodesensors format of float Floats are either unnecessarily long in normal output, or too unconstrained in CSV output. Normalize to as many digits as 'makes sense' up to 5 digits. 5 might seem a bit much, but one common metric is kWh, which may need that precision over short intervals. --- confluent_client/bin/nodesensors | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/confluent_client/bin/nodesensors b/confluent_client/bin/nodesensors index 3044a930..832a104a 100755 --- a/confluent_client/bin/nodesensors +++ b/confluent_client/bin/nodesensors @@ -135,7 +135,7 @@ def sensorpass(showout=True, appendtime=False): if sensedata['value'] is None: showval = '' elif isinstance(sensedata['value'], float): - showval = u' {0:.5f} '.format(sensedata['value']) + showval = u' {0:.5g} '.format(sensedata['value']) else: showval = u' {0} '.format(sensedata['value']) if sensedata['units'] not in (None, u''): @@ -191,6 +191,8 @@ def format_csv(csvwriter, orderedsensors, resdata, showtime=True): datum = ','.join([datum, healthstates]) else: datum = healthstates + if isinstance(datum, float): + datum = '{:.5g}'.format(datum) rowdata.append(datum) except KeyError: rowdata.append('N/A') From 6df2e822a5d0a1d35b5e313f5492cce5d1698a83 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Wed, 22 Feb 2023 09:34:32 -0500 Subject: [PATCH 2/7] Correct api call in discovery --- confluent_server/confluent/discovery/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/confluent_server/confluent/discovery/core.py b/confluent_server/confluent/discovery/core.py index aae203c5..76135260 100644 --- a/confluent_server/confluent/discovery/core.py +++ b/confluent_server/confluent/discovery/core.py @@ -1184,7 +1184,7 @@ def search_smms_by_cert(currsmm, cert, cfg): if len(nl) == 1: return currsmm, bay, nl[0]
return currsmm, bay, None - exnl = list(cfg.filter_node_attrubutes('enclosure.extends=' + currsmm)) + exnl = list(cfg.filter_node_attributes('enclosure.extends=' + currsmm)) if len(exnl) == 1: return search_smms_by_cert(exnl[0], cert, cfg) From b99034f53992bcb5692d81aa69be82af3681f9d3 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Wed, 22 Feb 2023 16:11:38 -0500 Subject: [PATCH 3/7] Improve reliability of collective join While servicing an enrollment, there's a window for a collective member to be 'defined' but not yet active, meaning quorum may transiently be lost as multiple enrollments progress. Serialize enrollments by holding the enrollment process open. Also, there is a chance that a transient transfer error may occur during loading of the DB. In such a case, restart the connection rather than aborting. --- .../confluent/collective/manager.py | 73 +++++++++++-------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index 7e1584f8..07217049 100644 --- a/confluent_server/confluent/collective/manager.py +++ b/confluent_server/confluent/collective/manager.py @@ -61,10 +61,15 @@ class ContextBool(object): connecting = ContextBool() leader_init = ContextBool() +enrolling = ContextBool() def connect_to_leader(cert=None, name=None, leader=None, remote=None): global currentleader global follower + ocert = cert + oname = name + oleader = leader + oremote = remote if leader is None: leader = currentleader log.log({'info': 'Attempting connection to leader {0}'.format(leader), @@ -138,7 +143,9 @@ def connect_to_leader(cert=None, name=None, leader=None, remote=None): remote.close() except Exception: pass - raise Exception("Error doing initial DB transfer") + log.log({'error': 'Retrying connection, error during initial sync', 'subsystem': 'collective'}) + return connect_to_leader(ocert, oname, oleader, oremote) + raise Exception("Error doing initial DB
transfer") # bad ssl write retry dbjson += ndata cfm.clear_configuration() try: @@ -390,34 +397,42 @@ def handle_connection(connection, cert, request, local=False): eventlet.spawn_n(connect_to_leader, rsp['collective'][ 'fingerprint'], name) if 'enroll' == operation: - #TODO(jjohnson2): error appropriately when asked to enroll, but the master is elsewhere - mycert = util.get_certificate_from_file('/etc/confluent/srvcert.pem') - proof = base64.b64decode(request['hmac']) - myrsp = invites.check_client_proof(request['name'], mycert, - cert, proof) - if not myrsp: - tlvdata.send(connection, {'error': 'Invalid token'}) - connection.close() - return - if not list(cfm.list_collective()): - # First enrollment of a collective, since the collective doesn't - # quite exist, then set initting false to let the enrollment action - # drive this particular initialization - initting = False - myrsp = base64.b64encode(myrsp) - fprint = util.get_fingerprint(cert) - myfprint = util.get_fingerprint(mycert) - cfm.add_collective_member(get_myname(), - connection.getsockname()[0], myfprint) - cfm.add_collective_member(request['name'], - connection.getpeername()[0], fprint) - myleader = get_leader(connection) - ldrfprint = cfm.get_collective_member_by_address( - myleader)['fingerprint'] - tlvdata.send(connection, - {'collective': {'approval': myrsp, - 'fingerprint': ldrfprint, - 'leader': get_leader(connection)}}) + with enrolling: + cfm.check_quorum() + mycert = util.get_certificate_from_file('/etc/confluent/srvcert.pem') + proof = base64.b64decode(request['hmac']) + myrsp = invites.check_client_proof(request['name'], mycert, + cert, proof) + if not myrsp: + tlvdata.send(connection, {'error': 'Invalid token'}) + connection.close() + return + if not list(cfm.list_collective()): + # First enrollment of a collective, since the collective doesn't + # quite exist, then set initting false to let the enrollment action + # drive this particular initialization + initting = False + myrsp = 
base64.b64encode(myrsp) + fprint = util.get_fingerprint(cert) + myfprint = util.get_fingerprint(mycert) + cfm.add_collective_member(get_myname(), + connection.getsockname()[0], myfprint) + cfm.add_collective_member(request['name'], + connection.getpeername()[0], fprint) + myleader = get_leader(connection) + ldrfprint = cfm.get_collective_member_by_address( + myleader)['fingerprint'] + tlvdata.send(connection, + {'collective': {'approval': myrsp, + 'fingerprint': ldrfprint, + 'leader': get_leader(connection)}}) + havequorum = False + while not havequorum: + try: + cfm.check_quorum() + havequorum = True + except exc.DegradedCollective: + eventlet.sleep(0.1) if 'assimilate' == operation: drone = request['name'] droneinfo = cfm.get_collective_member(drone) From 5ea214a7268201379d82f8b51328bc2181f3030e Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Wed, 22 Feb 2023 16:34:13 -0500 Subject: [PATCH 4/7] Use eventlet subprocess sshutil uses eventlet subprocess, making CalledProcessError hard to catch. Adjust to consistently use same subprocess module. --- confluent_server/bin/confluent_selfcheck | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/confluent_server/bin/confluent_selfcheck b/confluent_server/bin/confluent_selfcheck index 845bb107..6a9d5163 100755 --- a/confluent_server/bin/confluent_selfcheck +++ b/confluent_server/bin/confluent_selfcheck @@ -15,7 +15,7 @@ import confluent.sshutil as sshutil import confluent.certutil as certutil import confluent.client as client import confluent.config.configmanager as configmanager -import subprocess +import eventlet.green.subprocess as subprocess import tempfile import shutil import eventlet.green.socket as socket From 59b07665ab5a12c2517be2950556d395498c1fda Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Fri, 24 Feb 2023 12:03:43 -0500 Subject: [PATCH 5/7] Modify float formatting again Make sure at least one decimal is in a float. Maximum precision of 5 digits past the decimal point.
--- confluent_client/bin/nodesensors | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/confluent_client/bin/nodesensors b/confluent_client/bin/nodesensors index 832a104a..119df18c 100755 --- a/confluent_client/bin/nodesensors +++ b/confluent_client/bin/nodesensors @@ -37,6 +37,12 @@ class hybridcsv(csv.excel): lineterminator = '\n' +def floatformat(num): + fm = u'{:.5f}'.format(num).rstrip('0') + if fm[-1:] == u'.': + return fm + u'0' + return fm + csv.register_dialect('hybrid', hybridcsv) import confluent.client as client @@ -135,7 +141,7 @@ def sensorpass(showout=True, appendtime=False): if sensedata['value'] is None: showval = '' elif isinstance(sensedata['value'], float): - showval = u' {0:.5g} '.format(sensedata['value']) + showval = floatformat(sensedata['value']) else: showval = u' {0} '.format(sensedata['value']) if sensedata['units'] not in (None, u''): @@ -192,7 +198,7 @@ def format_csv(csvwriter, orderedsensors, resdata, showtime=True): else: datum = healthstates if isinstance(datum, float): - datum = '{:.5g}'.format(datum) + datum = floatformat(datum) rowdata.append(datum) except KeyError: rowdata.append('N/A') From 70d8a1059c0489df5c1c09cc5510d6b7a3214d8c Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Fri, 24 Feb 2023 15:47:20 -0500 Subject: [PATCH 6/7] Consistently treat bytes as bytes in ssh In Python3 systems, there would be confusion about bytes versus str. Fix this so that ssh can work more consistently. 
--- confluent_server/confluent/plugins/shell/ssh.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/confluent_server/confluent/plugins/shell/ssh.py b/confluent_server/confluent/plugins/shell/ssh.py index db9f8097..d88ca9b2 100644 --- a/confluent_server/confluent/plugins/shell/ssh.py +++ b/confluent_server/confluent/plugins/shell/ssh.py @@ -146,7 +146,7 @@ class SshShell(conapi.Console): return except cexc.PubkeyInvalid as pi: self.ssh.close() - self.keyaction = '' + self.keyaction = b'' self.candidatefprint = pi.fingerprint self.datacallback(pi.message) self.keyattrname = pi.attrname @@ -197,18 +197,18 @@ class SshShell(conapi.Console): delidx = data.index(b'\x7f') data = data[:delidx - 1] + data[delidx + 1:] self.keyaction += data - if '\r' in self.keyaction: - action = self.keyaction.split('\r')[0] - if action.lower() == 'accept': + if b'\r' in self.keyaction: + action = self.keyaction.split(b'\r')[0] + if action.lower() == b'accept': self.nodeconfig.set_node_attributes( {self.node: {self.keyattrname: self.candidatefprint}}) self.datacallback('\r\n') self.logon() - elif action.lower() == 'disconnect': + elif action.lower() == b'disconnect': self.datacallback(conapi.ConsoleEvent.Disconnect) else: - self.keyaction = '' + self.keyaction = b'' self.datacallback('\r\nEnter "disconnect" or "accept": ') elif len(data) > 0: self.datacallback(data) From 9f7e53701e2e52750a695d5ec85c67a36f675e5d Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 27 Feb 2023 10:43:40 -0500 Subject: [PATCH 7/7] Avoid latching onto USB nic in a vswitch as 'the nic' In esxi, some builds may have USB nic brought up in a vswitch. Detect and avoid that scenario. 
--- confluent_osdeploy/esxi7/profiles/hypervisor/scripts/makeksnet | 2 ++ 1 file changed, 2 insertions(+) diff --git a/confluent_osdeploy/esxi7/profiles/hypervisor/scripts/makeksnet b/confluent_osdeploy/esxi7/profiles/hypervisor/scripts/makeksnet index 7d46a53e..eeb72ef7 100644 --- a/confluent_osdeploy/esxi7/profiles/hypervisor/scripts/makeksnet +++ b/confluent_osdeploy/esxi7/profiles/hypervisor/scripts/makeksnet @@ -33,6 +33,8 @@ for info in vswinfo.split('\n'): upinfo = uplinkmatch.match(info) if upinfo: vmnic = upinfo.group(1) + if vmnic and 'vusb0' not in vmnic: + break try: with open('/tmp/confluentident/cnflnt.jsn') as identin: identcfg = json.load(identin)