From a3b0b0240dc5ef048e9cce8d077552aed1961279 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 27 Sep 2018 10:30:30 -0400 Subject: [PATCH 01/20] Abort assimilation attempt on non-member cleanly If a confluent instance has forgotten the collective, more cleanly handle the situation, and abort the assimilation rather than assuming the peer should be leader, unless txcount specifically is called out as the reason. --- confluent_server/confluent/collective/manager.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index 735d9b1b..dae9279c 100644 --- a/confluent_server/confluent/collective/manager.py +++ b/confluent_server/confluent/collective/manager.py @@ -322,6 +322,11 @@ def handle_connection(connection, cert, request, local=False): if 'assimilate' == operation: drone = request['name'] droneinfo = cfm.get_collective_member(drone) + if not droneinfo: + tlvdata.send(connection, + {'error': 'Unrecognized leader, ' + 'redo invitation process'}) + return if not util.cert_matches(droneinfo['fingerprint'], cert): tlvdata.send(connection, {'error': 'Invalid certificate, ' @@ -432,7 +437,7 @@ def try_assimilate(drone): tlvdata.recv(remote) # the banner tlvdata.recv(remote) # authpassed... 0.. answer = tlvdata.recv(remote) - if answer and 'error' in answer: + if answer and 'txcount' in answer: connect_to_leader(None, None, leader=remote.getpeername()[0]) def get_leader(connection): From 0724ad812b924ec433fa48a6a90573cca31bf79b Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 27 Sep 2018 10:38:00 -0400 Subject: [PATCH 02/20] Add logging to the assimilation phase of collective When attempting assimilation, provide logging about the attempt. 
--- .../confluent/collective/manager.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index dae9279c..9c864bfc 100644 --- a/confluent_server/confluent/collective/manager.py +++ b/confluent_server/confluent/collective/manager.py @@ -18,6 +18,7 @@ import base64 import confluent.collective.invites as invites import confluent.config.configmanager as cfm import confluent.exceptions as exc +import confluent.log as log import confluent.tlvdata as tlvdata import confluent.util as util import eventlet @@ -437,8 +438,24 @@ def try_assimilate(drone): tlvdata.recv(remote) # the banner tlvdata.recv(remote) # authpassed... 0.. answer = tlvdata.recv(remote) - if answer and 'txcount' in answer: + if not answer: + log.log( + {'error': + 'No answer from {0} while trying to assimilate'.format( + drone)}) + return + if 'txcount' in answer: + log.log({'info': 'Deferring to {0} due to transaction count'.format( + drone)}) connect_to_leader(None, None, leader=remote.getpeername()[0]) + return + if 'error' in answer: + log.log({ + 'error': 'Error encountered while attempting to ' + 'assimilate {0}: {1}'.format(drone, answer['error'])}) + return + log.log({'Assimilated {0} into collective'.format(drone)}) + def get_leader(connection): if currentleader is None or connection.getpeername()[0] == currentleader: From 10ce7a9de91839493eeb3dce0a988b84af8ce9f1 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 27 Sep 2018 10:49:57 -0400 Subject: [PATCH 03/20] Add more logging to collective process --- .../confluent/collective/manager.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index 9c864bfc..d4a4cdbf 100644 --- a/confluent_server/confluent/collective/manager.py +++ 
b/confluent_server/confluent/collective/manager.py @@ -62,7 +62,9 @@ def connect_to_leader(cert=None, name=None, leader=None): leader = currentleader try: remote = connect_to_collective(cert, leader) - except socket.error: + except socket.error as e: + log.log({'error': 'Collective connection attempt to {0} failed: {1}' + ''.format(leader, str(e))}) return False with connecting: with cfginitlock: @@ -78,10 +80,16 @@ def connect_to_leader(cert=None, name=None, leader=None): return False if 'error' in keydata: if 'backoff' in keydata: + log.log({ + 'info': 'Collective initialization in progress on ' + '{0}, will retry connection'.format(leader)}) eventlet.spawn_after(random.random(), connect_to_leader, cert, name, leader) return True if 'leader' in keydata: + log.log( + {'info': 'Prospective leader {0} has redirected this ' + 'member to {1}'.format(leader, keydata['leader'])}) ldrc = cfm.get_collective_member_by_address( keydata['leader']) if ldrc and ldrc['name'] == name: @@ -89,6 +97,10 @@ def connect_to_leader(cert=None, name=None, leader=None): return connect_to_leader(name=name, leader=keydata['leader']) if 'txcount' in keydata: + log.log({'info': + 'Prospective leader {0} has inferior ' + 'transaction count, becoming leader' + ''.format(leader)}) return become_leader(remote) print(keydata['error']) return False @@ -96,6 +108,7 @@ def connect_to_leader(cert=None, name=None, leader=None): follower.kill() cfm.stop_following() follower = None + log.log({'info': 'Following leader {0}'.format(leader)}) colldata = tlvdata.recv(remote) globaldata = tlvdata.recv(remote) dbi = tlvdata.recv(remote) @@ -138,6 +151,8 @@ def follow_leader(remote): try: cfm.follow_channel(remote) finally: + log.log({'info': 'Current leader has disappeared, restarting ' + 'collective membership'}) # The leader has folded, time to startup again... 
cfm.stop_following() currentleader = None From e57cdf9a7b2cc916108a40670d133c209d31b776 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 27 Sep 2018 15:14:45 -0400 Subject: [PATCH 04/20] Add more collective event log handling More detail to analyze how the collective membership is handled. --- confluent_server/confluent/collective/manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index d4a4cdbf..dcb11717 100644 --- a/confluent_server/confluent/collective/manager.py +++ b/confluent_server/confluent/collective/manager.py @@ -60,6 +60,7 @@ def connect_to_leader(cert=None, name=None, leader=None): cfginitlock = threading.RLock() if leader is None: leader = currentleader + log.log({'info': 'Attempting connection to leader {0}'.format(leader)}) try: remote = connect_to_collective(cert, leader) except socket.error as e: @@ -104,7 +105,6 @@ def connect_to_leader(cert=None, name=None, leader=None): return become_leader(remote) print(keydata['error']) return False - if follower is not None: follower.kill() cfm.stop_following() follower = None @@ -485,6 +485,7 @@ def retire_as_leader(): def become_leader(connection): global currentleader global follower + log.log({'info': 'Becoming leader of collective'}) if follower: follower.kill() follower = None From 61e7c90ad1e16c0e5309bfbc92f8747d4de7eefa Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 1 Oct 2018 09:35:02 -0400 Subject: [PATCH 05/20] Do not restart on intentional kill Additionally, add some output to help filter events log --- .../confluent/collective/manager.py | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index dcb11717..5f326591 100644 --- a/confluent_server/confluent/collective/manager.py +++ 
b/confluent_server/confluent/collective/manager.py @@ -25,6 +25,7 @@ import eventlet import eventlet.green.socket as socket import eventlet.green.ssl as ssl import eventlet.green.threading as threading +import greenlet import random try: import OpenSSL.crypto as crypto @@ -51,7 +52,6 @@ class ContextBool(object): connecting = ContextBool() leader_init = ContextBool() - def connect_to_leader(cert=None, name=None, leader=None): global currentleader global cfginitlock @@ -60,12 +60,14 @@ def connect_to_leader(cert=None, name=None, leader=None): cfginitlock = threading.RLock() if leader is None: leader = currentleader - log.log({'info': 'Attempting connection to leader {0}'.format(leader)}) + log.log({'info': 'Attempting connection to leader {0}'.format(leader), + 'subsystem': 'collective'}) try: remote = connect_to_collective(cert, leader) except socket.error as e: log.log({'error': 'Collective connection attempt to {0} failed: {1}' - ''.format(leader, str(e))}) + ''.format(leader, str(e)), + 'subsystem': 'collective'}) return False with connecting: with cfginitlock: @@ -83,14 +85,16 @@ def connect_to_leader(cert=None, name=None, leader=None): if 'backoff' in keydata: log.log({ 'info': 'Collective initialization in progress on ' - '{0}, will retry connection'.format(leader)}) + '{0}, will retry connection'.format(leader), + 'subsystem': 'collective'}) eventlet.spawn_after(random.random(), connect_to_leader, cert, name, leader) return True if 'leader' in keydata: log.log( {'info': 'Prospective leader {0} has redirected this ' - 'member to {1}'.format(leader, keydata['leader'])}) + 'member to {1}'.format(leader, keydata['leader']), + 'subsystem': 'collective'}) ldrc = cfm.get_collective_member_by_address( keydata['leader']) if ldrc and ldrc['name'] == name: @@ -108,7 +112,8 @@ def connect_to_leader(cert=None, name=None, leader=None): follower.kill() cfm.stop_following() follower = None - log.log({'info': 'Following leader {0}'.format(leader)}) + log.log({'info': 
'Following leader {0}'.format(leader), + 'subsystem': 'collective'}) colldata = tlvdata.recv(remote) globaldata = tlvdata.recv(remote) dbi = tlvdata.recv(remote) @@ -148,11 +153,18 @@ def connect_to_leader(cert=None, name=None, leader=None): def follow_leader(remote): global currentleader + cleanexit = False try: cfm.follow_channel(remote) + except greenlet.GreenletExit: + cleanexit = True finally: + if cleanexit: + log.log({'info': 'Previous following cleanly closed', + 'subsystem': 'collective'}) + return log.log({'info': 'Current leader has disappeared, restarting ' - 'collective membership'}) + 'collective membership', 'subsystem': 'collective'}) # The leader has folded, time to startup again... cfm.stop_following() currentleader = None @@ -457,19 +469,22 @@ def try_assimilate(drone): log.log( {'error': 'No answer from {0} while trying to assimilate'.format( - drone)}) + drone), + 'subsystem': 'collective'}) return if 'txcount' in answer: log.log({'info': 'Deferring to {0} due to transaction count'.format( - drone)}) + drone), 'subsystem': 'collective'}) connect_to_leader(None, None, leader=remote.getpeername()[0]) return if 'error' in answer: log.log({ 'error': 'Error encountered while attempting to ' - 'assimilate {0}: {1}'.format(drone, answer['error'])}) + 'assimilate {0}: {1}'.format(drone, answer['error']), + 'subsystem': 'collective'}) return - log.log({'Assimilated {0} into collective'.format(drone)}) + log.log({'info': 'Assimilated {0} into collective'.format(drone), + 'subsystem': 'collective'}) def get_leader(connection): @@ -485,7 +500,8 @@ def retire_as_leader(): def become_leader(connection): global currentleader global follower - log.log({'info': 'Becoming leader of collective'}) + log.log({'info': 'Becoming leader of collective', + 'subsystem': 'collective'}) if follower: follower.kill() follower = None From e098c0ba91fbdc2031aad5821127d379d6c00234 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 1 Oct 2018 14:03:45 -0400 Subject: [PATCH 
06/20] Fix missing tenant argument on user management function The tenant was omitted preventing those particular rpc calls from working correctly. --- confluent_server/confluent/config/configmanager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/confluent_server/confluent/config/configmanager.py b/confluent_server/confluent/config/configmanager.py index d5e7d1e5..e9258eb4 100644 --- a/confluent_server/confluent/config/configmanager.py +++ b/confluent_server/confluent/config/configmanager.py @@ -1232,9 +1232,9 @@ class ConfigManager(object): def del_user(self, name): if cfgleader: - return exec_on_leader('_rpc_master_del_user', name) + return exec_on_leader('_rpc_master_del_user', self.tenant, name) if cfgstreams: - exec_on_followers('_rpc_del_user', name) + exec_on_followers('_rpc_del_user', self.tenant, name) self._true_del_user(name) def _true_del_user(self, name): From 326f56219b0d4128b86d24c8d7b64f1a370a8b24 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 1 Oct 2018 14:40:02 -0400 Subject: [PATCH 07/20] Fix /networking/macs/by-mac The module apimacmap was not correctly scoped. --- confluent_server/confluent/networking/macmap.py | 1 + 1 file changed, 1 insertion(+) diff --git a/confluent_server/confluent/networking/macmap.py b/confluent_server/confluent/networking/macmap.py index 933a873e..ec4759c6 100644 --- a/confluent_server/confluent/networking/macmap.py +++ b/confluent_server/confluent/networking/macmap.py @@ -297,6 +297,7 @@ def _finish_update(completions): def _full_updatemacmap(configmanager): global vintage + global _apimacmap global _macmap global _nodesbymac global _switchportmap From b0b5493ff7db65d0f21a4ce0468251b42e2cb626 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 1 Oct 2018 15:29:18 -0400 Subject: [PATCH 08/20] Cancel retry if we become leader If an instance is first to start, it's retry should be canceled when other members prod it to become leader. 
--- confluent_server/confluent/collective/manager.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index 5f326591..bf851376 100644 --- a/confluent_server/confluent/collective/manager.py +++ b/confluent_server/confluent/collective/manager.py @@ -500,11 +500,15 @@ def retire_as_leader(): def become_leader(connection): global currentleader global follower + global retrythread log.log({'info': 'Becoming leader of collective', 'subsystem': 'collective'}) if follower: follower.kill() follower = None + if retrythread: + retrythread.cancel() + retrythread = None currentleader = connection.getsockname()[0] skipaddr = connection.getpeername()[0] myname = get_myname() From 4329c1d38851e4d2a107ee634a39f582da299284 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 1 Oct 2018 15:50:31 -0400 Subject: [PATCH 09/20] Have collective start bail out if leader Leader should not relinquish if quorum, so don't bother in such a case. --- confluent_server/confluent/collective/manager.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index bf851376..70275d66 100644 --- a/confluent_server/confluent/collective/manager.py +++ b/confluent_server/confluent/collective/manager.py @@ -535,6 +535,12 @@ def start_collective(): if follower: follower.kill() follower = None + try: + if cfm.cfgstreams and cfm.check_quorum(): + # Do not start if we have quorum and are leader + return + except exc.DegradedCollective: + pass if leader_init.active: # do not start trying to connect if we are # xmitting data to a follower return From 78a1741e0e91b3297d8754eb6c231a048ed62dc0 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 1 Oct 2018 16:02:16 -0400 Subject: [PATCH 10/20] Fix usage of check_quorum() It is not a boolean, it is exception driven. 
--- confluent_server/confluent/collective/manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index 70275d66..33b87323 100644 --- a/confluent_server/confluent/collective/manager.py +++ b/confluent_server/confluent/collective/manager.py @@ -536,7 +536,8 @@ def start_collective(): follower.kill() follower = None try: - if cfm.cfgstreams and cfm.check_quorum(): + if cfm.cfgstreams: + cfm.check_quorum() # Do not start if we have quorum and are leader return except exc.DegradedCollective: From d86e1fc4eb0d6e67f2c66b3d0d4b42497c9bd28c Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Tue, 2 Oct 2018 10:17:44 -0400 Subject: [PATCH 11/20] Give the cfg init a lock Move collective manager and configmanager to share a configinitlock, so that bad timings during internal initialization and collective activity cannot interfere and produce corrupt database. This became an issue with the fix for 'everything' disappearing. 
--- .../confluent/collective/manager.py | 9 +--- .../confluent/config/configmanager.py | 54 ++++++++++--------- 2 files changed, 29 insertions(+), 34 deletions(-) diff --git a/confluent_server/confluent/collective/manager.py b/confluent_server/confluent/collective/manager.py index 33b87323..09961698 100644 --- a/confluent_server/confluent/collective/manager.py +++ b/confluent_server/confluent/collective/manager.py @@ -35,7 +35,6 @@ except ImportError: crypto = None currentleader = None -cfginitlock = None follower = None retrythread = None @@ -54,10 +53,7 @@ leader_init = ContextBool() def connect_to_leader(cert=None, name=None, leader=None): global currentleader - global cfginitlock global follower - if cfginitlock is None: - cfginitlock = threading.RLock() if leader is None: leader = currentleader log.log({'info': 'Attempting connection to leader {0}'.format(leader), @@ -70,7 +66,7 @@ def connect_to_leader(cert=None, name=None, leader=None): 'subsystem': 'collective'}) return False with connecting: - with cfginitlock: + with cfm._initlock: tlvdata.recv(remote) # the banner tlvdata.recv(remote) # authpassed... 0.. 
if name is None: @@ -520,13 +516,10 @@ def become_leader(connection): def startup(): - global cfginitlock members = list(cfm.list_collective()) if len(members) < 2: # Not in collective mode, return return - if cfginitlock is None: - cfginitlock = threading.RLock() eventlet.spawn_n(start_collective) def start_collective(): diff --git a/confluent_server/confluent/config/configmanager.py b/confluent_server/confluent/config/configmanager.py index e9258eb4..0e105c29 100644 --- a/confluent_server/confluent/config/configmanager.py +++ b/confluent_server/confluent/config/configmanager.py @@ -82,6 +82,7 @@ _dirtylock = threading.RLock() _leaderlock = gthread.RLock() _synclock = threading.RLock() _rpclock = gthread.RLock() +_initlock = gthread.RLock() _followerlocks = {} _config_areas = ('nodegroups', 'nodes', 'usergroups', 'users') tracelog = None @@ -953,38 +954,39 @@ class ConfigManager(object): def __init__(self, tenant, decrypt=False, username=None): global _cfgstore - if _cfgstore is None: - init() - self.decrypt = decrypt - self.current_user = username - if tenant is None: - self.tenant = None - if 'main' not in _cfgstore: - _cfgstore['main'] = {} + with _initlock: + if _cfgstore is None: + init() + self.decrypt = decrypt + self.current_user = username + if tenant is None: + self.tenant = None + if 'main' not in _cfgstore: + _cfgstore['main'] = {} + self._bg_sync_to_file() + self._cfgstore = _cfgstore['main'] + if 'nodegroups' not in self._cfgstore: + self._cfgstore['nodegroups'] = {'everything': {'nodes': set()}} + _mark_dirtykey('nodegroups', 'everything', self.tenant) + self._bg_sync_to_file() + if 'nodes' not in self._cfgstore: + self._cfgstore['nodes'] = {} + self._bg_sync_to_file() + return + elif 'tenant' not in _cfgstore: + _cfgstore['tenant'] = {tenant: {}} self._bg_sync_to_file() - self._cfgstore = _cfgstore['main'] + elif tenant not in _cfgstore['tenant']: + _cfgstore['tenant'][tenant] = {} + self._bg_sync_to_file() + self.tenant = tenant + self._cfgstore = 
_cfgstore['tenant'][tenant] if 'nodegroups' not in self._cfgstore: - self._cfgstore['nodegroups'] = {'everything': {'nodes': set()}} + self._cfgstore['nodegroups'] = {'everything': {}} _mark_dirtykey('nodegroups', 'everything', self.tenant) - self._bg_sync_to_file() if 'nodes' not in self._cfgstore: self._cfgstore['nodes'] = {} - self._bg_sync_to_file() - return - elif 'tenant' not in _cfgstore: - _cfgstore['tenant'] = {tenant: {}} self._bg_sync_to_file() - elif tenant not in _cfgstore['tenant']: - _cfgstore['tenant'][tenant] = {} - self._bg_sync_to_file() - self.tenant = tenant - self._cfgstore = _cfgstore['tenant'][tenant] - if 'nodegroups' not in self._cfgstore: - self._cfgstore['nodegroups'] = {'everything': {}} - _mark_dirtykey('nodegroups', 'everything', self.tenant) - if 'nodes' not in self._cfgstore: - self._cfgstore['nodes'] = {} - self._bg_sync_to_file() def get_collective_member(self, name): return get_collective_member(name) From 10cb1b77dd1e8ebd9b1dc9068da763afbd6a3e84 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Tue, 2 Oct 2018 14:36:07 -0400 Subject: [PATCH 12/20] Extend the nodeboot man page --- confluent_client/doc/man/nodeboot.ronn | 33 ++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/confluent_client/doc/man/nodeboot.ronn b/confluent_client/doc/man/nodeboot.ronn index 9b067556..c54d67f7 100644 --- a/confluent_client/doc/man/nodeboot.ronn +++ b/confluent_client/doc/man/nodeboot.ronn @@ -4,12 +4,41 @@ nodeboot(8) -- Reboot a confluent node to a specific device ## SYNOPSIS `nodeboot ` -`nodeboot ` [net|setup] +`nodeboot [options] ` [default|cd|network|setup|hd] ## DESCRIPTION **nodeboot** reboots nodes in a noderange. If an additional argument is given, -it sets the node to specifically boot to that as the next boot. +it sets the node to specifically boot to that as the next boot. This +performs an immediate reboot without waiting for the OS. 
To set the boot +device without inducing a reboot, see the `nodesetboot` command. + +## OPTIONS + +* `-b`, `--bios`: + For a system that supports both BIOS and UEFI style boot, request BIOS style + boot if supported (some platforms will UEFI boot with this flag anyway). + +* `-p`, `--persist`: + For a system that supports it, mark the boot override to persist rather than + be a one time change. Many systems do not support this functionality. + +* `default`: + Request a normal default boot with no particular device override + +* `cd`: + Request boot from media. Note that this can include physical CD, + remote media mounted as CD/DVD, and detachable hard disks drives such as usb + key devices. + +* `network`: + Request boot to network + +* `setup`: + Request to enter the firmware configuration menu (e.g. F1 setup) on next boot. + +* `hd`: + Boot straight to hard disk drive ## EXAMPLES * Booting n3 and n4 to the default boot behavior: From 15dc4937eef9191652c784750d61a3a13d029706 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Tue, 2 Oct 2018 14:44:25 -0400 Subject: [PATCH 13/20] Add hyphen options to various completion --- confluent_client/confluent_env.sh | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/confluent_client/confluent_env.sh b/confluent_client/confluent_env.sh index f81bf3df..685b685a 100644 --- a/confluent_client/confluent_env.sh +++ b/confluent_client/confluent_env.sh @@ -51,8 +51,8 @@ _confluent_get_args() _confluent_nodeidentify_completion() { _confluent_get_args - if [ $NUMARGS == 3 ]; then - COMPREPLY=($(compgen -W "on off" -- ${COMP_WORDS[-1]})) + if [ $NUMARGS -ge 3 ]; then + COMPREPLY=($(compgen -W "on off -h" -- ${COMP_WORDS[-1]})) fi if [ $NUMARGS -lt 3 ]; then _confluent_nr_completion @@ -64,8 +64,8 @@ _confluent_nodeidentify_completion() _confluent_nodesetboot_completion() { _confluent_get_args - if [ $NUMARGS == 3 ]; then - COMPREPLY=($(compgen -W "default cd network setup hd" -- 
${COMP_WORDS[-1]})) + if [ $NUMARGS -ge 3 ]; then + COMPREPLY=($(compgen -W "default cd network setup hd -h -b -p" -- ${COMP_WORDS[-1]})) fi if [ $NUMARGS -lt 3 ]; then _confluent_nr_completion @@ -76,12 +76,8 @@ _confluent_nodesetboot_completion() _confluent_nodepower_completion() { _confluent_get_args - if [ "${CMPARGS[-1]:0:1}" == '-' ]; then - COMPREPLY=($(compgen -W "-h -p" -- ${COMP_WORDS[-1]})) - return - fi - if [ $NUMARGS == 3 ]; then - COMPREPLY=($(compgen -W "boot off on status" -- ${COMP_WORDS[-1]})) + if [ $NUMARGS -ge 3 ]; then + COMPREPLY=($(compgen -W "boot off on status -h -p" -- ${COMP_WORDS[-1]})) return; fi if [ $NUMARGS -lt 3 ]; then @@ -93,14 +89,14 @@ _confluent_nodepower_completion() _confluent_nodemedia_completion() { _confluent_get_args - if [ $NUMARGS == 3 ]; then - COMPREPLY=($(compgen -W "list upload attach detachall" -- ${COMP_WORDS[-1]})) - return; - fi - if [ $NUMARGS -gt 3 ] && [ ${CMPARGS[2]} == 'upload' ]; then + if [ $NUMARGS -gt 3 ] && [ ${CMPARGS[-2]} == 'upload' ]; then compopt -o default COMPREPLY=() return + fi + if [ $NUMARGS --ge 3 ]; then + COMPREPLY=($(compgen -W "list upload attach detachall -h" -- ${COMP_WORDS[-1]})) + return; fi if [ $NUMARGS -lt 3 ]; then _confluent_nr_completion From 2f616d4586df522df7834aada87ff84e149ca073 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Wed, 3 Oct 2018 16:23:20 -0400 Subject: [PATCH 14/20] Better error when collective.manager is set to something invalid If the collective.manager field does not correspond to any collective manager, give a useful error rather than unexpected error. 
--- confluent_server/confluent/core.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/confluent_server/confluent/core.py b/confluent_server/confluent/core.py index bb0fb72b..cc469661 100644 --- a/confluent_server/confluent/core.py +++ b/confluent_server/confluent/core.py @@ -859,8 +859,16 @@ def dispatch_request(nodes, manager, element, configmanager, inputdata, certfile='/etc/confluent/srvcert.pem') except Exception: for node in nodes: - yield msg.ConfluentResourceUnavailable( - node, 'Collective member {0} is unreachable'.format(a['name'])) + if a: + yield msg.ConfluentResourceUnavailable( + node, 'Collective member {0} is unreachable'.format( + a['name'])) + else: + yield msg.ConfluentResourceUnavailable( + node, + '"{0}" is not recognized as a collective member'.format( + manager)) + return if not util.cert_matches(a['fingerprint'], remote.getpeercert( binary_form=True)): From 32602fbba3ae6d344fd7ba97d9774eedec8830d9 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 4 Oct 2018 10:23:55 -0400 Subject: [PATCH 15/20] Provide interactive handling of key mismatch in ssh sessions Before, ssh would close without so much as a warning, fix this by dealing with the key data. 
--- confluent_server/confluent/exceptions.py | 1 + .../confluent/plugins/shell/ssh.py | 39 ++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/confluent_server/confluent/exceptions.py b/confluent_server/confluent/exceptions.py index a2c67b68..80f5337e 100644 --- a/confluent_server/confluent/exceptions.py +++ b/confluent_server/confluent/exceptions.py @@ -106,6 +106,7 @@ class PubkeyInvalid(ConfluentException): super(PubkeyInvalid, self).__init__(self, text) self.fingerprint = fingerprint self.attrname = attribname + self.message = text bodydata = {'message': text, 'event': event, 'fingerprint': fingerprint, diff --git a/confluent_server/confluent/plugins/shell/ssh.py b/confluent_server/confluent/plugins/shell/ssh.py index 9f693b30..2e44e9f8 100644 --- a/confluent_server/confluent/plugins/shell/ssh.py +++ b/confluent_server/confluent/plugins/shell/ssh.py @@ -1,6 +1,6 @@ # vim: tabstop=4 shiftwidth=4 softtabstop=4 -# Copyright 2015 Lenovo +# Copyright 2015-2018 Lenovo # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -113,13 +113,48 @@ class SshShell(conapi.Console): self.password = '' self.datacallback('\r\nlogin as: ') return + except cexc.PubkeyInvalid as pi: + self.keyaction = '' + self.candidatefprint = pi.fingerprint + self.datacallback(pi.message) + self.keyattrname = pi.attrname + self.datacallback('\r\nNew fingerprint: ' + pi.fingerprint) + self.inputmode = -1 + self.datacallback('\r\nEnter "disconnect" or "accept": ') + return self.inputmode = 2 self.connected = True self.shell = self.ssh.invoke_shell() self.rxthread = eventlet.spawn(self.recvdata) def write(self, data): - if self.inputmode == 0: + if self.inputmode == -1: + while len(data) and data[0] == b'\x7f' and len(self.keyaction): + self.datacallback('\b \b') # erase previously echoed value + self.keyaction = self.keyaction[:-1] + data = data[1:] + while len(data) and data[0] == b'\x7f': + data = data[1:] + while b'\x7f' in data: + delidx = data.index(b'\x7f') + data = data[:delidx - 1] + data[delidx + 1:] + self.keyaction += data + if '\r' in self.keyaction: + action = self.keyaction.split('\r')[0] + if action.lower() == 'accept': + self.nodeconfig.set_node_attributes( + {self.node: + {self.keyattrname: self.candidatefprint}}) + self.datacallback('\r\n') + self.logon() + elif action.lower() == 'disconnect': + self.datacallback(conapi.ConsoleEvent.Disconnect) + else: + self.keyaction = '' + self.datacallback('\r\nEnter "disconnect" or "accept": ') + elif len(data) > 0: + self.datacallback(data) + elif self.inputmode == 0: while len(data) and data[0] == b'\x7f' and len(self.username): self.datacallback('\b \b') # erase previously echoed value self.username = self.username[:-1] From 8d9a0827392a86f29e35ab8724d6a828c56c04a9 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 4 Oct 2018 14:59:25 -0400 Subject: [PATCH 16/20] Provide better exceptions and propagate them to client on snmp When doing snmp, messages would always go to log only, even if the user was at the confetty cli. 
Give user access to knowing the error impacting the query. --- confluent_server/confluent/networking/lldp.py | 38 ++++++++++++------- confluent_server/confluent/snmputil.py | 13 +++++-- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/confluent_server/confluent/networking/lldp.py b/confluent_server/confluent/networking/lldp.py index a54e7b55..4df205bb 100644 --- a/confluent_server/confluent/networking/lldp.py +++ b/confluent_server/confluent/networking/lldp.py @@ -171,7 +171,7 @@ def _extract_neighbor_data_b(args): args are carried as a tuple, because of eventlet convenience """ - switch, password, user, force = args + switch, password, user, force = args[:4] vintage = _neighdata.get(switch, {}).get('!!vintage', 0) now = util.monotonic_time() if vintage > (now - 60) and not force: @@ -220,17 +220,19 @@ def _extract_neighbor_data_b(args): _neighdata[switch] = lldpdata -def update_switch_data(switch, configmanager, force=False): +def update_switch_data(switch, configmanager, force=False, retexc=False): switchcreds = netutil.get_switchcreds(configmanager, (switch,))[0] - _extract_neighbor_data(switchcreds + (force,)) + ndr = _extract_neighbor_data(switchcreds + (force, retexc)) + if retexc and isinstance(ndr, Exception): + raise ndr return _neighdata.get(switch, {}) -def update_neighbors(configmanager, force=False): - return _update_neighbors_backend(configmanager, force) +def update_neighbors(configmanager, force=False, retexc=False): + return _update_neighbors_backend(configmanager, force, retexc) -def _update_neighbors_backend(configmanager, force): +def _update_neighbors_backend(configmanager, force, retexc): global _neighdata global _neighbypeerid vintage = _neighdata.get('!!vintage', 0) @@ -241,7 +243,7 @@ def _update_neighbors_backend(configmanager, force): _neighbypeerid = {'!!vintage': now} switches = netutil.list_switches(configmanager) switchcreds = netutil.get_switchcreds(configmanager, switches) - switchcreds = [ x + (force,) for x in 
switchcreds] + switchcreds = [ x + (force, retexc) for x in switchcreds] pool = GreenPool(64) for ans in pool.imap(_extract_neighbor_data, switchcreds): yield ans @@ -258,9 +260,15 @@ def _extract_neighbor_data(args): return try: with _updatelocks[switch]: - _extract_neighbor_data_b(args) - except Exception: - log.logtrace() + return _extract_neighbor_data_b(args) + except Exception as e: + yieldexc = False + if len(args) >= 5: + yieldexc = args[4] + if yieldexc: + return e + else: + log.logtrace() if __name__ == '__main__': # a quick one-shot test, args are switch and snmpv1 string for now @@ -327,7 +335,9 @@ def _handle_neighbor_query(pathcomponents, configmanager): # guaranteed if (parms['by-peerid'] not in _neighbypeerid and _neighbypeerid.get('!!vintage', 0) < util.monotonic_time() - 60): - list(update_neighbors(configmanager)) + for x in update_neighbors(configmanager, retexc=True): + if isinstance(x, Exception): + raise x if parms['by-peerid'] not in _neighbypeerid: raise exc.NotFoundException('No matching peer known') return _dump_neighbordatum(_neighbypeerid[parms['by-peerid']]) @@ -336,9 +346,11 @@ def _handle_neighbor_query(pathcomponents, configmanager): if listrequested not in multi_selectors | single_selectors: raise exc.NotFoundException('{0} is not found'.format(listrequested)) if 'by-switch' in parms: - update_switch_data(parms['by-switch'], configmanager) + update_switch_data(parms['by-switch'], configmanager, retexc=True) else: - list(update_neighbors(configmanager)) + for x in update_neighbors(configmanager, retexc=True): + if isinstance(x, Exception): + raise x return list_info(parms, listrequested) diff --git a/confluent_server/confluent/snmputil.py b/confluent_server/confluent/snmputil.py index ae6c39b6..e862afe8 100644 --- a/confluent_server/confluent/snmputil.py +++ b/confluent_server/confluent/snmputil.py @@ -92,12 +92,17 @@ class Session(object): errstr, errnum, erridx, answers = rsp if errstr: errstr = str(errstr) - if errstr in 
('unknownUserName', 'wrongDigest'): - raise exc.TargetEndpointBadCredentials(errstr) + finerr = errstr + ' while trying to connect to ' \ + '{0}'.format(self.server) + if errstr in ('Unknown USM user', 'unknownUserName', + 'wrongDigest', 'Wrong SNMP PDU digest'): + raise exc.TargetEndpointBadCredentials(finerr) # need to do bad credential versus timeout - raise exc.TargetEndpointUnreachable(errstr) + raise exc.TargetEndpointUnreachable(finerr) elif errnum: - raise exc.ConfluentException(errnum.prettyPrint()) + raise exc.ConfluentException(errnum.prettyPrint() + + ' while trying to connect to ' + '{0}'.format(self.server)) for ans in answers: if not obj[0].isPrefixOf(ans[0]): # PySNMP returns leftovers in a bulk command From 73c06fd25eae89d26c7bfeeb802bcd032034c8c2 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 8 Oct 2018 09:54:03 -0400 Subject: [PATCH 17/20] Fix display of error on join of collective --- confluent_server/bin/collective | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/confluent_server/bin/collective b/confluent_server/bin/collective index a6465ef4..5f0c78f6 100644 --- a/confluent_server/bin/collective +++ b/confluent_server/bin/collective @@ -66,8 +66,9 @@ def join_collective(server, invitation): 'invitation': invitation, 'server': server}}) res = tlvdata.recv(s) - print(res.get('collective', - {'status': 'Unknown response: ' + repr(res)})['status']) + res = res.get('collective', + {'status': 'Unknown response: ' + repr(res)}) + print(res.get('status', res['error'])) def show_collective(): From 5baab5bef465d38cd247232a00dafe87b6312c99 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 8 Oct 2018 10:47:38 -0400 Subject: [PATCH 18/20] Add more stateful behavior to completion Allow it to sense words already used in command. Refactor to common code for similar ones. 
--- confluent_client/confluent_env.sh | 56 +++++++++++++++++++------------ 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/confluent_client/confluent_env.sh b/confluent_client/confluent_env.sh index 685b685a..3312002c 100644 --- a/confluent_client/confluent_env.sh +++ b/confluent_client/confluent_env.sh @@ -45,57 +45,69 @@ _confluent_get_args() NUMARGS=${#CMPARGS[@]} if [ "${COMP_WORDS[-1]}" == '' ]; then NUMARGS=$((NUMARGS+1)) + CMPARGS+=("") fi + GENNED="" + for CAND in ${COMP_CANDIDATES[@]}; do + candarray=(${CAND//,/ }) + matched=0 + for c in "${candarray[@]}"; do + for arg in "${CMPARGS[@]}"; do + if [ "$arg" = "$c" ]; then + matched=1 + break + fi + done + done + if [ 0 = $matched ]; then + for c in "${candarray[@]}"; do + GENNED+=" $c" + done + fi + done } -_confluent_nodeidentify_completion() +function _confluent_generic_completion() { _confluent_get_args - if [ $NUMARGS -ge 3 ]; then - COMPREPLY=($(compgen -W "on off -h" -- ${COMP_WORDS[-1]})) + if [ $NUMARGS -ge 3 ] && [ ! 
-z "$GENNED" ]; then + COMPREPLY=($(compgen -W "$GENNED" -- ${COMP_WORDS[-1]})) fi if [ $NUMARGS -lt 3 ]; then _confluent_nr_completion return; fi } +_confluent_nodeidentify_completion() +{ + COMP_CANDIDATES=("on,off -h") + _confluent_generic_completion +} _confluent_nodesetboot_completion() { - _confluent_get_args - if [ $NUMARGS -ge 3 ]; then - COMPREPLY=($(compgen -W "default cd network setup hd -h -b -p" -- ${COMP_WORDS[-1]})) - fi - if [ $NUMARGS -lt 3 ]; then - _confluent_nr_completion - return; - fi + COMP_CANDIDATES=("default,cd,network,setup,hd -h -b -p") + _confluent_generic_completion } _confluent_nodepower_completion() { - _confluent_get_args - if [ $NUMARGS -ge 3 ]; then - COMPREPLY=($(compgen -W "boot off on status -h -p" -- ${COMP_WORDS[-1]})) - return; - fi - if [ $NUMARGS -lt 3 ]; then - _confluent_nr_completion - return; - fi + COMP_CANDIDATES=("boot,off,on,status -h -p") + _confluent_generic_completion } _confluent_nodemedia_completion() { + COMP_CANDIDATES=("list,upload,attach,detachall -h") _confluent_get_args if [ $NUMARGS -gt 3 ] && [ ${CMPARGS[-2]} == 'upload' ]; then compopt -o default COMPREPLY=() return fi - if [ $NUMARGS --ge 3 ]; then - COMPREPLY=($(compgen -W "list upload attach detachall -h" -- ${COMP_WORDS[-1]})) + if [ $NUMARGS -ge 3 ] && [ ! 
-z "$GENNED" ]; then + COMPREPLY=($(compgen -W "$GENNED" -- ${COMP_WORDS[-1]})) return; fi if [ $NUMARGS -lt 3 ]; then From c6b8aaf372356a70fa654686ba0bc6160346de6a Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 8 Oct 2018 10:57:48 -0400 Subject: [PATCH 19/20] Fix mistake in the nodegroupdefine man page --- confluent_client/doc/man/nodegroupdefine.ronn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/confluent_client/doc/man/nodegroupdefine.ronn b/confluent_client/doc/man/nodegroupdefine.ronn index cf86c46c..9b2aba8d 100644 --- a/confluent_client/doc/man/nodegroupdefine.ronn +++ b/confluent_client/doc/man/nodegroupdefine.ronn @@ -7,7 +7,7 @@ nodegroupdefine(8) -- Define new confluent node group ## DESCRIPTION -`nodegroupdefine` allows the definition of a new node for the confluent management +`nodegroupdefine` allows the definition of a new nodegroup for the confluent management service. It may only define a single group name at a time. It has the same syntax as `nodegroupattrib(8)`, and the commands differ in that `nodegroupattrib(8)` will error if a node group does not exist. From 20a37f8db53cc5c6b57d32b124274c3b9961b612 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Mon, 8 Oct 2018 11:02:33 -0400 Subject: [PATCH 20/20] Add mention of the -u options to the manpages. --- confluent_client/doc/man/nodeboot.ronn | 3 +++ confluent_client/doc/man/nodesetboot.ronn | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/confluent_client/doc/man/nodeboot.ronn b/confluent_client/doc/man/nodeboot.ronn index c54d67f7..36804b63 100644 --- a/confluent_client/doc/man/nodeboot.ronn +++ b/confluent_client/doc/man/nodeboot.ronn @@ -19,6 +19,9 @@ device without inducing a reboot, see the `nodesetboot` command. For a system that supports both BIOS and UEFI style boot, request BIOS style boot if supported (some platforms will UEFI boot with this flag anyway). 
+* `-u`, `--uefi`: + This flag does nothing, it is for command compatibility with xCAT's rsetboot + * `-p`, `--persist`: For a system that supports it, mark the boot override to persist rather than be a one time change. Many systems do not support this functionality. diff --git a/confluent_client/doc/man/nodesetboot.ronn b/confluent_client/doc/man/nodesetboot.ronn index 0fc881e1..dce303b1 100644 --- a/confluent_client/doc/man/nodesetboot.ronn +++ b/confluent_client/doc/man/nodesetboot.ronn @@ -30,7 +30,10 @@ control. * `-p`, `--persist`: For a system that supports it, mark the boot override to persist rather than be a one time change. Many systems do not support this functionality. - + +* `-u`, `--uefi`: + This flag does nothing, it is for command compatibility with xCAT's rsetboot + * `default`: Request a normal default boot with no particular device override