# Copyright 2016-2017 Lenovo
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This manages the detection and auto-configuration of nodes.
# Discovery sources may implement active scans, may be passive, or may
# provide both.
# The phases and actions:
# - Detect - Notice the existence of a potentially supported target
# - Potentially apply a secure replacement for default credential
# (perhaps using some key identifier combined with some string
# denoting temporary use, and use confluent master integrity key
# to generate a password in a formulaic way?)
# - Do some universal reconfiguration if applicable (e.g. if something is
# part of an enclosure with an optionally enabled enclosure manager,
# check and request enclosure manager enablement)
# - Throughout all of this, at this phase no sensitive data is divulged,
# only using credentials that are factory default or equivalent to
# factory default
# - Request transition to Locate
# - Locate - Use available cues to ascertain the physical location. This may
# be mac address lookup through switch or correlated by a server
# enclosure manager. If the location data suggests a node identity,
# then proceed to the 'verify' state
# - Verify - Given the current information and candidate upstream verifier,
# verify the authenticity of the server's claim in an automated way
# if possible. A few things may happen at this juncture:
# - Verification outright fails (confirmed negative response)
# - Audit log entry created, element is not *allowed* to
# proceed
# - Verification not possible (neither good nor bad)
# - If security policy is set to low, proceed to 'Manage'
# - Otherwise, log the detection event and stop (user
# would then manually bless the endpoint if applicable)
# - Verification succeeds
# - If security policy is set to strict (or manual, whichever
# word works best), note the successful verification, but
# do not manage
# - Otherwise, proceed to 'Manage'
# - Pre-configure - Given data up to this point, try to do some pre-config.
# For example, if located and X, then check for S, enable S
# This happens regardless of verify, as verify may depend on
# S
# - Manage
# - Create the node if autonode (Deferred)
# - If there is not a defined ip address, collect the current LLA and use
# that value.
# - If no username/password defined, generate a unique password, 20 bytes
# long, written to pass most complexity rules (15 random bytes, base64,
# retry until uppercase, lowercase, digit, and symbol all present);
# an illustrative sketch of this rule follows this comment block
# - Apply defined configuration to endpoint
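# A minimal illustrative sketch of the password rule above. The helper name
# is hypothetical and nothing in this module calls it: 15 random bytes,
# base64 encoded to 20 characters, regenerated until uppercase, lowercase,
# digit, and symbol are all present.
def _example_default_password():
    import base64 as b64
    import os
    import string
    while True:
        candidate = b64.b64encode(os.urandom(15)).decode('utf8')
        if (any(c in string.ascii_uppercase for c in candidate) and
                any(c in string.ascii_lowercase for c in candidate) and
                any(c in string.digits for c in candidate) and
                any(c in '+/' for c in candidate)):
            return candidate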
import base64
import confluent.config.configmanager as cfm
import confluent.collective.manager as collective
import confluent.discovery.protocols.pxe as pxe
import confluent.discovery.protocols.ssdp as ssdp
import confluent.discovery.protocols.slp as slp
import confluent.discovery.handlers.imm as imm
import confluent.discovery.handlers.cpstorage as cpstorage
import confluent.discovery.handlers.tsm as tsm
import confluent.discovery.handlers.pxe as pxeh
import confluent.discovery.handlers.smm as smm
import confluent.discovery.handlers.xcc as xcc
import confluent.exceptions as exc
import confluent.log as log
import confluent.messages as msg
import confluent.networking.macmap as macmap
import confluent.noderange as noderange
import confluent.util as util
import eventlet
import traceback
import socket as nsocket
webclient = eventlet.import_patched('pyghmi.util.webclient')
import eventlet.greenpool
import eventlet.semaphore
autosensors = set()
scanner = None
try:
unicode
except NameError:
unicode = str
class nesteddict(dict):
def __missing__(self, key):
v = self[key] = nesteddict()
return v
nodehandlers = {
'service:lenovo-smm': smm,
'service:lenovo-smm2': smm,
'service:management-hardware.Lenovo:lenovo-xclarity-controller': xcc,
'service:management-hardware.IBM:integrated-management-module2': imm,
'pxe-client': pxeh,
'onie-switch': None,
'cumulus-switch': None,
'service:io-device.Lenovo:management-module': None,
'service:thinkagile-storage': cpstorage,
'service:lenovo-tsm': tsm,
}
servicenames = {
'pxe-client': 'pxe-client',
'onie-switch': 'onie-switch',
'cumulus-switch': 'cumulus-switch',
'service:lenovo-smm': 'lenovo-smm',
'service:lenovo-smm2': 'lenovo-smm2',
'service:management-hardware.Lenovo:lenovo-xclarity-controller': 'lenovo-xcc',
'service:management-hardware.IBM:integrated-management-module2': 'lenovo-imm2',
'service:io-device.Lenovo:management-module': 'lenovo-switch',
'service:thinkagile-storage': 'thinkagile-storagebmc',
'service:lenovo-tsm': 'lenovo-tsm',
}
servicebyname = {
'pxe-client': 'pxe-client',
'onie-switch': 'onie-switch',
'cumulus-switch': 'cumulus-switch',
'lenovo-smm': 'service:lenovo-smm',
'lenovo-smm2': 'service:lenovo-smm2',
'lenovo-xcc': 'service:management-hardware.Lenovo:lenovo-xclarity-controller',
'lenovo-imm2': 'service:management-hardware.IBM:integrated-management-module2',
'lenovo-switch': 'service:io-device.Lenovo:management-module',
'thinkagile-storage': 'service:thinkagile-storagebmc',
'lenovo-tsm': 'service:lenovo-tsm',
}
discopool = eventlet.greenpool.GreenPool(500)
runningevals = {}
# Passive-only auto-detection protocols:
# PXE
# Both passive and active
# SLP (passive mode listens for SLP DA and unicast interrogation of the system)
# mDNS
# SSDP
# Also there are location providers
# Switch
# chassis
# chassis may in turn describe more chassis
# We normalize discovered node data to the following pieces of information:
# * Detected node name (if available, from switch discovery or similar, or
#   an auto-generated node name)
# * Model number
# * Model name
# * Serial number
# * System UUID (in x86 space, specifically whichever UUID would be in DMI)
# * Network interfaces and addresses
# * Switch connectivity information
# * Enclosure information
# * Management TLS fingerprint if validated (switch publication or enclosure)
# * System TLS fingerprint if validated (switch publication or system manager)
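# For illustration only, a normalized known_info entry (hypothetical values)
# looks roughly like:
#   known_info['00:11:22:33:44:55'] = {
#       'hwaddr': '00:11:22:33:44:55',
#       'addresses': [...],  # sockaddr tuples for the management interface
#       'serialnumber': 'J3000000',
#       'modelnumber': '7X58CTO1WW',
#       'uuid': '...',  # DMI system UUID
#       'services': ['service:management-hardware.Lenovo:lenovo-xclarity-controller'],
#       'handler': xcc,  # module selected from nodehandlers
#       'discostatus': 'unidentified',  # or 'identified'/'discovered'
#   }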
#TODO: by serial, by uuid, by node
known_info = {}
known_services = {}
known_serials = {}
known_uuids = nesteddict()
known_nodes = nesteddict()
unknown_info = {}
pending_nodes = {}
pending_by_uuid = {}
def enrich_pxe_info(info):
sn = None
mn = None
nodename = info.get('nodename', None)
uuid = info.get('uuid', '')
if not uuid_is_valid(uuid):
return info
    for mac in known_uuids.get(uuid, {}):
        if not sn and 'serialnumber' in known_uuids[uuid][mac]:
            sn = known_uuids[uuid][mac]['serialnumber']
            info['serialnumber'] = sn
        if not mn and 'modelnumber' in known_uuids[uuid][mac]:
            mn = known_uuids[uuid][mac]['modelnumber']
            info['modelnumber'] = mn
        if nodename is None and 'nodename' in known_uuids[uuid][mac]:
            nodename = known_uuids[uuid][mac]['nodename']
            info['nodename'] = nodename
    return info
def uuid_is_valid(uuid):
if not uuid:
return False
return uuid.lower() not in ('00000000-0000-0000-0000-000000000000',
'ffffffff-ffff-ffff-ffff-ffffffffffff',
'00112233-4455-6677-8899-aabbccddeeff',
'20202020-2020-2020-2020-202020202020')
def _printable_ip(sa):
return nsocket.getnameinfo(
sa, nsocket.NI_NUMERICHOST|nsocket.NI_NUMERICSERV)[0]
def send_discovery_datum(info):
addresses = info.get('addresses', [])
if info['handler'] == pxeh:
enrich_pxe_info(info)
yield msg.KeyValueData({'nodename': info.get('nodename', '')})
yield msg.KeyValueData({'ipaddrs': [_printable_ip(x) for x in addresses]})
sn = info.get('serialnumber', '')
mn = info.get('modelnumber', '')
uuid = info.get('uuid', '')
if uuid:
relatedmacs = []
for mac in known_uuids.get(uuid, {}):
if mac and mac != info.get('hwaddr', ''):
relatedmacs.append(mac)
if relatedmacs:
yield msg.KeyValueData({'relatedmacs': relatedmacs})
yield msg.KeyValueData({'serialnumber': sn})
yield msg.KeyValueData({'modelnumber': mn})
yield msg.KeyValueData({'uuid': uuid})
if 'enclosure.bay' in info:
yield msg.KeyValueData({'bay': int(info['enclosure.bay'])})
yield msg.KeyValueData({'macs': [info.get('hwaddr', '')]})
types = []
for infotype in info.get('services', []):
if infotype in servicenames:
types.append(servicenames[infotype])
yield msg.KeyValueData({'types': types})
if 'otheraddresses' in info:
yield msg.KeyValueData({'otheripaddrs': list(info['otheraddresses'])})
def _info_matches(info, criteria):
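    # criteria is a dict of optional selectors; hypothetical example:
    #   {'by-type': 'service:lenovo-tsm', 'by-state': 'discovered'}
    # (by-type values arrive here already mapped through servicebyname)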
model = criteria.get('by-model', None)
devtype = criteria.get('by-type', None)
node = criteria.get('by-node', None)
serial = criteria.get('by-serial', None)
status = criteria.get('by-state', None)
uuid = criteria.get('by-uuid', None)
if model and info.get('modelnumber', None) != model:
return False
if devtype and devtype not in info.get('services', []):
return False
if node and info.get('nodename', None) != node:
return False
if serial and info.get('serialnumber', None) != serial:
return False
if status and info.get('discostatus', None) != status:
return False
if uuid and info.get('uuid', None) != uuid:
return False
return True
def list_matching_nodes(criteria):
retnodes = []
for node in known_nodes:
for mac in known_nodes[node]:
info = known_info[mac]
if _info_matches(info, criteria):
retnodes.append(node)
break
retnodes.sort(key=noderange.humanify_nodename)
return [msg.ChildCollection(node + '/') for node in retnodes]
def list_matching_serials(criteria):
for serial in sorted(list(known_serials)):
info = known_serials[serial]
if _info_matches(info, criteria):
yield msg.ChildCollection(serial + '/')
def list_matching_uuids(criteria):
for uuid in sorted(list(known_uuids)):
for mac in known_uuids[uuid]:
info = known_uuids[uuid][mac]
if _info_matches(info, criteria):
yield msg.ChildCollection(uuid + '/')
break
def list_matching_states(criteria):
return [msg.ChildCollection(x) for x in ('discovered/', 'identified/',
'unidentified/')]
def list_matching_macs(criteria):
for mac in sorted(list(known_info)):
info = known_info[mac]
if _info_matches(info, criteria):
yield msg.ChildCollection(mac.replace(':', '-'))
def list_matching_types(criteria):
rettypes = []
for infotype in known_services:
typename = servicenames[infotype]
if ('by-model' not in criteria or
criteria['by-model'] in known_services[infotype]):
rettypes.append(typename)
return [msg.ChildCollection(typename + '/')
for typename in sorted(rettypes)]
def list_matching_models(criteria):
for model in sorted(list(detected_models())):
if ('by-type' not in criteria or
model in known_services[criteria['by-type']]):
yield msg.ChildCollection(model + '/')
def show_info(mac):
mac = mac.replace('-', ':')
if mac not in known_info:
raise exc.NotFoundException(mac + ' not a known mac address')
for i in send_discovery_datum(known_info[mac]):
yield i
list_info = {
'by-node': list_matching_nodes,
'by-serial': list_matching_serials,
'by-type': list_matching_types,
'by-model': list_matching_models,
'by-mac': list_matching_macs,
'by-state': list_matching_states,
'by-uuid': list_matching_uuids,
}
multi_selectors = set([
'by-type',
'by-model',
'by-state',
'by-uuid',
])
node_selectors = set([
'by-node',
'by-serial',
])
single_selectors = set([
'by-mac',
])
def _parameterize_path(pathcomponents):
listrequested = False
childcoll = True
if len(pathcomponents) % 2 == 1:
listrequested = pathcomponents[-1]
pathcomponents = pathcomponents[:-1]
pathit = iter(pathcomponents)
keyparams = {}
validselectors = multi_selectors | node_selectors | single_selectors
for key, val in zip(pathit, pathit):
if key not in validselectors:
raise exc.NotFoundException('{0} is not valid here'.format(key))
if key == 'by-type':
keyparams[key] = servicebyname.get(val, '!!!!invalid-type')
else:
keyparams[key] = val
validselectors.discard(key)
if key in single_selectors:
childcoll = False
validselectors = set([])
elif key in node_selectors:
validselectors = single_selectors | set([])
return validselectors, keyparams, listrequested, childcoll
def handle_autosense_config(operation, inputdata):
autosense = cfm.get_global('discovery.autosense')
autosense = autosense or autosense is None
if operation == 'retrieve':
yield msg.KeyValueData({'enabled': autosense})
elif operation == 'update':
enabled = inputdata['enabled']
if type(enabled) in (unicode, bytes):
enabled = enabled.lower() in ('true', '1', 'y', 'yes', 'enable',
'enabled')
if autosense == enabled:
return
cfm.set_global('discovery.autosense', enabled)
if enabled:
start_autosense()
else:
stop_autosense()
def handle_api_request(configmanager, inputdata, operation, pathcomponents):
if pathcomponents == ['discovery', 'autosense']:
return handle_autosense_config(operation, inputdata)
if operation == 'retrieve':
return handle_read_api_request(pathcomponents)
elif (operation in ('update', 'create') and
pathcomponents == ['discovery', 'rescan']):
if inputdata != {'rescan': 'start'}:
raise exc.InvalidArgumentException()
rescan()
return (msg.KeyValueData({'rescan': 'started'}),)
elif operation in ('update', 'create'):
if 'node' not in inputdata:
raise exc.InvalidArgumentException('Missing node name in input')
mac = _get_mac_from_query(pathcomponents)
info = known_info[mac]
if info['handler'] is None:
raise exc.NotImplementedException(
'Unable to {0} to {1}'.format(operation,
'/'.join(pathcomponents)))
handler = info['handler'].NodeHandler(info, configmanager)
try:
eval_node(configmanager, handler, info, inputdata['node'],
manual=True)
except Exception as e:
            # or... incorrect password provided..
if 'Incorrect password' in str(e) or 'Unauthorized name' in str(e):
return [msg.ConfluentTargetInvalidCredentials(
inputdata['node'])]
raise
return [msg.AssignedResource(inputdata['node'])]
elif operation == 'delete':
mac = _get_mac_from_query(pathcomponents)
del known_info[mac]
return [msg.DeletedResource(mac)]
raise exc.NotImplementedException(
'Unable to {0} to {1}'.format(operation, '/'.join(pathcomponents)))
def _get_mac_from_query(pathcomponents):
_, queryparms, _, _ = _parameterize_path(pathcomponents[1:])
if 'by-mac' not in queryparms:
raise exc.InvalidArgumentException('Must target using "by-mac"')
mac = queryparms['by-mac'].replace('-', ':')
if mac not in known_info:
raise exc.NotFoundException('{0} not found'.format(mac))
return mac
def handle_read_api_request(pathcomponents):
# TODO(jjohnson2): This should be more generalized...
# odd indexes into components are 'by-'*, even indexes
# starting at 2 are parameters to previous index
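    # For illustration, a hypothetical request path following that layout:
    #   ['discovery', 'by-type', 'lenovo-xcc', 'by-mac', '00-11-22-33-44-55']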
if pathcomponents == ['discovery', 'rescan']:
return (msg.KeyValueData({'scanning': bool(scanner)}),)
subcats, queryparms, indexof, coll = _parameterize_path(pathcomponents[1:])
if len(pathcomponents) == 1:
dirlist = [msg.ChildCollection(x + '/') for x in sorted(list(subcats))]
dirlist.append(msg.ChildCollection('rescan'))
dirlist.append(msg.ChildCollection('autosense'))
return dirlist
if not coll:
return show_info(queryparms['by-mac'])
if not indexof:
return [msg.ChildCollection(x + '/') for x in sorted(list(subcats))]
if indexof not in list_info:
raise exc.NotFoundException('{0} is not found'.format(indexof))
return list_info[indexof](queryparms)
def detected_services():
for srv in known_services:
yield servicenames[srv]
def detected_models():
knownmodels = set([])
for info in known_info:
info = known_info[info]
if 'modelnumber' in info and info['modelnumber'] not in knownmodels:
knownmodels.add(info['modelnumber'])
yield info['modelnumber']
def _recheck_nodes(nodeattribs, configmanager):
if rechecklock.locked():
# if already in progress, don't run again
# it may make sense to schedule a repeat, but will try the easier and less redundant way first
return
with rechecklock:
return _recheck_nodes_backend(nodeattribs, configmanager)
def _recheck_nodes_backend(nodeattribs, configmanager):
global rechecker
_map_unique_ids(nodeattribs)
# for the nodes whose attributes have changed, consider them as potential
# strangers
if nodeattribs:
macmap.vintage = 0 # expire current mac map data, in case
        # the changed attributes impacted the result
for node in nodeattribs:
if node in known_nodes:
for somemac in known_nodes[node]:
unknown_info[somemac] = known_nodes[node][somemac]
unknown_info[somemac]['discostatus'] = 'unidentified'
# Now we go through ones we did not find earlier
for mac in list(unknown_info):
try:
_recheck_single_unknown(configmanager, mac)
except Exception:
traceback.print_exc()
continue
# now we go through ones that were identified, but could not pass
# policy or hadn't been able to verify key
for nodename in pending_nodes:
info = pending_nodes[nodename]
try:
if info['handler'] is None:
                continue
handler = info['handler'].NodeHandler(info, configmanager)
discopool.spawn_n(eval_node, configmanager, handler, info, nodename)
except Exception:
traceback.print_exc()
log.log({'error': 'Unexpected error during discovery of {0}, check debug '
'logs'.format(nodename)})
def _recheck_single_unknown(configmanager, mac):
info = unknown_info.get(mac, None)
_recheck_single_unknown_info(configmanager, info)
def _recheck_single_unknown_info(configmanager, info):
global rechecker
global rechecktime
if not info or info['handler'] is None:
return
if info['handler'] != pxeh and not info.get('addresses', None):
#log.log({'info': 'Missing address information in ' + repr(info)})
return
handler = info['handler'].NodeHandler(info, configmanager)
if handler.https_supported and not handler.https_cert:
if handler.cert_fail_reason == 'unreachable':
log.log(
{
'info': '{0} with hwaddr {1} is not reachable at {2}'
''.format(
handler.devname, info['hwaddr'], handler.ipaddr
)})
# addresses data is bad, delete the offending ip
info['addresses'] = [x for x in info.get('addresses', []) if x != handler.ipaddr]
# TODO(jjohnson2): rescan due to bad peer addr data?
# not just wait around for the next announce
return
log.log(
{
'info': '{0} with hwaddr {1} at address {2} is not yet running '
'https, will examine later'.format(
handler.devname, info['hwaddr'], handler.ipaddr
)})
if rechecker is not None and rechecktime > util.monotonic_time() + 300:
rechecker.cancel()
# if cancel did not result in dead, then we are in progress
if rechecker is None or rechecker.dead:
rechecktime = util.monotonic_time() + 300
rechecker = eventlet.spawn_after(300, _periodic_recheck,
configmanager)
return
nodename, info['maccount'] = get_nodename(configmanager, handler, info)
if nodename:
if handler.https_supported:
dp = configmanager.get_node_attributes([nodename],
('pubkeys.tls_hardwaremanager',))
lastfp = dp.get(nodename, {}).get('pubkeys.tls_hardwaremanager',
{}).get('value', None)
if util.cert_matches(lastfp, handler.https_cert):
info['nodename'] = nodename
known_nodes[nodename][info['hwaddr']] = info
info['discostatus'] = 'discovered'
return # already known, no need for more
discopool.spawn_n(eval_node, configmanager, handler, info, nodename)
def safe_detected(info):
if 'hwaddr' not in info or not info['hwaddr']:
return
if info['hwaddr'] in runningevals:
# Do not evaluate the same mac multiple times at once
return
runningevals[info['hwaddr']] = discopool.spawn(eval_detected, info)
def eval_detected(info):
try:
detected(info)
except Exception as e:
traceback.print_exc()
del runningevals[info['hwaddr']]
def detected(info):
global rechecker
global rechecktime
# later, manual and CMM discovery may act on SN and/or UUID
for service in info['services']:
if service in nodehandlers:
if service not in known_services:
known_services[service] = set([])
handler = nodehandlers[service]
info['handler'] = handler
break
else: # no nodehandler, ignore for now
return
if (handler and not handler.NodeHandler.adequate(info) and
info.get('protocol', None)):
eventlet.spawn_after(10, info['protocol'].fix_info, info,
safe_detected)
return
try:
snum = info['attributes']['enclosure-serial-number'][0].strip()
if snum:
info['serialnumber'] = snum
known_serials[info['serialnumber']] = info
except (KeyError, IndexError):
pass
try:
info['modelnumber'] = info['attributes']['enclosure-machinetype-model'][0]
known_services[service].add(info['modelnumber'])
except (KeyError, IndexError):
pass
if info['hwaddr'] in known_info and 'addresses' in info:
# we should tee these up for parsing when an enclosure comes up
# also when switch config parameters change, should discard
# and there's also if wiring is fixed...
# of course could periodically revisit known_nodes
# replace potentially stale address info
#TODO(jjohnson2): remove this
# temporary workaround for XCC not doing SLP DA over dedicated port
# bz 93219, fix submitted, but not in builds yet
        # strictly speaking, a device legitimately going ipv4 only is
        # mistreated here, but that should be an edge case
oldaddr = known_info[info['hwaddr']].get('addresses', [])
for addr in info['addresses']:
if addr[0].startswith('fe80::'):
break
else:
for addr in oldaddr:
if addr[0].startswith('fe80::'):
info['addresses'].append(addr)
if known_info[info['hwaddr']].get(
'addresses', []) == info['addresses']:
# if the ip addresses match, then assume no changes
            # now something resetting to defaults could, in theory,
            # have the same address, but still needs to be reset;
            # in that case, however, a user can clear pubkeys to force a check
return
known_info[info['hwaddr']] = info
cfg = cfm.ConfigManager(None)
if handler:
handler = handler.NodeHandler(info, cfg)
handler.scan()
uuid = info.get('uuid', None)
if uuid_is_valid(uuid):
known_uuids[uuid][info['hwaddr']] = info
info['otheraddresses'] = set([])
for i4addr in info.get('attributes', {}).get('ipv4-address', []):
info['otheraddresses'].add(i4addr)
if handler and handler.https_supported and not handler.https_cert:
if handler.cert_fail_reason == 'unreachable':
log.log(
{
'info': '{0} with hwaddr {1} is not reachable by https '
'at address {2}'.format(
handler.devname, info['hwaddr'], handler.ipaddr
)})
info['addresses'] = [x for x in info.get('addresses', []) if x != handler.ipaddr]
return
log.log(
{'info': '{0} with hwaddr {1} at address {2} is not yet running '
'https, will examine later'.format(
handler.devname, info['hwaddr'], handler.ipaddr
)})
if rechecker is not None and rechecktime > util.monotonic_time() + 300:
rechecker.cancel()
if rechecker is None or rechecker.dead:
rechecktime = util.monotonic_time() + 300
rechecker = eventlet.spawn_after(300, _periodic_recheck, cfg)
unknown_info[info['hwaddr']] = info
        info['discostatus'] = 'unidentified'
#TODO, eventlet spawn after to recheck sooner, or somehow else
# influence periodic recheck to shorten delay?
return
nodename, info['maccount'] = get_nodename(cfg, handler, info)
if nodename and handler and handler.https_supported:
dp = cfg.get_node_attributes([nodename],
('pubkeys.tls_hardwaremanager',))
lastfp = dp.get(nodename, {}).get('pubkeys.tls_hardwaremanager',
{}).get('value', None)
if util.cert_matches(lastfp, handler.https_cert):
info['nodename'] = nodename
known_nodes[nodename][info['hwaddr']] = info
info['discostatus'] = 'discovered'
return # already known, no need for more
#TODO(jjohnson2): We might have to get UUID for certain searches...
#for now defer probe until inside eval_node. We might not have
#a nodename without probe in the future.
if nodename and handler:
eval_node(cfg, handler, info, nodename)
elif handler:
#log.log(
# {'info': 'Detected unknown {0} with hwaddr {1} at '
# 'address {2}'.format(
# handler.devname, info['hwaddr'], handler.ipaddr
# )})
info['discostatus'] = 'unidentified'
unknown_info[info['hwaddr']] = info
def b64tohex(b64str):
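    # Convert a base64 string to a lowercase hex string; for example,
    # b64tohex('AAEC') returns '000102'.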
bd = base64.b64decode(b64str)
bd = bytearray(bd)
return ''.join(['{0:02x}'.format(x) for x in bd])
def get_enclosure_chain_head(nodename, cfg):
ne = True
members = [nodename]
while ne:
ne = cfg.get_node_attributes(
nodename, 'enclosure.extends').get(nodename, {}).get(
'enclosure.extends', {}).get('value', None)
if not ne:
return nodename
if ne in members:
raise exc.InvalidArgumentException(
'Circular chain that includes ' + nodename)
if not cfg.is_node(ne):
raise exc.InvalidArgumentException(
'{0} is chained to nonexistent node {1} '.format(
nodename, ne))
nodename = ne
members.append(nodename)
return nodename
def get_chained_smm_name(nodename, cfg, handler, nl=None, checkswitch=True):
# nodename is the head of the chain, cfg is a configmanager, handler
# is the handler of the current candidate, nl is optional indication
# of the next link in the chain, checkswitch can disable the switch
# search if not indicated by current situation
# returns the new name and whether it has been securely validated or not
# first we check to see if directly connected
mycert = handler.https_cert
if checkswitch:
fprints = macmap.get_node_fingerprints(nodename, cfg)
for fprint in fprints:
if util.cert_matches(fprint[0], mycert):
# ok we have a direct match, it is this node
return nodename, fprint[1]
# ok, unable to get it, need to traverse the chain from the beginning
if not nl:
nl = list(cfg.filter_node_attributes(
'enclosure.extends=' + nodename))
while nl:
if len(nl) != 1:
raise exc.InvalidArgumentException('Multiple enclosures trying to '
'extend a single enclosure')
cd = cfg.get_node_attributes(nodename, ['hardwaremanagement.manager',
'pubkeys.tls_hardwaremanager'])
smmaddr = cd[nodename]['hardwaremanagement.manager']['value']
pkey = cd[nodename].get('pubkeys.tls_hardwaremanager', {}).get(
'value', None)
if not pkey:
# We cannot continue through a break in the chain
return None, False
if pkey:
cv = util.TLSCertVerifier(
cfg, nodename, 'pubkeys.tls_hardwaremanager').verify_cert
for fprint in get_smm_neighbor_fingerprints(smmaddr, cv):
if util.cert_matches(fprint, mycert):
# a trusted chain member vouched for the cert
# so it's validated
return nl[0], True
# advance down the chain by one and try again
nodename = nl[0]
nl = list(cfg.filter_node_attributes(
'enclosure.extends=' + nodename))
return None, False
def get_smm_neighbor_fingerprints(smmaddr, cv):
if ':' in smmaddr:
smmaddr = '[{0}]'.format(smmaddr)
wc = webclient.SecureHTTPConnection(smmaddr, verifycallback=cv)
neighs = wc.grab_json_response('/scripts/neighdata.json')
if not neighs:
return
for idx in (4, 5):
if 'sha256' not in neighs[idx]:
continue
yield 'sha256$' + b64tohex(neighs[idx]['sha256'])
def get_nodename(cfg, handler, info):
nodename = None
maccount = None
info['verified'] = False
if not handler:
return None, None
if handler.https_supported:
currcert = handler.https_cert
if not currcert:
info['discofailure'] = 'nohttps'
return None, None
currprint = util.get_fingerprint(currcert, 'sha256')
nodename = nodes_by_fprint.get(currprint, None)
if not nodename:
# Try SHA512 as well
currprint = util.get_fingerprint(currcert)
nodename = nodes_by_fprint.get(currprint, None)
if not nodename:
curruuid = info.get('uuid', None)
if uuid_is_valid(curruuid):
nodename = nodes_by_uuid.get(curruuid, None)
if nodename is None:
_map_unique_ids()
nodename = nodes_by_uuid.get(curruuid, None)
if not nodename:
# Ok, see if it is something with a chassis-uuid and discover by
# chassis
nodename = get_nodename_from_enclosures(cfg, info)
if not nodename and handler.devname == 'SMM':
nodename = get_nodename_from_chained_smms(cfg, handler, info)
if not nodename: # as a last resort, search switches for info
# This is the slowest potential operation, so we hope for the
# best to occur prior to this
nodename, macinfo = macmap.find_nodeinfo_by_mac(info['hwaddr'], cfg)
maccount = macinfo['maccount']
if nodename:
if handler.devname == 'SMM':
nl = list(cfg.filter_node_attributes(
'enclosure.extends=' + nodename))
if nl:
# We found an SMM, and it's in a chain per configuration
# we need to ask the switch for the fingerprint to see
# if we have a match or not
newnodename, v = get_chained_smm_name(nodename, cfg,
handler, nl)
if newnodename:
# while this started by switch, it was disambiguated
info['verified'] = v
return newnodename, None
if (nodename and
not handler.discoverable_by_switch(macinfo['maccount'])):
if handler.devname == 'SMM':
errorstr = 'Attempt to discover SMM by switch, but chained ' \
'topology or incorrect net attributes detected, ' \
'which is not compatible with switch discovery ' \
'of SMM, nodename would have been ' \
'{0}'.format(nodename)
log.log({'error': errorstr})
return None, None
return nodename, maccount
def get_nodename_from_chained_smms(cfg, handler, info):
nodename = None
for fprint in get_smm_neighbor_fingerprints(
handler.ipaddr, lambda x: True):
if fprint in nodes_by_fprint:
# need to chase the whole chain
# to support either direction
chead = get_enclosure_chain_head(nodes_by_fprint[fprint],
cfg)
newnodename, v = get_chained_smm_name(
chead, cfg, handler, checkswitch=False)
if newnodename:
info['verified'] = v
nodename = newnodename
return nodename
def get_node_by_uuid(uuid):
return nodes_by_uuid.get(uuid, None)
def get_nodename_from_enclosures(cfg, info):
nodename = None
cuuid = info.get('attributes', {}).get('chassis-uuid', [None])[0]
if cuuid and cuuid in nodes_by_uuid:
encl = nodes_by_uuid[cuuid]
bay = info.get('enclosure.bay', None)
if bay:
tnl = cfg.filter_node_attributes('enclosure.manager=' + encl)
tnl = list(
cfg.filter_node_attributes('enclosure.bay={0}'.format(bay),
tnl))
if len(tnl) == 1:
# This is not a secure assurance, because it's by
# uuid instead of a key
nodename = tnl[0]
return nodename
def eval_node(cfg, handler, info, nodename, manual=False):
try:
handler.probe() # unicast interrogation as possible to get more data
# switch concurrently
# do some preconfig, for example, to bring a SMM online if applicable
handler.preconfig(nodename)
except Exception as e:
unknown_info[info['hwaddr']] = info
info['discostatus'] = 'unidentified'
        errorstr = 'An error occurred during discovery, check the ' \
'trace and stderr logs, mac was {0} and ip was {1}' \
', the node or the containing enclosure was {2}' \
''.format(info['hwaddr'], handler.ipaddr, nodename)
traceback.print_exc()
if manual:
raise exc.InvalidArgumentException(errorstr)
log.log({'error': errorstr})
return
    # first, if it had a bay, it was in an enclosure. If it was discovered by
# switch, it is probably the enclosure manager and not
# the node directly. switch is ambiguous and we should leave it alone
if 'enclosure.bay' in info and handler.is_enclosure:
unknown_info[info['hwaddr']] = info
info['discostatus'] = 'unidentified'
log.log({'error': 'Something that is an enclosure reported a bay, '
'not possible'})
if manual:
raise exc.InvalidArgumentException()
return
nl = list(cfg.filter_node_attributes('enclosure.manager=' + nodename))
if not handler.is_enclosure and nl:
# The specified node is an enclosure (has nodes mapped to it), but
# what we are talking to is *not* an enclosure
# might be ambiguous, need to match chassis-uuid as well..
if 'enclosure.bay' not in info:
unknown_info[info['hwaddr']] = info
info['discostatus'] = 'unidentified'
errorstr = '{2} with mac {0} is in {1}, but unable to ' \
'determine bay number'.format(info['hwaddr'],
nodename,
handler.ipaddr)
if manual:
raise exc.InvalidArgumentException(errorstr)
log.log({'error': errorstr})
return
enl = list(cfg.filter_node_attributes('enclosure.extends=' + nodename))
if enl:
# ambiguous SMM situation according to the configuration, we need
# to match uuid
encuuid = info['attributes'].get('chassis-uuid', None)
if encuuid:
encuuid = encuuid[0]
enl = list(cfg.filter_node_attributes('id.uuid=' + encuuid))
if len(enl) != 1:
# errorstr = 'No SMM by given UUID known, *yet*'
# if manual:
# raise exc.InvalidArgumentException(errorstr)
# log.log({'error': errorstr})
if encuuid in pending_by_uuid:
pending_by_uuid[encuuid].append(info)
else:
pending_by_uuid[encuuid] = [info]
return
# We found the real smm, replace the list with the actual smm
# to continue
nl = list(cfg.filter_node_attributes(
'enclosure.manager=' + enl[0]))
else:
errorstr = 'Chained SMM configuration with older XCC, ' \
'unable to perform zero power discovery'
if manual:
raise exc.InvalidArgumentException(errorstr)
log.log({'error': errorstr})
return
# search for nodes fitting our description using filters
# lead with the most specific to have a small second pass
nl = list(cfg.filter_node_attributes(
'enclosure.bay={0}'.format(info['enclosure.bay']), nl))
if len(nl) != 1:
info['discofailure'] = 'ambigconfig'
if len(nl):
errorstr = 'The following nodes have duplicate ' \
'enclosure attributes: ' + ','.join(nl)
else:
errorstr = 'The {0} in enclosure {1} bay {2} does not ' \
'seem to be a defined node ({3})'.format(
handler.devname, nodename,
info['enclosure.bay'],
handler.ipaddr,
)
if manual:
raise exc.InvalidArgumentException(errorstr)
log.log({'error': errorstr})
unknown_info[info['hwaddr']] = info
info['discostatus'] = 'unidentified'
return
nodename = nl[0]
if not discover_node(cfg, handler, info, nodename, manual):
# store it as pending, assuming blocked on enclosure
# assurance...
pending_nodes[nodename] = info
else:
# we can and did accurately discover by switch or in enclosure
# but... is this really ok? could be on an upstream port or
# erroneously put in the enclosure with no nodes yet
# so first, see if the candidate node is a chain host
if not manual:
if info.get('maccount', False):
# discovery happened through switch
nl = list(cfg.filter_node_attributes(
'enclosure.extends=' + nodename))
                if nl:
                    # The candidate nodename is the head of a chain, we must
                    # validate the smm certificate by the switch
                    # (get_node_fingerprints yields (fingerprint, verified))
                    fprints = macmap.get_node_fingerprints(nodename, cfg)
                    for fprint in fprints:
                        if util.cert_matches(fprint[0], handler.https_cert):
                            break
                    else:
                        # switch did not vouch for the certificate, defer
                        return
if (info.get('maccount', False) and
not handler.discoverable_by_switch(info['maccount'])):
errorstr = 'The detected node {0} was detected using switch, ' \
'however the relevant port has too many macs learned ' \
'for this type of device ({1}) to be discovered by ' \
'switch.'.format(nodename, handler.devname)
log.log({'error': errorstr})
return
if not discover_node(cfg, handler, info, nodename, manual):
pending_nodes[nodename] = info
def discover_node(cfg, handler, info, nodename, manual):
known_nodes[nodename][info['hwaddr']] = info
if info['hwaddr'] in unknown_info:
del unknown_info[info['hwaddr']]
info['discostatus'] = 'identified'
dp = cfg.get_node_attributes(
[nodename], ('discovery.policy',
'pubkeys.tls_hardwaremanager'))
policy = dp.get(nodename, {}).get('discovery.policy', {}).get(
'value', None)
if policy is None:
policy = ''
policies = set(policy.split(','))
lastfp = dp.get(nodename, {}).get('pubkeys.tls_hardwaremanager',
{}).get('value', None)
# TODO(jjohnson2): permissive requires we guarantee storage of
# the pubkeys, which is deferred for a little bit
# Also, 'secure', when we have the needed infrastructure done
# in some product or another.
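    # For reference: discovery.policy is a comma separated set of values;
    # 'open', 'permissive', and 'pxe' are the values acted on below
    # (e.g. a hypothetical attribute value of 'permissive,pxe').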
curruuid = info.get('uuid', False)
if 'pxe' in policies and info['handler'] == pxeh:
return do_pxe_discovery(cfg, handler, info, manual, nodename, policies)
elif ('permissive' in policies and handler.https_supported and lastfp and
not util.cert_matches(lastfp, handler.https_cert) and not manual):
info['discofailure'] = 'fingerprint'
log.log({'info': 'Detected replacement of {0} with existing '
'fingerprint and permissive discovery policy, not '
'doing discovery unless discovery.policy=open or '
'pubkeys.tls_hardwaremanager attribute is cleared '
'first'.format(nodename)})
return False # With a permissive policy, do not discover new
elif policies & set(('open', 'permissive')) or manual:
info['nodename'] = nodename
if info['handler'] == pxeh:
return do_pxe_discovery(cfg, handler, info, manual, nodename, policies)
elif manual or not util.cert_matches(lastfp, handler.https_cert):
# only 'discover' if it is not the same as last time
try:
handler.config(nodename)
except Exception as e:
info['discofailure'] = 'bug'
if manual:
raise
log.log(
{'error':
'Error encountered trying to set up {0}, {1}'.format(
nodename, str(e))})
traceback.print_exc()
return False
newnodeattribs = {}
if list(cfm.list_collective()):
# We are in a collective, check collective.manager
cmc = cfg.get_node_attributes(nodename, 'collective.manager')
cm = cmc.get(nodename, {}).get('collective.manager', {}).get('value', None)
if not cm:
# Node is being discovered in collective, but no collective.manager, default
# to the collective member actually able to execute the discovery
newnodeattribs['collective.manager'] = collective.get_myname()
if 'uuid' in info:
newnodeattribs['id.uuid'] = info['uuid']
if 'serialnumber' in info:
newnodeattribs['id.serial'] = info['serialnumber']
if 'modelnumber' in info:
newnodeattribs['id.model'] = info['modelnumber']
if handler.https_cert:
newnodeattribs['pubkeys.tls_hardwaremanager'] = \
util.get_fingerprint(handler.https_cert, 'sha256')
if newnodeattribs:
cfg.set_node_attributes({nodename: newnodeattribs})
log.log({'info': 'Discovered {0} ({1})'.format(nodename,
handler.devname)})
info['discostatus'] = 'discovered'
for i in pending_by_uuid.get(curruuid, []):
eventlet.spawn_n(_recheck_single_unknown_info, cfg, i)
try:
del pending_by_uuid[curruuid]
except KeyError:
pass
return True
log.log({'info': 'Detected {0}, but discovery.policy is not set to a '
'value allowing discovery (open or permissive)'.format(
nodename)})
info['discofailure'] = 'policy'
return False
def do_pxe_discovery(cfg, handler, info, manual, nodename, policies):
# use uuid based scheme in lieu of tls cert, ideally only
# for stateless 'discovery' targets like pxe, where data does not
# change
uuidinfo = cfg.get_node_attributes(nodename, ['id.uuid', 'id.serial', 'id.model', 'net*.bootable'])
if manual or policies & set(('open', 'pxe')):
enrich_pxe_info(info)
attribs = {}
olduuid = uuidinfo.get(nodename, {}).get('id.uuid', None)
uuid = info.get('uuid', None)
if uuid and uuid != olduuid:
attribs['id.uuid'] = info['uuid']
sn = info.get('serialnumber', None)
mn = info.get('modelnumber', None)
if sn and sn != uuidinfo.get(nodename, {}).get('id.serial', None):
attribs['id.serial'] = sn
if mn and mn != uuidinfo.get(nodename, {}).get('id.model', None):
attribs['id.model'] = mn
for attrname in uuidinfo.get(nodename, {}):
if attrname.endswith('.bootable') and uuidinfo[nodename][attrname].get('value', None):
newattrname = attrname[:-8] + 'hwaddr'
attribs[newattrname] = info['hwaddr']
if attribs:
cfg.set_node_attributes({nodename: attribs})
if info['uuid'] in known_pxe_uuids:
return True
if uuid_is_valid(info['uuid']):
known_pxe_uuids[info['uuid']] = nodename
#log.log({'info': 'Detected {0} ({1} with mac {2})'.format(
# nodename, handler.devname, info['hwaddr'])})
return True
attribwatcher = None
nodeaddhandler = None
needaddhandled = False
def _handle_nodelist_change(configmanager):
global needaddhandled
global nodeaddhandler
macmap.vintage = 0 # the current mac map is probably inaccurate
_recheck_nodes((), configmanager)
if needaddhandled:
needaddhandled = False
nodeaddhandler = eventlet.spawn(_handle_nodelist_change, configmanager)
else:
nodeaddhandler = None
def newnodes(added, deleting, renamed, configmanager):
global attribwatcher
global needaddhandled
global nodeaddhandler
alldeleting = set(deleting) | set(renamed)
for node in alldeleting:
if node not in known_nodes:
continue
for mac in known_nodes[node]:
if mac in known_info:
del known_info[mac]
del known_nodes[node]
_map_unique_ids()
configmanager.remove_watcher(attribwatcher)
allnodes = configmanager.list_nodes()
attribwatcher = configmanager.watch_attributes(
allnodes, ('discovery.policy', 'net*.switch',
'hardwaremanagement.manager', 'net*.switchport',
'id.uuid', 'pubkeys.tls_hardwaremanager',
'net*.bootable'), _recheck_nodes)
if nodeaddhandler:
needaddhandled = True
else:
nodeaddhandler = eventlet.spawn(_handle_nodelist_change, configmanager)
rechecker = None
rechecktime = None
rechecklock = eventlet.semaphore.Semaphore()
def _periodic_recheck(configmanager):
global rechecker
global rechecktime
rechecker = None
try:
_recheck_nodes((), configmanager)
except Exception:
traceback.print_exc()
log.log({'error': 'Unexpected error during discovery, check debug '
'logs'})
# if rechecker is set, it means that an accelerated schedule
# for rechecker was requested in the course of recheck_nodes
if rechecker is None:
rechecktime = util.monotonic_time() + 900
rechecker = eventlet.spawn_after(900, _periodic_recheck,
configmanager)
def rescan():
_map_unique_ids()
global scanner
if scanner:
return
else:
scanner = eventlet.spawn(blocking_scan)
def blocking_scan():
global scanner
slpscan = eventlet.spawn(slp.active_scan, safe_detected, slp)
ssdpscan = eventlet.spawn(ssdp.active_scan, safe_detected, ssdp)
slpscan.wait()
ssdpscan.wait()
scanner = None
def start_detection():
global attribwatcher
global rechecker
global rechecktime
_map_unique_ids()
cfg = cfm.ConfigManager(None)
allnodes = cfg.list_nodes()
attribwatcher = cfg.watch_attributes(
allnodes, ('discovery.policy', 'net*.switch',
'hardwaremanagement.manager', 'net*.switchport', 'id.uuid',
'pubkeys.tls_hardwaremanager'), _recheck_nodes)
cfg.watch_nodecollection(newnodes)
autosense = cfm.get_global('discovery.autosense')
if autosense or autosense is None:
start_autosense()
if rechecker is None:
rechecktime = util.monotonic_time() + 900
rechecker = eventlet.spawn_after(900, _periodic_recheck, cfg)
eventlet.spawn_n(ssdp.snoop, None, None, ssdp, get_node_by_uuid)
def stop_autosense():
for watcher in list(autosensors):
watcher.kill()
autosensors.discard(watcher)
def start_autosense():
autosensors.add(eventlet.spawn(slp.snoop, safe_detected, slp))
autosensors.add(eventlet.spawn(pxe.snoop, safe_detected, pxe))
nodes_by_fprint = {}
nodes_by_uuid = {}
known_pxe_uuids = {}
def _map_unique_ids(nodes=None):
global nodes_by_uuid
global nodes_by_fprint
    # Map current known ids based on uuid and fingerprints for fast lookup
    cfg = cfm.ConfigManager(None)
    if nodes is None:
        nodes_by_uuid = {}
        nodes_by_fprint = {}
        nodes = cfg.list_nodes()
bigmap = cfg.get_node_attributes(nodes,
('id.uuid',
'pubkeys.tls_hardwaremanager'))
uuid_by_nodes = {}
fprint_by_nodes = {}
for uuid in nodes_by_uuid:
if not uuid_is_valid(uuid):
continue
node = nodes_by_uuid[uuid]
if node in bigmap:
uuid_by_nodes[node] = uuid
for fprint in nodes_by_fprint:
node = nodes_by_fprint[fprint]
if node in bigmap:
fprint_by_nodes[node] = fprint
for node in bigmap:
if node in uuid_by_nodes:
del nodes_by_uuid[uuid_by_nodes[node]]
if node in fprint_by_nodes:
del nodes_by_fprint[fprint_by_nodes[node]]
uuid = bigmap[node].get('id.uuid', {}).get('value', None)
if uuid_is_valid(uuid):
nodes_by_uuid[uuid] = node
fprint = bigmap[node].get(
'pubkeys.tls_hardwaremanager', {}).get('value', None)
if fprint:
nodes_by_fprint[fprint] = node
for uuid in known_pxe_uuids:
if uuid_is_valid(uuid) and uuid not in nodes_by_uuid:
nodes_by_uuid[uuid] = known_pxe_uuids[uuid]
if __name__ == '__main__':
start_detection()
while True:
eventlet.sleep(30)