2022-08-30 20:59:40 +00:00
#!/usr/bin/python3
2022-09-29 13:57:43 +00:00
import argparse
2022-08-30 20:59:40 +00:00
import os
import socket
import glob
import ssl
import sys
2022-09-02 14:11:12 +00:00
# Locate the bundled python library directory relative to this script
# (<script dir>/../lib/python), fully resolved.
path = os.path.realpath(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'lib', 'python'))
# Only an install under /opt needs its library directory added to the
# import path; if installed into system path, do not muck with things.
if path.startswith('/opt'):
    sys.path.append(path)
2022-08-31 21:30:41 +00:00
import confluent.sshutil as sshutil
2022-08-30 20:59:40 +00:00
import confluent.certutil as certutil
2022-09-29 19:27:12 +00:00
import confluent.client as client
2022-08-30 20:59:40 +00:00
import confluent.config.configmanager as configmanager
2022-08-31 21:30:41 +00:00
import subprocess
import tempfile
import shutil
2022-09-29 13:57:43 +00:00
import eventlet.green.socket as socket
import eventlet
import greenlet
2022-08-31 21:30:41 +00:00
def fprint(txt):
    """Write *txt* to standard output without a newline and flush at once.

    Used for "label: " prompts whose result is printed later on the same
    line, so the label is visible while the check is still running.
    """
    out = sys.stdout
    out.write(txt)
    out.flush()
def tftp_works():
    """Return True when the iPXE image can be fetched from local TFTP.

    Runs curl against tftp://localhost with a 2 second connect timeout and
    discards the download.  Any failure (curl missing, non-zero exit, ...)
    yields False.
    """
    fetch = [
        'curl', '--connect-timeout', '2', '-sf',
        'tftp://localhost/confluent/x86_64/ipxe.efi', '-o', '/dev/null',
    ]
    try:
        subprocess.check_call(fetch)
    except Exception:
        return False
    return True
2022-08-30 20:59:40 +00:00
def emprint(txt):
    """Print *txt* as an emphasized line.

    When standard output is a terminal the text is wrapped in ANSI
    bold + underline escapes (reset afterwards); otherwise it is printed
    unchanged so redirected output stays free of escape codes.
    """
    emphasized = sys.stdout.isatty()
    print('\x1b[1m\x1b[4m' + txt + '\x1b[0m' if emphasized else txt)
def deployment_configured():
    """Return True when the site confluent_uuid file exists.

    The caller treats the presence of this file as the sign that OS
    deployment has been initialized.
    """
    uuid_marker = '/var/lib/confluent/public/site/confluent_uuid'
    return os.path.exists(uuid_marker)
def webserver_listening():
    """Try to connect to the local web server on port 443.

    Returns the connected socket on success (the caller reuses it for the
    certificate check) or False when the connection cannot be made.
    """
    target = ('localhost', 443)
    try:
        return socket.create_connection(target)
    except Exception:
        return False
def _normalized_san_values(sans):
    """Return the value of every subjectAltName entry in comparable form.

    A trailing newline is stripped, and any value containing ':' is passed
    through getaddrinfo so that an IPv6 address is rewritten in the
    canonical textual form.  Values that cannot be resolved that way (for
    example a URI entry, which also contains ':') are kept as-is.
    """
    values = []
    for _field, val in sans:
        if val.endswith('\n'):
            val = val[:-1]
        if ':' in val:
            # must normalize ipv6 to a sane value
            try:
                val = socket.getaddrinfo(val, 443, proto=socket.IPPROTO_TCP)[0][-1][0]
            except (OSError, UnicodeError):
                # Not an address literal; the raw text simply will not
                # match any local IP, which is the desired outcome.
                pass
        values.append(val)
    return values


def certificates_missing_ips(conn):
    """Return the local IP addresses absent from the offered certificate.

    conn: an already connected socket to the web server; it is wrapped in
        TLS here.

    The TLS context trusts only the CA files found under
    /var/lib/confluent/public/site/tls/, with hostname matching disabled,
    so the check is "signed by one of our CAs" rather than "issued for
    this name".  Verification problems surface as exceptions raised while
    wrapping the socket.

    Returns a list of the addresses reported by
    certutil.get_ip_addresses() that do not appear among the certificate's
    subjectAltName values (empty list when all are covered).
    """
    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ctx.check_hostname = False
    for cacert in glob.glob('/var/lib/confluent/public/site/tls/*.pem'):
        ctx.load_verify_locations(cacert)
    sock = ctx.wrap_socket(conn)
    crt = sock.getpeercert()
    # Normalize the SAN list once instead of once per local address
    offered = _normalized_san_values(crt.get('subjectAltName', []))
    return [ip for ip in certutil.get_ip_addresses() if ip not in offered]
2022-09-01 17:17:17 +00:00
def web_download_works():
    """Return True when confluent_uuid can be downloaded over local HTTPS.

    Runs curl (certificate verification off, fail on HTTP errors) against
    https://localhost/confluent-public/site/confluent_uuid and discards the
    body.  Any failure yields False.
    """
    fetch = [
        'curl', '-skf',
        'https://localhost/confluent-public/site/confluent_uuid',
        '-o', '/dev/null',
    ]
    try:
        subprocess.check_call(fetch)
        return True
    except Exception:
        return False
2022-08-30 20:59:40 +00:00
2022-09-30 16:17:31 +00:00
def web_api_works():
    """Return True when the web server forwards requests to the confluent API.

    Issues an unauthenticated GET (headers only, certificate verification
    off) for https://localhost/confluent-api/ via curl and looks for 401 in
    the first line of the response.  A 401 is taken as proof that the
    request reached the API -- presumably the API rejects unauthenticated
    callers that way.  Any other status, or any failure to run curl,
    yields False.
    """
    try:
        res = subprocess.check_output(
            ['curl', '-X', 'GET', '-Isk', 'https://localhost/confluent-api/'])
    except Exception:
        return False
    status_line = res.split(b'\n')[0]
    return b'401' in status_line
2022-08-30 20:59:40 +00:00
2022-09-01 17:17:17 +00:00
def nics_missing_ipv6():
    """Return names of interfaces that have IPv4 but no IPv6 link-local address.

    Parses the output of `ip -br a` (one interface per line: name, state,
    then addresses).  The loopback interface is skipped.  An interface is
    reported when at least one of its addresses contains a '.' (IPv4) and
    none starts with 'fe80::'.

    Raises whatever socket creation raises when an AF_INET6 socket cannot
    be created, and whatever subprocess raises when `ip` cannot be run.
    """
    # check for ability to create AF_INET6, for kernel disabled ipv6;
    # the socket is only a probe, so release it immediately
    socket.socket(socket.AF_INET6).close()
    listing = subprocess.check_output(['ip', '-br', 'a'])
    missingnics = []
    for line in listing.split(b'\n'):
        comps = line.split()
        # Blank or truncated lines carry no address information
        if len(comps) < 2:
            continue
        iname = comps[0]
        if iname == b'lo':
            continue
        addrs = comps[2:]
        hasv4 = any(b'.' in addr for addr in addrs)
        hasv6 = any(addr.startswith(b'fe80::') for addr in addrs)
        if hasv4 and not hasv6:
            missingnics.append(iname.decode('utf8'))
    return missingnics
def insecure_boot_attempts():
    """Return True when a node was refused an insecure boot and still is not allowed one.

    Scans /var/log/confluent/events for lines containing
    'insecure mode is disabled' and takes the eighth whitespace-separated
    field of each such line as the node name.  For every node found,
    `nodeattrib <node> deployment.useinsecureprotocols` is run; the node
    counts as still blocked unless the third token of that output is
    'firmware'.

    Returns False when the event log does not exist (nothing has been
    logged, so nothing has been refused) or when no blocked node remains.
    """
    insecurenodes = set()
    try:
        with open('/var/log/confluent/events') as eventin:
            for line in eventin:
                if 'insecure mode is disabled' not in line:
                    continue
                fields = line.split()
                # Skip malformed entries that are too short to name a node
                if len(fields) > 7:
                    insecurenodes.add(fields[7])
    except FileNotFoundError:
        return False
    for node in insecurenodes:
        currattr = subprocess.check_output(
            ['nodeattrib', node, 'deployment.useinsecureprotocols']).split()
        if len(currattr) <= 2 or currattr[2] != b'firmware':
            return True
    return False
2022-08-30 20:59:40 +00:00
def uuid_matches():
    """Return True when the on-disk confluent_uuid equals the database value.

    Compares the whitespace-stripped content of
    /var/lib/confluent/public/site/confluent_uuid with the
    'confluent_uuid' global held by configmanager.
    """
    uuid_path = '/var/lib/confluent/public/site/confluent_uuid'
    with open(uuid_path, 'r') as handle:
        on_disk = handle.read().strip()
    return configmanager.get_global('confluent_uuid') == on_disk
2022-08-30 20:59:40 +00:00
2022-09-29 13:57:43 +00:00
def lookup_node(node):
    """Resolve *node* and return the getaddrinfo result, or None on failure.

    Meant to run inside a green thread: being killed (GreenletExit, which
    is not an Exception subclass and so is listed explicitly) is treated
    like any other failure and also produces None.
    """
    try:
        resolved = socket.getaddrinfo(node, 0)
    except (greenlet.GreenletExit, Exception):
        resolved = None
    return resolved
2022-08-30 20:59:40 +00:00
if __name__ == '__main__':
    # Script entry point: run the server-wide deployment checks, then the
    # optional per-node checks selected with -n/--node.
    ap = argparse.ArgumentParser(description='Run configuration checks for a system running confluent service')
    ap.add_argument('-n', '--node', help='A node name to run node specific checks against')
    # sys.argv is passed whole, so the program name always comes back as the
    # first unrecognized argument; more than one leftover is a real stray.
    args, extra = ap.parse_known_args(sys.argv)
    if len(extra) > 1:
        ap.print_help()
        sys.exit(1)

    # ---- server-wide OS deployment checks ----
    fprint('OS Deployment: ')
    if deployment_configured():
        print("Initialized")
        fprint('Confluent UUID: ')
        if uuid_matches():
            print('Consistent')
        else:
            emprint('Inconsistent between confluent database and /var/lib/confluent (Example resolution: confetty set /uuid resync=1)')
        fprint('Web Server: ')
        conn = webserver_listening()
        if conn:
            print('Running')
            fprint('Web Certificate: ')
            cert = certificates_missing_ips(conn)
            if cert:
                cert = ', '.join(cert)
                emprint('Addresses missing from certificate: {0} (Example resolution: osdeploy initialize -t)'.format(cert))
            else:
                print('OK')
            fprint('Checking web download: ')
            if web_download_works():
                print('OK')
            else:
                emprint('Failed to download /confluent-public/site/confluent_uuid')
            fprint('Checking web API access: ')
            if web_api_works():
                print('OK')
            else:
                emprint('Failed access, if selinux is enabled, `setsebool -P httpd_can_network_connect=1`, otherwise check web proxy configuration')
        else:
            emprint('Not Running (Example resolution: systemctl enable httpd --now)')
        fprint('TFTP Status: ')
        if tftp_works():
            print('OK')
        else:
            emprint('TFTP failure, PXE will not work, though media and HTTP boot can still work. (Example resolution: osdeploy initialize -p)')
        fprint('SSH root user public key: ')
        if glob.glob('/var/lib/confluent/public/site/ssh/*.rootpubkey'):
            print('OK')
        else:
            emprint('No trusted ssh keys for root user, passwordless SSH from managers to nodes may not work (Example resolution: osdeploy initialize -u)')
        # The SSH certificate authority check is only made for ssh versions
        # newer than 7.6
        if sshutil.sshver() > 7.6:
            fprint('Checking SSH Certificate authority: ')
            try:
                sshutil.prep_ssh_key('/etc/confluent/ssh/ca')
                print('OK')
            except Exception:
                emprint('Failed to load SSH authority key, deployed servers will not have host certificates for known_hosts and users may be unable to ssh between nodes without a password (Example resolution: osdeploy initialize -s)')
        fprint('Checking confluent SSH automation key: ')
        try:
            sshutil.prep_ssh_key('/etc/confluent/ssh/automation')
            print('OK')
        except subprocess.CalledProcessError:
            emprint('Failed to load confluent automation key, syncfiles and profile ansible plays will not work (Example resolution: osdeploy initialize -a)')
        fprint('Checking for blocked insecure boot: ')
        if insecure_boot_attempts():
            emprint('Some nodes are attempting network boot using PXE or HTTP boot, but the node is not configured to allow this (Example resolution: nodegroupattrib everything deployment.useinsecureprotocols=firmware)')
        else:
            print('OK')
        fprint('Checking IPv6 enablement: ')
        nics = nics_missing_ipv6()
        if nics:
            snics = ','.join(nics)
            emprint('Some interfaces ({0}) have ipv6 disabled, and may be unable to fully perform discovery or deployment (Example resolution: nmcli c m {1} ipv6.method link-local )'.format(snics, nics[0]))
        else:
            print('OK')
    else:
        print("Uninitialized, further OS deployment checks skipped, see `osdeploy initialize` to set up OS deployment feature")

    # ---- per-node checks ----
    if args.node:
        print("Performing node checks for '{}'".format(args.node))
        print("Checking node attributes in confluent...")
        sess = client.Command()
        allok = True
        uuidok = False
        macok = False
        for rsp in sess.read(f'/nodes/{args.node}/attributes/all'):
            if rsp.get('errorcode', None) == 404:
                emprint(f'There is no node named "{args.node}"')
                allok = False
                uuidok = True  # not really, but suppress the spurious error
            uuid = rsp.get('id.uuid', {}).get('value', None)
            if uuid:
                uuidok = True
            if 'deployment.useinsecureprotocols' in rsp:
                insec = rsp.get('deployment.useinsecureprotocols', {}).get('value', None)
                if insec != 'firmware':
                    emprint(f'{args.node} does not have deployment.useinsecureprotocols set to firmware. This is fine for USB boot. HTTP boot can work, but requires the deployment server CA certificates be provisioned to the UEFI first. PXE and HTTP boot without preprovisioned certificates will fail.')
            # Any attribute ending in .hwaddr with a value counts as a MAC
            for key in rsp:
                if key.endswith('.hwaddr'):
                    mac = rsp[key].get('value', None)
                    if mac:
                        macok = True
        if not uuidok and not macok:
            allok = False
            emprint(f'{args.node} does not have a uuid or mac address defined in id.uuid or net.*hwaddr, deployment will not work')
        if allok:
            print(f'No issues detected with attributes of {args.node}')
        fprint("Checking name resolution: ")
        # Resolve in a green thread so a hung resolver cannot stall the
        # script: poll for roughly five seconds, then give up.
        lk = eventlet.spawn(lookup_node, args.node)
        eventlet.sleep(0.1)
        tries = 5
        while not lk.dead and tries > 0:
            eventlet.sleep(1)
            tries -= 1
        deaddns = False
        # Decide on the lookup's actual state rather than on the counter:
        # a lookup that finished during the final wait has not timed out
        # and must not be reported as both "too long" and "OK".
        if not lk.dead:
            emprint('Name resolution takes too long, check state of /etc/resolv.conf and indicated nameservers, this can produce failure to netboot or failure to commence installation')
            lk.kill()
            deaddns = True
        result = lk.wait()
        if not result and not deaddns:
            emprint('Name resolution failed for node, it is normally a good idea for the node name to resolve to an IP')
        if result:
            print("OK")
    else:
        print("Skipping node checks, no node specified (Example: confluent_selfcheck -n n1)")
2022-10-03 14:33:21 +00:00
# Possible future checks:
# - arping the node and check for duplicate addresses (compare against nodeinventory?)
# - arping -D for the management server's own IP addresses to check for duplicates; also check for bleed-through from one NIC to another