#!/usr/bin/python3

# frequent problems to check/repair
# repair would be to set the uuid global to match filesystem for least disruptive change

import os
import socket
import glob
import ssl
import sys
import confluent.sshutil as sshutil
import confluent.certutil as certutil
import confluent.config.configmanager as configmanager
import subprocess
import tempfile
import shutil


def fprint(txt):
    """Write txt to stdout without a trailing newline and flush immediately.

    Used to show the label of a check before its (possibly slow) result.
    """
    sys.stdout.write(txt)
    sys.stdout.flush()


def tftp_works():
    """Return True if curl can fetch ipxe.efi from the local TFTP server.

    Any failure (curl missing, non-zero exit, 2 second connect timeout)
    is reported as False.
    """
    try:
        subprocess.check_call(['curl', '--connect-timeout', '2', '-sf', 'tftp://localhost/confluent/x86_64/ipxe.efi', '-o', '/dev/null'])
        return True
    except Exception:
        return False


def emprint(txt):
    """Print txt as a highlighted line to call attention to a problem.

    On a terminal the text is wrapped in ANSI bold + underline escapes;
    when stdout is redirected it is printed unmodified.
    """
    if sys.stdout.isatty():
        print('\x1b[1m\x1b[4m' + txt + '\x1b[0m')
    else:
        print(txt)


def deployment_configured():
    """Return True if the site confluent_uuid file exists on disk."""
    return os.path.exists('/var/lib/confluent/public/site/confluent_uuid')


def webserver_listening():
    """Try to open a TCP connection to localhost:443.

    Returns the connected socket on success, or False if the connection
    could not be established.
    """
    try:
        conn = socket.create_connection(('localhost', 443))
        return conn
    except Exception:
        return False


def _normalize_san(val):
    """Return a subjectAltName value in a form comparable to a local address."""
    # Some values arrive with a trailing newline; endswith() is also safe
    # for an empty value, unlike indexing the last character.
    if val.endswith('\n'):
        val = val[:-1]
    if ':' in val:
        # must normalize ipv6 to a sane value
        try:
            val = socket.getaddrinfo(val, 443, type=socket.SOCK_STREAM)[0][-1][0]
        except socket.gaierror:
            # Contains ':' but is not a resolvable address (for instance a
            # URI entry); leave it as-is, it simply will not match an IP.
            pass
    return val


def certificates_missing_ips(conn):
    """Return the local IP addresses absent from the web server certificate.

    conn is a connected socket to the web server (as returned by
    webserver_listening()).  It is wrapped in TLS, verified against every
    CA certificate found in /var/lib/confluent/public/site/tls/*.pem, and
    closed before returning.

    Returns a list of the addresses reported by
    certutil.get_ip_addresses() that do not appear among the certificate's
    subjectAltName values; an empty list means nothing is missing.

    Exceptions from the TLS handshake (for example a certificate that does
    not verify against the loaded CAs) are not caught here.
    """
    # check if the tls can verify by the right CAs, then further
    # check if all ip addresses are in the certificate offered
    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ctx.check_hostname = False
    for cacert in glob.glob('/var/lib/confluent/public/site/tls/*.pem'):
        ctx.load_verify_locations(cacert)
    # The context manager closes the TLS socket once the peer certificate
    # has been read, instead of leaving it for garbage collection.
    with ctx.wrap_socket(conn) as sock:
        crt = sock.getpeercert()
    # Normalize every SAN value once, rather than once per local address.
    sanvalues = [_normalize_san(val)
                 for _field, val in crt.get('subjectAltName', [])]
    return [ip for ip in certutil.get_ip_addresses() if ip not in sanvalues]


def web_download_works():
    """Return True if curl can download the site confluent_uuid over HTTPS.

    Certificate validation is skipped (curl -k); any failure yields False.
    """
    try:
        subprocess.check_call(['curl', '-skf', 'https://localhost/confluent-public/site/confluent_uuid', '-o', '/dev/null'])
    except Exception:
        return False
    return True


def nics_missing_ipv6():
    """Return names of interfaces that have IPv4 but no fe80:: address.

    Parses the output of `ip -br a`.  The loopback interface is ignored.
    An interface is reported when at least one of its addresses contains
    a '.' (IPv4) and none starts with 'fe80::'.

    Raises whatever socket.socket() raises if an AF_INET6 socket cannot
    be created, and whatever subprocess.check_output() raises if `ip`
    cannot be run.
    """
    # check for ability to create AF_INET6, for kernel disabled ipv6
    # (the probe socket is closed right away rather than leaked)
    socket.socket(socket.AF_INET6).close()
    ipaddrs = subprocess.check_output(['ip', '-br', 'a']).split(b'\n')
    missingnics = []
    for line in ipaddrs:
        comps = line.split()
        if not comps:
            continue
        # Only the name is needed; taking comps[0] also tolerates a line
        # that has no state column.
        iname = comps[0]
        if iname == b'lo':
            continue
        addrs = comps[2:]
        hasv4 = any(b'.' in addr for addr in addrs)
        hasv6 = any(addr.startswith(b'fe80::') for addr in addrs)
        if hasv4 and not hasv6:
            missingnics.append(iname.decode('utf8'))
    return missingnics


def insecure_boot_attempts():
    """Return True if a node was refused an insecure boot and still is not allowed one.

    Scans /var/log/confluent/events for lines containing
    'insecure mode is disabled' and takes the eighth whitespace-separated
    field of each as a node name.  For every such node, `nodeattrib` is
    queried for deployment.useinsecureprotocols; if the third token of the
    output is not 'firmware' for any node, True is returned.

    Returns False when the event log does not exist or lists no such nodes.
    """
    insecurenodes = set()
    try:
        with open('/var/log/confluent/events') as eventin:
            for line in eventin:
                if 'insecure mode is disabled' not in line:
                    continue
                fields = line.split()
                # Skip truncated lines that are too short to carry the
                # node name field.
                if len(fields) > 7:
                    insecurenodes.add(fields[7])
    except FileNotFoundError:
        # No event log has been written, so there is no record of any
        # refused boot attempt.
        return False
    for node in insecurenodes:
        currattr = subprocess.check_output(
            ['nodeattrib', node, 'deployment.useinsecureprotocols']).split()
        if len(currattr) <= 2 or currattr[2] != b'firmware':
            return True
    return False


def uuid_matches():
    """Return True if the on-disk confluent_uuid equals the configmanager global."""
    with open('/var/lib/confluent/public/site/confluent_uuid', 'r') as uuidf:
        fsuuid = uuidf.read().strip()
    dbuuid = configmanager.get_global('confluent_uuid')
    return dbuuid == fsuuid


def main():
    """Run every self-check in order, printing one status line per check."""
    fprint('OS Deployment: ')
    if not deployment_configured():
        print("Uninitialized, further OS deployment checks skipped, see `osdeploy initialize` to set up OS deployment feature")
        return
    print("Initialized")

    fprint('Confluent UUID: ')
    if uuid_matches():
        print('Consistent')
    else:
        #TODO: need a resolution to suggest
        emprint('Inconsistent between confluent database and /var/lib/confluent')

    fprint('Web Server: ')
    conn = webserver_listening()
    if conn:
        print('Running')
        fprint('Web Certificate: ')
        missing = certificates_missing_ips(conn)
        if missing:
            emprint('Addresses missing from certificate: {0} (Example resolution: osdeploy initialize -t)'.format(', '.join(missing)))
        else:
            print('OK')
        fprint('Checking web download: ')
        if web_download_works():
            print('OK')
        else:
            emprint('Failed to download /confluent-public/site/confluent_uuid')
    else:
        emprint('Not Running (Example resolution: systemctl enable httpd --now)')

    fprint('TFTP Status: ')
    if tftp_works():
        print('OK')
    else:
        emprint('TFTP failure, PXE will not work, though media and HTTP boot can still work. (Example resolution: osdeploy initialize -p)')

    fprint('SSH root user public key: ')
    if glob.glob('/var/lib/confluent/public/site/ssh/*.rootpubkey'):
        print('OK')
    else:
        emprint('No trusted ssh keys for root user, passwordless SSH from managers to nodes may not work (Example resolution: osdeploy initialize -u)')

    # The SSH key checks are only attempted with an ssh version above 7.6.
    if sshutil.sshver() > 7.6:
        fprint('Checking SSH Certificate authority: ')
        try:
            sshutil.prep_ssh_key('/etc/confluent/ssh/ca')
            print('OK')
        except Exception:
            emprint('Failed to load SSH authority key, deployed servers will not have host certificates for known_hosts and users may be unable to ssh between nodes without a password (Example resolution: osdeploy initialize -s)')
        fprint('Checking confluent SSH automation key: ')
        try:
            sshutil.prep_ssh_key('/etc/confluent/ssh/automation')
            print('OK')
        except subprocess.CalledProcessError:
            emprint('Failed to load confluent automation key, syncfiles and profile ansible plays will not work (Example resolution: osdeploy initialize -a)')

    fprint('Checking for blocked insecure boot: ')
    if insecure_boot_attempts():
        emprint('Some nodes are attempting network boot using PXE or HTTP boot, but the node is not configured to allow this (Example resolution: nodegroupattrib everything deployment.useinsecureprotocols=firmware)')
    else:
        print('OK')

    fprint('Checking IPv6 enablement: ')
    nics = nics_missing_ipv6()
    if nics:
        snics = ','.join(nics)
        emprint('Some interfaces ({0}) have ipv6 disabled, and may be unable to fully perform discovery or deployment (Example resolution: nmcli c m {1} ipv6.method link-local )'.format(snics, nics[0]))
    else:
        print('OK')


if __name__ == '__main__':
    main()