#!/usr/bin/python3

# frequent problems to check/repair
# repair would be to set the uuid global to match filesystem for least disruptive change
#
#

import glob
import os
import shutil
import socket
import ssl
import subprocess
import sys
import tempfile

import confluent.certutil as certutil
import confluent.config.configmanager as configmanager
import confluent.sshutil as sshutil


def fprint(txt):
    """Write txt to stdout without a trailing newline and flush it immediately."""
    sys.stdout.write(txt)
    sys.stdout.flush()


def tftp_works():
    """Return True if curl can fetch confluent/x86_64/ipxe.efi from the local TFTP server.

    Any failure (curl missing, timeout, non-zero exit) is reported as False.
    """
    try:
        subprocess.check_call(['curl', '--connect-timeout', '2', '-sf', 'tftp://localhost/confluent/x86_64/ipxe.efi', '-o', '/dev/null'])
        return True
    except Exception:
        return False


def emprint(txt):
    """Print txt, emphasized (bold + underline escape codes) when stdout is a tty."""
    if sys.stdout.isatty():
        print('\x1b[1m\x1b[4m' + txt + '\x1b[0m')
    else:
        print(txt)


def deployment_configured():
    """Return True if the site confluent_uuid file exists under /var/lib/confluent."""
    return os.path.exists('/var/lib/confluent/public/site/confluent_uuid')


def webserver_listening():
    """Return a connected socket to localhost:443, or False if connecting fails."""
    try:
        conn = socket.create_connection(('localhost', 443))
        return conn
    except Exception:
        return False


def _normalize_san_value(val):
    """Return a subjectAltName value in a form comparable to a local IP string."""
    # Some values arrive with a trailing newline; drop exactly one.
    if val.endswith('\n'):
        val = val[:-1]
    if ':' in val:
        # must normalize ipv6 to a sane value
        try:
            val = socket.getaddrinfo(val, 443, type=socket.SOCK_STREAM)[0][-1][0]
        except (OSError, UnicodeError):
            # Not something getaddrinfo can resolve (e.g. a URI entry); keep
            # the raw text so it simply never matches an address.
            pass
    return val


def certificates_missing_ips(conn):
    """Return the local IP addresses that are absent from the web certificate.

    conn is a connected socket to the web server.  It is wrapped in TLS,
    verified against every CA in /var/lib/confluent/public/site/tls/*.pem,
    and closed before this function returns.

    Returns a list of the addresses reported by certutil.get_ip_addresses()
    that do not appear among the certificate's subjectAltName values.
    Errors from the TLS handshake (for example a certificate that does not
    verify) propagate to the caller.
    """
    # check if the tls can verify by the right CAs, then further
    # check if all ip addresses are in the certificate offered
    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ctx.check_hostname = False
    for cacert in glob.glob('/var/lib/confluent/public/site/tls/*.pem'):
        ctx.load_verify_locations(cacert)
    # The context manager closes the TLS socket once the certificate is read,
    # so the connection is not leaked.
    with ctx.wrap_socket(conn) as sock:
        crt = sock.getpeercert()
    # Normalize every SAN once, instead of once per local address.
    offered = {
        _normalize_san_value(val)
        for _field, val in crt.get('subjectAltName', [])
    }
    return [ip for ip in certutil.get_ip_addresses() if ip not in offered]


def web_download_works():
    """Return True if curl can download the site confluent_uuid over HTTPS from localhost.

    curl runs with -k, so the server certificate is not verified here.
    """
    try:
        subprocess.check_call(['curl', '-skf', 'https://localhost/confluent-public/site/confluent_uuid'])
    except Exception:
        return False
    return True


def nics_missing_ipv6():
    """Return names of interfaces that have an IPv4 address but no fe80:: address.

    Parses the output of `ip -br a`; the loopback interface `lo` is skipped.
    An error creating the AF_INET6 probe socket, or from running `ip`,
    propagates to the caller.
    """
    # check for ability to create AF_INET6, for kernel disabled ipv6
    # (the probe socket is closed right away rather than left dangling)
    socket.socket(socket.AF_INET6).close()
    missingnics = []
    for line in subprocess.check_output(['ip', '-br', 'a']).split(b'\n'):
        comps = line.split()
        if not comps:
            continue
        iname = comps[0]
        if iname == b'lo':
            continue
        # Fields after name and state are the addresses.
        addrs = comps[2:]
        hasv4 = any(b'.' in addr for addr in addrs)
        hasv6 = any(addr.startswith(b'fe80::') for addr in addrs)
        if hasv4 and not hasv6:
            missingnics.append(iname.decode('utf8'))
    return missingnics


def insecure_boot_attempts():
    """Return True if a node was refused an insecure boot and is still not allowed one.

    Scans /var/log/confluent/events for lines containing
    'insecure mode is disabled', then asks `nodeattrib` for each such node's
    deployment.useinsecureprotocols value.  Returns True as soon as one node
    does not report `firmware`, otherwise False.  A missing events log is
    treated as "no attempts recorded".
    """
    insecurenodes = set()
    try:
        with open('/var/log/confluent/events') as eventin:
            for line in eventin:
                if 'insecure mode is disabled' not in line:
                    continue
                fields = line.split()
                # The eighth field is taken as the node name (presumably the
                # events log layout -- verify); shorter lines are skipped.
                if len(fields) > 7:
                    insecurenodes.add(fields[7])
    except FileNotFoundError:
        # No events have been logged yet, so nothing was blocked.
        return False
    for node in insecurenodes:
        currattr = subprocess.check_output(['nodeattrib', node, 'deployment.useinsecureprotocols']).split()
        if not (len(currattr) > 2 and currattr[2] == b'firmware'):
            return True
    return False


# check for http access to confluent-public, use site.cpio as file to check?
# check for deployment.useinsecureprotocols=firmware


def uuid_matches():
    """Return True if the on-disk site confluent_uuid equals the configmanager global."""
    with open('/var/lib/confluent/public/site/confluent_uuid', 'r') as uuidfile:
        on_disk = uuidfile.read().strip()
    in_database = configmanager.get_global('confluent_uuid')
    return in_database == on_disk


def _main():
    """Run each OS deployment check in turn and print a one-line status for it."""
    fprint('OS Deployment: ')
    if not deployment_configured():
        print("Uninitialized, further OS deployment checks skipped, see `osdeploy initialize` to set up OS deployment feature")
        return
    print("Initialized")

    fprint('Confluent UUID: ')
    if uuid_matches():
        print('Consistent')
    else:
        #TODO: need a resolution to suggest
        emprint('Inconsistent between confluent database and /var/lib/confluent')

    fprint('Web Server: ')
    conn = webserver_listening()
    if not conn:
        emprint('Not Running (Example resolution: systemctl enable httpd --now)')
    else:
        print('Running')
        fprint('Web Certificate: ')
        missing = certificates_missing_ips(conn)
        if missing:
            emprint('Addresses missing from certificate: {0} (Example resolution: osdeploy initialize -t)'.format(', '.join(missing)))
        else:
            print('OK')

    fprint('TFTP Status: ')
    if tftp_works():
        print('OK')
    else:
        emprint('TFTP failure, PXE will not work, though media and HTTP boot can still work. (Example resolution: osdeploy initialize -p)')

    fprint('SSH root user public key: ')
    if glob.glob('/var/lib/confluent/public/site/ssh/*.rootpubkey'):
        print('OK')
    else:
        emprint('No trusted ssh keys for root user, passwordless SSH from managers to nodes may not work (Example resolution: osdeploy initialize -u)')

    # The certificate authority check only runs when sshver() reports > 7.6.
    if sshutil.sshver() > 7.6:
        fprint('Checking SSH Certificate authority: ')
        try:
            sshutil.prep_ssh_key('/etc/confluent/ssh/ca')
            print('OK')
        except Exception:
            emprint('Failed to load SSH authority key, deployed servers will not have host certificates for known_hosts and users may be unable to ssh between nodes without a password (Example resolution: osdeploy initialize -s)')

    fprint('Checking confluent SSH automation key: ')
    try:
        sshutil.prep_ssh_key('/etc/confluent/ssh/automation')
        print('OK')
    except subprocess.CalledProcessError:
        emprint('Failed to load confluent automation key, syncfiles and profile ansible plays will not work (Example resolution: osdeploy initialize -a)')

    fprint('Checking for blocked insecure boot: ')
    if insecure_boot_attempts():
        emprint('Some nodes are attempting network boot using PXE or HTTP boot, but the node is not configured to allow this (Example resolution: nodegroupattrib everything deployment.useinsecureprotocols=firmware)')
    else:
        print('OK')

    fprint('Checking IPv6 enablement: ')
    nics = nics_missing_ipv6()
    if nics:
        emprint('Some interfaces ({0}) have ipv6 disabled, and may be unable to fully perform discovery or deployment (Example resolution: nmcli c m {1} ipv6.method link-local )'.format(','.join(nics), nics[0]))
    else:
        print('OK')


if __name__ == '__main__':
    _main()