2022-08-30 16:59:40 -04:00
#!/usr/bin/python3
# Self-check for frequent confluent problems, reporting what to check or repair.
# For a UUID mismatch, the least disruptive repair would be to set the uuid global to match the filesystem.
import os
import socket
import glob
import ssl
import sys
2022-08-31 17:30:41 -04:00
import confluent.sshutil as sshutil
2022-08-30 16:59:40 -04:00
import confluent.certutil as certutil
import confluent.config.configmanager as configmanager
2022-08-31 17:30:41 -04:00
import subprocess
import tempfile
import shutil
def fprint(txt):
    """Write ``txt`` to standard output with no trailing newline, then flush.

    Flushing makes the text visible immediately, before whatever is
    printed next completes the line.
    """
    out = sys.stdout
    out.write(txt)
    out.flush()
def tftp_works():
    """Check whether the iPXE binary can be fetched from localhost over TFTP.

    Runs ``curl`` with a 2 second connect timeout against
    ``tftp://localhost/confluent/x86_64/ipxe.efi`` and discards the payload.

    Returns:
        True if curl exits successfully; False on any exception, which
        includes a non-zero curl exit status or curl not being runnable.
    """
    fetch_cmd = [
        'curl', '--connect-timeout', '2', '-sf',
        'tftp://localhost/confluent/x86_64/ipxe.efi',
        '-o', '/dev/null',
    ]
    try:
        subprocess.check_call(fetch_cmd)
    except Exception:
        return False
    return True
2022-08-30 16:59:40 -04:00
def emprint(txt):
    """Print ``txt`` as an emphasized line.

    When standard output is a terminal the text is wrapped in the escape
    sequences ``ESC[1m`` and ``ESC[4m`` and followed by ``ESC[0m``;
    otherwise it is printed unchanged.
    """
    if not sys.stdout.isatty():
        print(txt)
        return
    print('\x1b[1m\x1b[4m' + txt + '\x1b[0m')
def deployment_configured():
    """Return True if the site ``confluent_uuid`` file exists on disk.

    The presence of ``/var/lib/confluent/public/site/confluent_uuid`` is
    the only thing examined.
    """
    uuid_marker = '/var/lib/confluent/public/site/confluent_uuid'
    return os.path.exists(uuid_marker)
def webserver_listening():
    """Try to open a TCP connection to port 443 on localhost.

    Returns:
        The connected socket on success, or False if the connection
        attempt raised any exception.
    """
    endpoint = ('localhost', 443)
    try:
        return socket.create_connection(endpoint)
    except Exception:
        return False
def _normalize_san_value(val):
    """Return a subjectAltName value in a form comparable to local IPs."""
    # Drop a single trailing newline if present (endswith is safe on '').
    if val.endswith('\n'):
        val = val[:-1]
    if ':' in val:
        # Must normalize ipv6 to a sane value. AI_NUMERICHOST keeps this a
        # pure parse: values that merely contain ':' without being an
        # address (e.g. a URI entry) are left as-is instead of triggering
        # name resolution or aborting the whole check.
        try:
            val = socket.getaddrinfo(
                val, 443, type=socket.SOCK_STREAM,
                flags=socket.AI_NUMERICHOST)[0][-1][0]
        except (socket.gaierror, UnicodeError):
            pass
    return val


def certificates_missing_ips(conn):
    """List local IP addresses absent from the web server's certificate.

    A TLS client handshake is performed over ``conn``, trusting every
    ``*.pem`` file under ``/var/lib/confluent/public/site/tls/``; hostname
    checking is disabled, so only the CA chain is verified. The
    certificate's subjectAltName values are then compared against
    ``certutil.get_ip_addresses()``.

    Args:
        conn: A connected socket to the web server. It is consumed: the
            TLS wrapper built on it is closed before returning.

    Returns:
        A list of the addresses from ``certutil.get_ip_addresses()`` that
        match no subjectAltName value; empty when all are covered.

    Raises:
        ssl.SSLError: presumably if the handshake or CA verification
            fails (not caught here).
    """
    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ctx.check_hostname = False
    for cacert in glob.glob('/var/lib/confluent/public/site/tls/*.pem'):
        ctx.load_verify_locations(cacert)
    # Close the TLS socket once the certificate has been read so the
    # connection is not leaked.
    with ctx.wrap_socket(conn) as sock:
        crt = sock.getpeercert()
    # Normalize each SAN once rather than once per local address.
    # NOTE(review): assumes certutil.get_ip_addresses() yields hashable
    # address strings - confirm.
    offered = {
        _normalize_san_value(val)
        for _field, val in crt.get('subjectAltName', [])
    }
    return [ip for ip in certutil.get_ip_addresses() if ip not in offered]
2022-09-01 13:17:17 -04:00
def web_download_works():
    """Check that the site UUID file can be downloaded over HTTPS.

    Runs ``curl -skf`` against
    ``https://localhost/confluent-public/site/confluent_uuid`` and
    discards the payload.

    Returns:
        True if curl exits successfully, False if running it raised any
        exception.
    """
    fetch_cmd = [
        'curl', '-skf',
        'https://localhost/confluent-public/site/confluent_uuid',
        '-o', '/dev/null',
    ]
    try:
        subprocess.check_call(fetch_cmd)
    except Exception:
        return False
    else:
        return True
2022-08-30 16:59:40 -04:00
2022-09-01 13:17:17 -04:00
def nics_missing_ipv6():
    """List interfaces that have an IPv4 address but no fe80:: address.

    Parses the output of ``ip -br a``. The ``lo`` interface is skipped.
    An interface is reported when at least one of its addresses contains
    a ``.`` and none starts with ``fe80::``.

    Returns:
        A list of interface names (str) as printed by ``ip``.

    Raises:
        OSError: if an AF_INET6 socket cannot be created (for example
            when the kernel has ipv6 disabled); this is not caught here.
        subprocess.CalledProcessError: if ``ip`` exits non-zero.
    """
    # Check for ability to create AF_INET6, for kernel disabled ipv6.
    # The socket is only a probe, so close it right away instead of
    # leaking it.
    socket.socket(socket.AF_INET6).close()
    ipaddrs = subprocess.check_output(['ip', '-br', 'a']).split(b'\n')
    missingnics = []
    for line in ipaddrs:
        comps = line.split()
        if not comps:
            continue
        # Only the name is needed; the state column (comps[1]) is ignored,
        # which also tolerates a line that lacks it.
        iname = comps[0]
        if iname == b'lo':
            continue
        addrs = comps[2:]
        hasv4 = any(b'.' in addr for addr in addrs)
        hasv6 = any(addr.startswith(b'fe80::') for addr in addrs)
        if hasv4 and not hasv6:
            missingnics.append(iname.decode('utf8'))
    return missingnics
def insecure_boot_attempts():
    """Report whether any node is still being refused insecure network boot.

    Scans ``/var/log/confluent/events`` for lines containing
    ``insecure mode is disabled`` and collects the eighth
    whitespace-separated field of each (presumably the node name - the
    log format is not visible here). For every collected node,
    ``nodeattrib <node> deployment.useinsecureprotocols`` is run.

    Returns:
        True as soon as one node's nodeattrib output does not have
        ``firmware`` as its third whitespace-separated token; False if
        every collected node does, or if no node was collected.

    Raises:
        OSError: if the events log cannot be opened.
        subprocess.CalledProcessError: if ``nodeattrib`` exits non-zero.
    """
    insecurenodes = set()
    with open('/var/log/confluent/events') as eventin:
        for line in eventin:
            if 'insecure mode is disabled' not in line:
                continue
            fields = line.split()
            # Skip truncated or unexpectedly short entries rather than
            # failing with IndexError.
            if len(fields) > 7:
                insecurenodes.add(fields[7])
    for node in insecurenodes:
        currattr = subprocess.check_output(
            ['nodeattrib', node, 'deployment.useinsecureprotocols']).split()
        if len(currattr) <= 2 or currattr[2] != b'firmware':
            return True
    return False
2022-08-30 16:59:40 -04:00
def uuid_matches():
    """Compare the on-disk confluent UUID with the configured global.

    Reads ``/var/lib/confluent/public/site/confluent_uuid`` (surrounding
    whitespace stripped) and compares it with the ``confluent_uuid``
    global obtained from ``configmanager``.

    Returns:
        True when the two values are equal, otherwise False.
    """
    uuid_path = '/var/lib/confluent/public/site/confluent_uuid'
    with open(uuid_path, 'r') as uuidf:
        fsuuid = uuidf.read().strip()
    return configmanager.get_global('confluent_uuid') == fsuuid
2022-08-30 16:59:40 -04:00
if __name__ == '__main__':
    # Run each check in sequence, printing a label followed by either a
    # plain OK-style result or an emphasized problem description.
    # NOTE(review): nesting was reconstructed from a flattened listing.
    # Two placements could not be determined from syntax alone and should
    # be confirmed: the web download check (placed under the certificate
    # "OK" branch) and the automation key check (placed outside the
    # sshver() > 7.6 branch).
    fprint('OS Deployment: ')
    if not deployment_configured():
        print("Uninitialized, further OS deployment checks skipped, see `osdeploy initialize` to set up OS deployment feature")
    else:
        print("Initialized")

        fprint('Confluent UUID: ')
        if uuid_matches():
            print('Consistent')
        else:
            # TODO: need a resolution to suggest
            emprint('Inconsistent between confluent database and /var/lib/confluent')

        fprint('Web Server: ')
        conn = webserver_listening()
        if not conn:
            emprint('Not Running (Example resolution: systemctl enable httpd --now)')
        else:
            print('Running')
            fprint('Web Certificate: ')
            missing = certificates_missing_ips(conn)
            if missing:
                emprint('Addresses missing from certificate: {0} (Example resolution: osdeploy initialize -t)'.format(', '.join(missing)))
            else:
                print('OK')
                fprint('Checking web download: ')
                if web_download_works():
                    print('OK')
                else:
                    emprint('Failed to download /confluent-public/site/confluent_uuid')

        fprint('TFTP Status: ')
        if tftp_works():
            print('OK')
        else:
            emprint('TFTP failure, PXE will not work, though media and HTTP boot can still work. (Example resolution: osdeploy initialize -p)')

        fprint('SSH root user public key: ')
        rootkeys = glob.glob('/var/lib/confluent/public/site/ssh/*.rootpubkey')
        if rootkeys:
            print('OK')
        else:
            emprint('No trusted ssh keys for root user, passwordless SSH from managers to nodes may not work (Example resolution: osdeploy initialize -u)')

        # The certificate authority check is only attempted when the ssh
        # version reported by sshutil is above 7.6.
        if sshutil.sshver() > 7.6:
            fprint('Checking SSH Certificate authority: ')
            try:
                sshutil.prep_ssh_key('/etc/confluent/ssh/ca')
                print('OK')
            except Exception:
                emprint('Failed to load SSH authority key, deployed servers will not have host certificates for known_hosts and users may be unable to ssh between nodes without a password (Example resolution: osdeploy initialize -s)')

        fprint('Checking confluent SSH automation key: ')
        try:
            sshutil.prep_ssh_key('/etc/confluent/ssh/automation')
            print('OK')
        except subprocess.CalledProcessError:
            emprint('Failed to load confluent automation key, syncfiles and profile ansible plays will not work (Example resolution: osdeploy initialize -a)')

        fprint('Checking for blocked insecure boot: ')
        if insecure_boot_attempts():
            emprint('Some nodes are attempting network boot using PXE or HTTP boot, but the node is not configured to allow this (Example resolution: nodegroupattrib everything deployment.useinsecureprotocols=firmware)')
        else:
            print('OK')

        fprint('Checking IPv6 enablement: ')
        nics = nics_missing_ipv6()
        if nics:
            # The example resolution names only the first affected interface.
            emprint('Some interfaces ({0}) have ipv6 disabled, and may be unable to fully perform discovery or deployment (Example resolution: nmcli c m {1} ipv6.method link-local )'.format(','.join(nics), nics[0]))
        else:
            print('OK')