import os
import base64
import binascii
import json
import requests
import traceback

from subprocess import (
    check_call,
    check_output,
    CalledProcessError
)

from charms.reactive import (
    hook,
    when,
    when_not,
    set_state,
    is_state,
    remove_state,
    endpoint_from_flag,
    register_trigger
)

from charms.layer import containerd, status

from charms.layer.container_runtime_common import (
    ca_crt_path,
    server_crt_path,
    server_key_path,
    check_for_juju_https_proxy
)

from charmhelpers.core import (
    host,
    unitdata
)

from charmhelpers.core.templating import render
from charmhelpers.core.hookenv import (
    atexit,
    config,
    log,
    application_version_set
)

from charmhelpers.core.kernel import modprobe

from charmhelpers.fetch import (
    apt_install,
    apt_update,
    apt_purge,
    apt_hold,
    apt_autoremove,
    apt_unhold,
    import_key
)


# Unit-local key/value store used to persist state between hooks
# ('config-cache', 'registry' and 'untrusted' keys are used below).
DB = unitdata.kv()

CONTAINERD_PACKAGE = 'containerd'

# Options of the ``gpu_driver`` charm config that check_for_gpu() accepts.
VALID_GPU_DRIVERS = ('auto', 'none', 'nvidia')

# TLS material that a custom registry entry may carry as ``<opt>_file``.
TLS_FILE_OPTIONS = ('ca', 'key', 'cert')

# Proxy settings compared by _juju_proxy_changed().
PROXY_KEYS = ('http_proxy', 'https_proxy', 'no_proxy')

# Seconds to wait for an Nvidia apt key download before giving up, so that a
# stalled server cannot hang the hook forever.
NVIDIA_KEY_FETCH_TIMEOUT = 60

# Any change to the Nvidia apt configuration clears the "ready" flag so that
# configure_nvidia() runs again.
register_trigger(
    when='config.changed.nvidia_apt_key_urls',
    clear_flag='containerd.nvidia.ready'
)
register_trigger(
    when='config.changed.nvidia_apt_sources',
    clear_flag='containerd.nvidia.ready'
)
register_trigger(
    when='config.changed.nvidia_apt_packages',
    clear_flag='containerd.nvidia.ready'
)


def _check_containerd():
    """
    Check that containerd is running.

    `ctr version` calls both client and server
    side, so is a reasonable indication that
    everything's been set up correctly.

    :return: Bytes output of `ctr version`, or None when the command is
             missing or exits with an error
    """
    try:
        return check_output(['ctr', 'version'])
    except (FileNotFoundError, CalledProcessError):
        return None


def _juju_proxy_changed():
    """
    Check to see if the Juju model HTTP(S)
    proxy settings have changed.

    These aren't propagated to the charm so we'll
    need to do it here.

    :return: Boolean
    """
    cached = DB.get('config-cache', None)
    if not cached:
        return True  # First pass.

    new = check_for_juju_https_proxy(config)

    # .get() is used so that a cache entry lacking one of the keys counts as
    # "changed" instead of raising KeyError.
    return any(cached.get(key) != new.get(key) for key in PROXY_KEYS)


@atexit
def charm_status():
    """
    Set the charm's status after each hook is run.

    Also sets the `containerd.ready` flag when `ctr version` succeeds.

    :return: None
    """
    if is_state('upgrade.series.in-progress'):
        status.blocked('Series upgrade in progress')
    elif is_state('containerd.nvidia.invalid-option'):
        status.blocked(
            '{} is an invalid option for gpu_driver'.format(
                config().get('gpu_driver')
            )
        )
    elif _check_containerd():
        status.active('Container runtime available')
        set_state('containerd.ready')
    else:
        status.blocked('Container runtime not available')


def strip_url(url):
    """Strip the URL of protocol, slashes etc., and keep host:port.

    Examples:
        url: http://10.10.10.10:8000 --> return: 10.10.10.10:8000
        url: https://myregistry.io:8000/ --> return: myregistry.io:8000
        url: myregistry.io:8000 --> return: myregistry.io:8000
    """
    return url.rstrip('/').split(sep='://', maxsplit=1)[-1]


def update_custom_tls_config(config_directory, registries, old_registries):
    """
    Read registries config and remove old/write new tls files from/to disk.

    Each registry dict that carries a `<opt>_file` value gets its `<opt>` key
    set to the on-disk path `<config_directory>/<host:port>.<opt>`; the
    dicts are modified in place.

    :param str config_directory: containerd config directory
    :param List registries: juju config for custom registries
    :param List old_registries: old juju config for custom registries
    :return: None
    """
    # Remove tls files of old registries; so not to leave unneeded, stale
    # files.
    for registry in old_registries:
        for opt in TLS_FILE_OPTIONS:
            if not registry.get('%s_file' % opt):
                continue
            registry[opt] = os.path.join(
                config_directory, "%s.%s" % (strip_url(registry['url']), opt)
            )
            if os.path.isfile(registry[opt]):
                os.remove(registry[opt])

    # Write tls files of new registries.
    for registry in registries:
        for opt in TLS_FILE_OPTIONS:
            file_b64 = registry.get('%s_file' % opt)
            if not file_b64:
                continue
            try:
                file_contents = base64.b64decode(file_b64)
            except (binascii.Error, TypeError):
                # Bad data only skips this one file; the registry entry is
                # kept, just without a path for this option.
                log(traceback.format_exc())
                log("{}:{} didn't look like base64 data... skipping"
                    .format(registry['url'], opt))
                continue
            registry[opt] = os.path.join(
                config_directory, "%s.%s" % (strip_url(registry['url']), opt)
            )
            with open(registry[opt], 'wb') as f:
                f.write(file_contents)


def populate_host_for_custom_registries(custom_registries):
    """Populate host field from url if missing for custom registries.

    Examples:
        url: http://10.10.10.10:8000 --> host: 10.10.10.10:8000
        url: https://myregistry.io:8000/ --> host: myregistry.io:8000
        url: myregistry.io:8000 --> host: myregistry.io:8000
    """
    # only do minimal changes to custom_registries when conditions apply
    # otherwise return it directly as it is
    if isinstance(custom_registries, list):
        for registry in custom_registries:
            if registry.get('host'):
                continue
            url = registry.get('url')
            if url:
                registry['host'] = strip_url(url)

    return custom_registries


def insert_docker_io_to_custom_registries(custom_registries):
    """
    Ensure the default docker.io registry exists.

    Also gives a way for configuration to override the url for it.
    If a docker.io host entry doesn't exist, we'll add one.
    """
    if isinstance(custom_registries, list):
        if not any(d.get('host') == 'docker.io' for d in custom_registries):
            custom_registries.insert(0, {
                "host": "docker.io",
                "url": "https://registry-1.docker.io"
            })
    return custom_registries


def merge_custom_registries(config_directory, custom_registries,
                            old_custom_registries):
    """
    Merge custom registries and Docker registries from relation.

    :param str config_directory: containerd config directory
    :param str custom_registries: juju config for custom registries
    :param str old_custom_registries: old juju config for custom registries
    :return: List Dictionary merged registries
    """
    registries = []
    registries += json.loads(custom_registries)
    # json string already converted to python list here
    registries = populate_host_for_custom_registries(registries)
    registries = insert_docker_io_to_custom_registries(registries)

    old_registries = []
    if old_custom_registries:
        old_registries += json.loads(old_custom_registries)
    update_custom_tls_config(config_directory, registries, old_registries)

    # The registry stored by configure_registry() is appended last.
    docker_registry = DB.get('registry', None)
    if docker_registry:
        registries.append(docker_registry)

    return registries


@hook('update-status')
def update_status():
    """
    Triggered when update-status is called.

    :return: None
    """
    if _juju_proxy_changed():
        set_state('containerd.juju-proxy.changed')


@hook('upgrade-charm')
def upgrade_charm():
    """
    Triggered when upgrade-charm is called.

    :return: None
    """
    # Prevent containerd apt pkg from being implicitly updated.
    apt_hold(CONTAINERD_PACKAGE)

    # Re-render config in case the template has changed in the new charm.
    config_changed()

    # Clean up old nvidia sources.list.d files
    old_source_files = [
        '/etc/apt/sources.list.d/nvidia-container-runtime.list',
        '/etc/apt/sources.list.d/cuda.list'
    ]
    for source_file in old_source_files:
        if os.path.exists(source_file):
            os.remove(source_file)
            # Clearing the flag lets configure_nvidia() run again.
            remove_state('containerd.nvidia.ready')


@when_not('containerd.br_netfilter.enabled')
def enable_br_netfilter_module():
    """
    Enable br_netfilter to work around
    https://github.com/kubernetes/kubernetes/issues/21613.

    :return: None
    """
    try:
        modprobe('br_netfilter', persist=True)
    except Exception:
        log(traceback.format_exc())
        if host.is_container():
            log('LXD detected, ignoring failure to load br_netfilter')
        else:
            # Leave the flag unset so this handler is tried again.
            log('LXD not detected, will retry loading br_netfilter')
            return
    set_state('containerd.br_netfilter.enabled')


@when_not('containerd.ready',
          'containerd.installed',
          'endpoint.containerd.departed')
def install_containerd():
    """
    Install containerd and then create initial configuration.

    :return: None
    """
    status.maintenance('Installing containerd via apt')
    apt_update()
    apt_install(CONTAINERD_PACKAGE, fatal=True)
    apt_hold(CONTAINERD_PACKAGE)

    set_state('containerd.installed')
    config_changed()


@when('containerd.installed')
@when_not('containerd.version-published')
def publish_version_to_juju():
    """
    Publish the containerd version to Juju.

    :return: None
    """
    version_string = _check_containerd()
    if not version_string:
        return

    # The version is taken from a fixed position (7th whitespace-separated
    # token) of the `ctr version` output, with any '-suffix' dropped.
    # NOTE(review): presumably this token is the server version; confirm
    # against the `ctr version` output of the packaged containerd.
    version = version_string.split()[6].split(b'-')[0].decode()

    application_version_set(version)
    set_state('containerd.version-published')


@when_not('containerd.nvidia.checked')
@when_not('endpoint.containerd.departed')
def check_for_gpu():
    """
    Check if an Nvidia GPU exists.

    The `gpu_driver` config decides the outcome:

    * `nvidia`: GPU support is enabled without probing the hardware.
    * `auto`: GPU support is enabled when `lspci -nnk` mentions nvidia.
    * `none`: GPU support is disabled.

    Any other value sets `containerd.nvidia.invalid-option` and returns
    without marking the check as done.

    :return: None
    """
    driver_config = config().get('gpu_driver')
    if driver_config not in VALID_GPU_DRIVERS:
        set_state('containerd.nvidia.invalid-option')
        return

    if driver_config == 'nvidia':
        gpu_wanted = True
    elif driver_config == 'auto':
        # lspci is only consulted when its answer matters, so a missing or
        # failing lspci cannot break the 'none' and 'nvidia' settings.
        out = check_output(['lspci', '-nnk']).rstrip().decode('utf-8').lower()
        gpu_wanted = 'nvidia' in out
    else:
        gpu_wanted = False

    if gpu_wanted:
        set_state('containerd.nvidia.available')
    else:
        # Clear flags left over from an earlier setting (e.g. a switch
        # from 'nvidia' to 'auto' on a machine without a GPU).
        remove_state('containerd.nvidia.available')
        remove_state('containerd.nvidia.ready')

    remove_state('containerd.nvidia.invalid-option')
    set_state('containerd.nvidia.checked')


@when('containerd.nvidia.available')
@when_not('containerd.nvidia.ready', 'endpoint.containerd.departed')
def configure_nvidia():
    """
    Based on charm config, install and configure Nvidia drivers.

    The `{id}`, `{version_id}` and `{version_id_no_dot}` placeholders in the
    `nvidia_apt_key_urls` and `nvidia_apt_sources` config values are filled
    in from the host's LSB release information.

    :raises requests.RequestException: when a key cannot be downloaded
    :return: None
    """
    status.maintenance('Installing Nvidia drivers.')

    dist = host.lsb_release()
    os_release_version_id = dist['DISTRIB_RELEASE']
    release = {
        'id': dist['DISTRIB_ID'].lower(),
        'version_id': os_release_version_id,
        'version_id_no_dot': os_release_version_id.replace('.', '')
    }

    proxies = {
        "http": config('http_proxy'),
        "https": config('https_proxy')
    }

    for key_url in config('nvidia_apt_key_urls').split():
        response = requests.get(
            key_url.format(**release),
            proxies=proxies,
            timeout=NVIDIA_KEY_FETCH_TIMEOUT
        )
        # Fail loudly on HTTP errors rather than handing an error page to
        # import_key() as if it were a key.
        response.raise_for_status()
        import_key(response.text)

    formatted_sources = [
        source.format(**release)
        for source in config('nvidia_apt_sources').splitlines()
    ]
    with open('/etc/apt/sources.list.d/nvidia.list', 'w') as f:
        f.write('\n'.join(formatted_sources))

    apt_update()
    packages = config('nvidia_apt_packages').split()
    apt_install(packages, fatal=True)

    set_state('containerd.nvidia.ready')
    config_changed()


@when('endpoint.containerd.departed')
def purge_containerd():
    """
    Purge Containerd from the cluster.

    :return: None
    """
    status.maintenance('Removing containerd from principal')

    host.service_stop('containerd.service')
    # The package was held at install time; release it before purging.
    apt_unhold(CONTAINERD_PACKAGE)
    apt_purge(CONTAINERD_PACKAGE, fatal=True)

    if is_state('containerd.nvidia.ready'):
        nvidia_packages = config('nvidia_apt_packages').split()
        apt_purge(nvidia_packages, fatal=True)

    sources = [
        '/etc/apt/sources.list.d/nvidia.list'
    ]
    for f in sources:
        if os.path.isfile(f):
            os.remove(f)

    apt_autoremove(purge=True, fatal=True)

    remove_state('containerd.ready')
    remove_state('containerd.installed')
    remove_state('containerd.nvidia.ready')
    remove_state('containerd.nvidia.checked')
    remove_state('containerd.nvidia.available')
    remove_state('containerd.version-published')


@when('config.changed.gpu_driver')
def gpu_config_changed():
    """
    Remove the GPU checked state when the config is changed.

    :return: None
    """
    remove_state('containerd.nvidia.checked')


@when('config.changed')
@when_not('endpoint.containerd.departed')
def config_changed():
    """
    Render the config template.

    Writes /etc/containerd/config.toml from the template selected by the
    `config_version` option and requests a containerd restart.

    :return: None
    """
    if _juju_proxy_changed():
        set_state('containerd.juju-proxy.changed')

    # Create "dumb" context based on Config to avoid triggering config.changed
    context = dict(config())
    if context['config_version'] == "v2":
        template_config = "config_v2.toml"
    else:
        template_config = "config.toml"

    config_file = 'config.toml'
    config_directory = '/etc/containerd'

    # A sandbox image supplied over the relation takes precedence over the
    # layer's own default.
    endpoint = endpoint_from_flag('endpoint.containerd.available')
    sandbox_image = endpoint.get_sandbox_image() if endpoint else None
    if sandbox_image:
        log('Setting sandbox_image to: {}'.format(sandbox_image))
        context['sandbox_image'] = sandbox_image
    else:
        context['sandbox_image'] = containerd.get_sandbox_image()

    os.makedirs(config_directory, exist_ok=True)

    # If custom_registries changed, make sure to remove old tls files.
    if config().changed('custom_registries'):
        old_custom_registries = config().previous('custom_registries')
    else:
        old_custom_registries = None

    context['custom_registries'] = \
        merge_custom_registries(config_directory,
                                context['custom_registries'],
                                old_custom_registries)

    untrusted = DB.get('untrusted')
    if untrusted:
        context['untrusted'] = True
        context['untrusted_name'] = untrusted['name']
        context['untrusted_path'] = untrusted['binary_path']
        context['untrusted_binary'] = os.path.basename(
            untrusted['binary_path'])
    else:
        context['untrusted'] = False

    # Resolve the 'auto' runtime to a concrete one.
    if context.get('runtime') == 'auto':
        if is_state('containerd.nvidia.available'):
            context['runtime'] = 'nvidia-container-runtime'
        else:
            context['runtime'] = 'runc'

    render(
        template_config,
        os.path.join(config_directory, config_file),
        context
    )

    set_state('containerd.restart')


@when('containerd.installed')
@when('containerd.juju-proxy.changed')
@when_not('endpoint.containerd.departed')
def proxy_changed():
    """
    Apply new proxy settings.

    Writes (or removes) the systemd drop-in carrying the proxy settings,
    caches the applied settings and clears the
    `containerd.juju-proxy.changed` flag. systemd is reloaded and a
    containerd restart requested only when the drop-in actually changed.

    :return: None
    """
    # Create "dumb" context based on Config
    # to avoid triggering config.changed.
    context = check_for_juju_https_proxy(config)

    service_file = 'proxy.conf'
    service_directory = '/etc/systemd/system/containerd.service.d'
    service_path = os.path.join(service_directory, service_file)

    restart_needed = True
    if context.get('http_proxy') or \
            context.get('https_proxy') or context.get('no_proxy'):

        os.makedirs(service_directory, exist_ok=True)

        log('Proxy changed, writing new file to {}'.format(service_path))
        render(
            service_file,
            service_path,
            context
        )
    else:
        try:
            log('Proxy cleaned, removing file {}'.format(service_path))
            os.remove(service_path)
        except FileNotFoundError:
            # Nothing was removed: we don't need to restart the daemon.
            restart_needed = False

    # Always record what was applied and clear the flag, even when there was
    # no drop-in to remove. Otherwise the cache is never seeded,
    # _juju_proxy_changed() keeps returning True and this handler re-runs on
    # every hook.
    DB.set('config-cache', context)
    remove_state('containerd.juju-proxy.changed')

    if restart_needed:
        check_call(['systemctl', 'daemon-reload'])
        set_state('containerd.restart')


@when('containerd.restart')
@when_not('endpoint.containerd.departed')
def restart_containerd():
    """
    Restart the containerd service.

    If the restart fails, this function will log a message and be retried on
    the next hook.
    """
    status.maintenance('Restarting containerd')
    if host.service_restart('containerd.service'):
        remove_state('containerd.restart')
    else:
        log('Failed to restart containerd; will retry')


@when('containerd.ready')
@when('endpoint.containerd.joined')
@when_not('endpoint.containerd.departed')
def publish_config():
    """
    Pass configuration to principal charm.

    :return: None
    """
    endpoint = endpoint_from_flag('endpoint.containerd.joined')
    endpoint.set_config(
        socket='unix:///var/run/containerd/containerd.sock',
        runtime='remote',  # TODO handle in k8s worker.
        nvidia_enabled=is_state('containerd.nvidia.available')
    )


@when('endpoint.untrusted.available')
@when_not('untrusted.configured')
@when_not('endpoint.containerd.departed')
def untrusted_available():
    """
    Handle untrusted container runtime.

    :return: None
    """
    untrusted_runtime = endpoint_from_flag('endpoint.untrusted.available')
    received = dict(untrusted_runtime.get_config())

    if 'name' not in received:
        return  # Try until config is available.

    DB.set('untrusted', received)
    config_changed()

    set_state('untrusted.configured')


@when('endpoint.untrusted.departed')
def untrusted_departed():
    """
    Handle untrusted container runtime.

    :return: None
    """
    DB.unset('untrusted')
    DB.flush()
    config_changed()

    remove_state('untrusted.configured')


@when('endpoint.docker-registry.ready')
@when_not('containerd.registry.configured')
def configure_registry():
    """
    Add docker registry config when present.

    :return: None
    """
    registry = endpoint_from_flag('endpoint.docker-registry.ready')

    docker_registry = {
        'url': registry.registry_netloc
    }

    # Handle auth data.
    if registry.has_auth_basic():
        docker_registry['username'] = registry.basic_user
        docker_registry['password'] = registry.basic_password

    # Handle TLS data.
    if registry.has_tls():
        # Ensure the CA that signed our registry cert is trusted.
        host.install_ca_cert(registry.tls_ca, name='juju-docker-registry')

        docker_registry['ca'] = str(ca_crt_path)
        docker_registry['key'] = str(server_key_path)
        docker_registry['cert'] = str(server_crt_path)

    DB.set('registry', docker_registry)

    config_changed()
    set_state('containerd.registry.configured')


@when('endpoint.docker-registry.changed',
      'containerd.registry.configured')
def reconfigure_registry():
    """
    Signal to update the registry config when something changes.

    :return: None
    """
    remove_state('containerd.registry.configured')


@when('endpoint.containerd.reconfigure')
@when_not('endpoint.containerd.departed')
def container_runtime_relation_changed():
    """
    Run config_changed to use any new config from the endpoint.

    :return: None
    """
    config_changed()
    endpoint = endpoint_from_flag('endpoint.containerd.reconfigure')
    endpoint.handle_remote_config()


@when('containerd.registry.configured')
@when_not('endpoint.docker-registry.joined')
def remove_registry():
    """
    Remove registry config when the registry is no longer present.

    :return: None
    """
    docker_registry = DB.get('registry', None)

    if docker_registry:
        # Remove from DB.
        DB.unset('registry')
        DB.flush()

        # Remove auth-related data.
        log('Disabling auth for docker registry: {}.'.format(
            docker_registry['url']))

    config_changed()
    remove_state('containerd.registry.configured')