# Charmed-Kubernetes/containerd/reactive/containerd.py
#
# 872 lines
# 26 KiB
# Python

import contextlib
import os
import base64
import binascii
import json
import traceback
from subprocess import check_call, check_output, CalledProcessError
from typing import List, Mapping, Optional, Set
import urllib.request
import urllib.error
from charms.reactive import (
hook,
when,
when_not,
set_state,
is_state,
remove_state,
endpoint_from_flag,
register_trigger,
)
from charms.layer import containerd, status
from charms.layer.container_runtime_common import (
ca_crt_path,
server_crt_path,
server_key_path,
check_for_juju_https_proxy,
)
from charmhelpers.core import host, unitdata
from charmhelpers.core.templating import render
from charmhelpers.core.hookenv import atexit, config, env_proxy_settings, log, application_version_set
from charmhelpers.core.kernel import modprobe
from charmhelpers.fetch import (
apt_cache,
apt_install,
apt_update,
apt_purge,
apt_hold,
apt_autoremove,
apt_unhold,
import_key,
)
from charmhelpers.fetch.ubuntu_apt_pkg import Package
def apt_packages(packages: Set[str]) -> Mapping[str, Package]:
    """Look up the requested packages in the apt cache.

    Names that apt does not know are logged and left out of the result.
    The search is widened with a ``<name>*`` wildcard per requested name so
    that matching entries reported by ``dpkg_list`` are looked up as well.

    :param packages: package names to search for
    :return: mapping of package name to the entry found in the apt cache
    """
    found = {}
    if packages:
        cache = apt_cache()
        # Also pick up already installed packages matching "<name>*".
        patterns = [name + "*" for name in packages]
        candidates = set(cache.dpkg_list(patterns).keys()) | set(packages)
        for name in candidates:
            try:
                found[name] = cache[name]
            except KeyError:
                log(f"Cannot find {name} in apt.")
    return found
@contextlib.contextmanager
def proxy_env():
    """Temporarily apply the Juju proxy settings to ``os.environ``.

    Charm config values for ``http_proxy``, ``https_proxy`` and ``no_proxy``
    are exported as ``JUJU_CHARM_*`` variables first; whatever
    ``env_proxy_settings()`` returns is then merged into the environment for
    the duration of the ``with`` block.

    The clean-up runs in a ``finally`` clause, so the environment is put back
    even when the body of the ``with`` block raises.

    NOTE(review): a ``JUJU_CHARM_*`` variable that did not exist before
    entering the context is still present afterwards (unchanged from the
    previous behaviour) -- confirm whether that is intended.

    :return: context manager yielding ``os.environ``
    """
    saved = dict(os.environ)  # snapshot of the values to restore on exit
    # Overwrite JUJU_CHARM_*_PROXY from config where available.
    for key in ("http_proxy", "https_proxy", "no_proxy"):
        val = config(key)
        if val:
            os.environ[f"JUJU_CHARM_{key.upper()}"] = val
    juju_proxies = env_proxy_settings() or {}
    os.environ.update(juju_proxies)  # insert or update the proxy variables
    try:
        yield os.environ
    finally:
        for key in juju_proxies:
            # pop() tolerates the with-body having removed the key itself.
            os.environ.pop(key, None)
        os.environ.update(saved)  # restore any original values
def fetch_url_text(urls) -> List[Optional[str]]:
    """Fetch the text behind each url within a proxied environment.

    Every response is closed as soon as its body has been read.

    :param urls: iterable of urls to fetch
    :return: one entry per url, in order: the decoded body, or None when the
        request failed with an ``HTTPError`` or ``URLError`` (which is logged)
    """
    responses = []
    # proxy_env() updates os.environ to include the juju proxy settings.
    with proxy_env():
        opener = urllib.request.build_opener(urllib.request.ProxyHandler())
        for url in urls:
            text = None
            try:
                # The context manager closes the response once it is read.
                with opener.open(url) as response:
                    text = response.read().decode()
            except urllib.error.HTTPError as e:
                log(f"Cannot fetch url='{url}' with code {e.code} {e.reason}")
            except urllib.error.URLError as e:
                log(f"Cannot fetch url='{url}' with {e.reason}")
            finally:
                responses.append(text)
    return responses
# Key/value store handle; this module keeps "config-cache", "registry" and
# "untrusted" entries in it.
DB = unitdata.kv()
CONTAINERD_PACKAGE = "containerd"

# A change to any of the nvidia apt options clears the nvidia-ready flag.
for _flag in (
    "config.changed.nvidia_apt_key_urls",
    "config.changed.nvidia_apt_sources",
    "config.changed.nvidia_apt_packages",
):
    register_trigger(when=_flag, clear_flag="containerd.nvidia.ready")
del _flag
def _check_containerd():
"""
Check that containerd is running.
`ctr version` calls both client and server side, so is a reasonable indication that everything's been set up
correctly.
:return: Boolean
"""
try:
version = check_output(["ctr", "version"])
except (FileNotFoundError, CalledProcessError):
return None
return version
def _juju_proxy_changed():
    """
    Tell whether the proxy settings differ from the cached ones.

    The values returned by ``check_for_juju_https_proxy`` are compared with
    the "config-cache" entry of the unit database.

    :return: True when nothing is cached yet or when any of ``http_proxy``,
        ``https_proxy`` or ``no_proxy`` differs from the cache, else False
    """
    cached = DB.get("config-cache", None)
    if not cached:
        return True  # First pass.

    current = check_for_juju_https_proxy(config)
    unchanged = all(cached[key] == current[key] for key in ("http_proxy", "https_proxy", "no_proxy"))
    return not unchanged
@atexit
def charm_status():
    """
    Set the charm's status after each hook is run.

    Checked in order: a series upgrade in progress, an invalid ``gpu_driver``
    option, then whether containerd answers. The "containerd.ready" flag is
    set only when containerd answers.

    :return: None
    """
    if is_state("upgrade.series.in-progress"):
        status.blocked("Series upgrade in progress")
        return

    if is_state("containerd.nvidia.invalid-option"):
        bad_value = config().get("gpu_driver")
        status.blocked("{} is an invalid option for gpu_driver".format(bad_value))
        return

    if not _check_containerd():
        status.blocked("Container runtime not available")
        return

    status.active("Container runtime available")
    set_state("containerd.ready")
def strip_url(url):
    """Reduce a URL to what follows its scheme, without trailing slashes.

    Examples:
    url: http://10.10.10.10:8000 --> return: 10.10.10.10:8000
    url: https://myregistry.io:8000/ --> return: myregistry.io:8000
    url: myregistry.io:8000 --> return: myregistry.io:8000

    :param str url: URL, with or without a ``scheme://`` prefix
    :return: the URL without its scheme prefix and trailing slashes
    """
    trimmed = url.rstrip("/")
    head, separator, tail = trimmed.partition("://")
    # Without a "://" separator there is no scheme to drop.
    return tail if separator else head
def update_custom_tls_config(config_directory, registries, old_registries):
    """
    Remove the TLS files of old registries and write those of new ones.

    For every registry and each of ``ca``, ``key`` and ``cert``, a file is
    only handled when the registry carries a non-empty ``<opt>_file`` value.
    The resulting file path is stored back on the registry under ``<opt>``.

    :param str config_directory: containerd config directory
    :param List registries: juju config for custom registries
    :param List old_registries: old juju config for custom registries
    :return: None
    """
    tls_opts = ("ca", "key", "cert")

    def tls_path(registry, opt):
        # <config_directory>/<host:port>.<opt>
        return os.path.join(config_directory, "%s.%s" % (strip_url(registry["url"]), opt))

    # Remove tls files of old registries, so as not to leave stale files.
    for registry in old_registries:
        for opt in tls_opts:
            if not registry.get("%s_file" % opt):
                continue
            stale = tls_path(registry, opt)
            registry[opt] = stale
            if os.path.isfile(stale):
                os.remove(stale)

    # Write tls files of new registries.
    for registry in registries:
        for opt in tls_opts:
            encoded = registry.get("%s_file" % opt)
            if not encoded:
                continue
            try:
                decoded = base64.b64decode(encoded)
            except (binascii.Error, TypeError):
                log(traceback.format_exc())
                log("{}:{} didn't look like base64 data... skipping".format(registry["url"], opt))
                continue
            target = tls_path(registry, opt)
            registry[opt] = target
            with open(target, "wb") as f:
                f.write(decoded)
def populate_host_for_custom_registries(custom_registries):
    """Fill in a missing ``host`` of each custom registry from its ``url``.

    Examples:
    url: http://10.10.10.10:8000 --> host: 10.10.10.10:8000
    url: https://myregistry.io:8000/ --> host: myregistry.io:8000
    url: myregistry.io:8000 --> host: myregistry.io:8000

    Anything that is not a list is handed back untouched; registries are
    updated in place.

    :param custom_registries: list of registry dicts (or any other value)
    :return: the object that was passed in
    """
    if not isinstance(custom_registries, list):
        return custom_registries

    for registry in custom_registries:
        if registry.get("host"):
            continue  # a host is already present, leave it alone
        url = registry.get("url")
        if url:
            registry["host"] = strip_url(url)
    return custom_registries
def insert_docker_io_to_custom_registries(custom_registries):
    """
    Make sure a docker.io registry entry is present.

    When no entry has ``host == "docker.io"``, a default one is inserted at
    the front of the list; an existing entry is left untouched, which lets
    configuration override the url used for docker.io. Values that are not
    a list are returned unchanged.

    :param custom_registries: list of registry dicts (or any other value)
    :return: the object that was passed in
    """
    if not isinstance(custom_registries, list):
        return custom_registries

    for entry in custom_registries:
        if entry.get("host") == "docker.io":
            break
    else:
        # No docker.io host was found: add the default one first.
        custom_registries.insert(0, {"host": "docker.io", "url": "https://registry-1.docker.io"})
    return custom_registries
class InvalidCustomRegistriesError(Exception):
    """Raised when the decoded custom registries value is not a list."""
def _registries_list(registries, default=None):
"""
Parse registry config and ensure it returns a list or raises ValueError.
:param str registries: representation of registries
:param default: if provided, return rather than raising exceptions
:return: List of registry objects
"""
registry_list = default
try:
registry_list = json.loads(registries)
except json.JSONDecodeError:
if default is None:
raise
if not isinstance(registry_list, list):
if default is None:
raise InvalidCustomRegistriesError("'{}' is not a list".format(registries))
else:
return default
return registry_list
def merge_custom_registries(config_directory, custom_registries, old_custom_registries):
    """
    Merge custom registries with the Docker registry stored from the relation.

    The configured registries get their hosts populated and a docker.io entry
    ensured, the TLS files on disk are refreshed, and the "registry" entry of
    the unit database (if any) is appended last.

    :param str config_directory: containerd config directory
    :param str custom_registries: juju config for custom registries
    :param str old_custom_registries: old juju config for custom registries
    :return: List Dictionary merged registries
    """
    # The json string is converted to a python list here.
    registries = list(_registries_list(custom_registries, default=[]))
    registries = insert_docker_io_to_custom_registries(populate_host_for_custom_registries(registries))

    if old_custom_registries:
        previous = list(_registries_list(old_custom_registries, default=[]))
    else:
        previous = []
    update_custom_tls_config(config_directory, registries, previous)

    relation_registry = DB.get("registry", None)
    if relation_registry:
        registries.append(relation_registry)
    return registries
def invalid_custom_registries(custom_registries):
    """
    Validate custom registries from config.

    Each registry must be an object with a string ``url``; the optional
    string fields must be strings (or null), ``insecure_skip_verify`` must be
    a boolean, no other field may appear, and no host may be defined twice.

    Type checks are made on presence rather than truthiness, so falsy
    non-string values such as ``0``, ``[]``, ``{}`` or ``false`` are rejected
    instead of slipping through, and a non-string ``url`` is reported rather
    than failing later when its host is derived.

    :param str custom_registries: juju config for custom registries
    :return: error string for blocked status if condition exists, None otherwise
    :rtype: Optional[str]
    """
    try:
        registries = _registries_list(custom_registries)
    except json.JSONDecodeError:
        log(traceback.format_exc())
        return "Failed to decode json string"
    except InvalidCustomRegistriesError:
        log(traceback.format_exc())
        return "custom_registries is not a list"

    required_fields = ["url"]
    str_fields = [
        "url",
        "host",
        "username",
        "password",
        "ca_file",
        "cert_file",
        "key_file",
    ]
    truthy_fields = [
        "insecure_skip_verify",
    ]
    allowed_fields = set(str_fields + truthy_fields)

    host_set = set()
    for idx, registry in enumerate(registries):
        if not isinstance(registry, dict):
            return "registry #{} is not in object form".format(idx)

        for field in required_fields:
            if field not in registry:
                return "registry #{} missing required field {}".format(idx, field)

        for field in required_fields + str_fields:
            if field not in registry:
                continue
            value = registry[field]
            # Optional fields may be null; required ones must be real strings.
            if value is None and field not in required_fields:
                continue
            if not isinstance(value, str):
                return "registry #{} field {}={} is not a string".format(idx, field, value)

        for field in truthy_fields:
            value = registry.get(field)
            if field in registry and not isinstance(value, bool):
                return "registry #{} field {}='{}' is not a boolean".format(idx, field, value)

        for field in registry:
            if field not in allowed_fields:
                return "registry #{} field {} may not be specified".format(idx, field)

        # Same fallback as used when the host is populated from the url.
        this_host = registry.get("host") or strip_url(registry["url"])
        if this_host in host_set:
            return "registry #{} defines {} more than once".format(idx, this_host)
        host_set.add(this_host)

    return None
@hook("update-status")
def update_status():
    """
    Handle the update-status hook.

    Flags a proxy change so that the proxy configuration gets re-applied.

    :return: None
    """
    if not _juju_proxy_changed():
        return
    set_state("containerd.juju-proxy.changed")
@hook("upgrade-charm")
def upgrade_charm():
    """
    Handle the upgrade-charm hook.

    Holds the containerd package, re-renders the config, deletes the legacy
    nvidia apt source files and clears the nvidia-ready flag.

    :return: None
    """
    # Prevent containerd apt pkg from being implicitly updated.
    apt_hold(CONTAINERD_PACKAGE)

    # Re-render config in case the template has changed in the new charm.
    config_changed()

    # Clean up old nvidia sources.list.d files.
    legacy_sources = (
        "/etc/apt/sources.list.d/nvidia-container-runtime.list",
        "/etc/apt/sources.list.d/cuda.list",
    )
    for stale in filter(os.path.exists, legacy_sources):
        os.remove(stale)

    remove_state("containerd.nvidia.ready")
@when_not("containerd.br_netfilter.enabled")
def enable_br_netfilter_module():
    """
    Load br_netfilter persistently.

    Works around https://github.com/kubernetes/kubernetes/issues/21613.
    A failure is ignored inside a container; otherwise the flag stays unset
    so that the handler runs again.

    :return: None
    """
    try:
        modprobe("br_netfilter", persist=True)
    except Exception:
        log(traceback.format_exc())
        if not host.is_container():
            log("LXD not detected, will retry loading br_netfilter")
            return
        log("LXD detected, ignoring failure to load br_netfilter")

    set_state("containerd.br_netfilter.enabled")
@when_not("containerd.ready", "containerd.installed", "endpoint.containerd.departed")
def install_containerd():
    """
    Install the containerd apt package and render its first configuration.

    :return: None
    """
    status.maintenance("Installing containerd via apt")

    # Refresh the package index, install, then hold the package.
    apt_update()
    apt_install(CONTAINERD_PACKAGE, fatal=True)
    apt_hold(CONTAINERD_PACKAGE)
    set_state("containerd.installed")

    # Create the initial configuration.
    config_changed()
@when("containerd.installed")
@when_not("containerd.version-published")
def publish_version_to_juju():
    """
    Publish the containerd version to Juju.

    The version is the token following the last ``Version:`` label of the
    ``ctr version`` output, cut at the first ``-``. Looking the label up,
    rather than relying on a fixed token position, keeps this working when
    the output carries extra fields ahead of it. If no version can be found
    the flag stays unset so the handler is retried.

    NOTE(review): assumes the server section is printed after the client
    section, so the last ``Version:`` is the server's -- confirm.

    :return: None
    """
    version_string = _check_containerd()
    if not version_string:
        return

    tokens = version_string.split()
    # Positions of "Version:" labels that are followed by at least one token.
    label_positions = [idx for idx, token in enumerate(tokens[:-1]) if token == b"Version:"]
    if not label_positions:
        log("Unable to find a version in the output of ctr version; will retry")
        return

    version = tokens[label_positions[-1] + 1].split(b"-")[0].decode()
    application_version_set(version)
    set_state("containerd.version-published")
@when_not("containerd.nvidia.checked")
@when_not("endpoint.containerd.departed")
def check_for_gpu():
    """
    Decide whether NVIDIA support should be active.

    ``gpu_driver`` may be "auto", "none" or "nvidia"; any other value sets
    the invalid-option flag and stops. With "auto" the decision follows
    whether "nvidia" shows up in the ``lspci -nnk`` output.

    :return: None
    """
    driver_config = config().get("gpu_driver")
    if driver_config not in ("auto", "none", "nvidia"):
        set_state("containerd.nvidia.invalid-option")
        return

    lspci_output = check_output(["lspci", "-nnk"]).rstrip().decode("utf-8").lower()
    if driver_config == "auto":
        # follow the hardware that was found
        use_nvidia = "nvidia" in lspci_output
    else:
        # follow the explicit configuration
        use_nvidia = driver_config == "nvidia"

    if use_nvidia:
        # allow/install nvidia drivers to activate
        set_state("containerd.nvidia.available")
    else:
        # prevent/remove nvidia driver from activating
        remove_state("containerd.nvidia.available")

    remove_state("containerd.nvidia.invalid-option")
    set_state("containerd.nvidia.checked")
@when("containerd.nvidia.ready")
@when_not("containerd.nvidia.available")
def unconfigure_nvidia(reconfigure=True):
    """
    Based on charm config, remove NVIDIA drivers.

    Purges the packages named in ``nvidia_apt_packages`` that apt knows
    about, deletes the nvidia apt source file and clears the nvidia-ready
    flag.

    :param bool reconfigure: re-render the containerd config afterwards
    :return: None
    """
    status.maintenance("Removing Nvidia drivers.")

    requested = config("nvidia_apt_packages").split()
    to_purge = apt_packages(requested).keys()
    if to_purge:
        apt_purge(to_purge, fatal=True)

    source = "/etc/apt/sources.list.d/nvidia.list"
    if os.path.isfile(source):
        os.remove(source)

    # Only clean up dependencies when something was actually purged.
    if to_purge:
        apt_autoremove(purge=True, fatal=True)

    remove_state("containerd.nvidia.ready")
    if reconfigure:
        config_changed()
@when("containerd.nvidia.available")
@when_not("containerd.nvidia.ready", "endpoint.containerd.departed")
def configure_nvidia(reconfigure=True):
    """Based on charm config, install and configure Nvidia drivers.

    The key urls and apt source lines from config may use the ``{id}``,
    ``{version_id}`` and ``{version_id_no_dot}`` placeholders, which are
    filled in from ``host.lsb_release()``.

    :param bool reconfigure: re-render the containerd config afterwards
    :return: None
    """
    # First remove any existing nvidia drivers.
    unconfigure_nvidia(reconfigure=False)
    status.maintenance("Installing Nvidia drivers.")

    dist = host.lsb_release()
    release = dist["DISTRIB_RELEASE"]
    placeholders = {
        "id": dist["DISTRIB_ID"].lower(),
        "version_id": release,
        "version_id_no_dot": release.replace(".", ""),
    }

    formatted_key_urls = [key_url.format(**placeholders) for key_url in config("nvidia_apt_key_urls").split()]
    if formatted_key_urls:
        fetched_keys = fetch_url_text(formatted_key_urls)
        if not all(fetched_keys):
            status.blocked("Failed to fetch nvidia_apt_key_urls.")
            return
        for key in fetched_keys:
            import_key(key)

    formatted_sources = [source.format(**placeholders) for source in config("nvidia_apt_sources").splitlines()]
    with open("/etc/apt/sources.list.d/nvidia.list", "w") as f:
        f.write("\n".join(formatted_sources))
    apt_update()

    nvidia_packages = config("nvidia_apt_packages").split()
    if not nvidia_packages:
        status.blocked("No NVIDIA packages selected to install.")
        return
    apt_install(nvidia_packages, fatal=True)

    set_state("containerd.nvidia.ready")
    if reconfigure:
        config_changed()
@when("endpoint.containerd.departed")
def purge_containerd():
    """
    Stop and purge containerd, then reset this charm's flags.

    NVIDIA packages are removed too when they had been installed.

    :return: None
    """
    status.maintenance("Removing containerd from principal")

    host.service_stop("containerd.service")
    apt_unhold(CONTAINERD_PACKAGE)
    apt_purge(CONTAINERD_PACKAGE, fatal=True)

    if is_state("containerd.nvidia.ready"):
        unconfigure_nvidia(reconfigure=False)

    apt_autoremove(purge=True, fatal=True)

    for flag in (
        "containerd.ready",
        "containerd.installed",
        "containerd.nvidia.ready",
        "containerd.nvidia.checked",
        "containerd.nvidia.available",
        "containerd.version-published",
    ):
        remove_state(flag)
@when("config.changed.gpu_driver")
def gpu_config_changed():
    """
    Clear the GPU checked flag when ``gpu_driver`` changes.

    Without the flag the GPU check handler becomes eligible to run again.

    :return: None
    """
    remove_state("containerd.nvidia.checked")
# Directory that receives the rendered containerd config and the TLS files
# written for custom registries.
CONFIG_DIRECTORY = "/etc/containerd"
# Name of the rendered config file inside CONFIG_DIRECTORY.
CONFIG_FILE = "config.toml"
@when("config.changed")
@when_not("endpoint.containerd.departed")
def config_changed():
    """
    Render the containerd config template and request a restart.

    The template context is built from the charm config plus the sandbox
    image, the merged registries, the untrusted runtime (if any) and the
    resolved runtime. Invalid ``custom_registries`` block the charm and
    nothing is rendered.

    :return: None
    """
    if _juju_proxy_changed():
        set_state("containerd.juju-proxy.changed")

    # Create "dumb" context based on Config to avoid triggering config.changed
    context = dict(config())
    template_config = "config_v2.toml" if context["config_version"] == "v2" else "config.toml"

    # Prefer the sandbox image from the endpoint, fall back to the layer's.
    endpoint = endpoint_from_flag("endpoint.containerd.available")
    sandbox_image = endpoint.get_sandbox_image() if endpoint else None
    if sandbox_image:
        log("Setting sandbox_image to: {}".format(sandbox_image))
    else:
        sandbox_image = containerd.get_sandbox_image()
    context["sandbox_image"] = sandbox_image

    if not os.path.isdir(CONFIG_DIRECTORY):
        os.mkdir(CONFIG_DIRECTORY)

    # If custom_registries changed, make sure to remove old tls files.
    old_custom_registries = None
    if config().changed("custom_registries"):
        old_custom_registries = config().previous("custom_registries")

    # validate custom_registries
    invalid_reason = invalid_custom_registries(context["custom_registries"])
    if invalid_reason:
        status.blocked("Invalid custom_registries: {}".format(invalid_reason))
        return

    context["custom_registries"] = merge_custom_registries(
        CONFIG_DIRECTORY, context["custom_registries"], old_custom_registries
    )

    untrusted = DB.get("untrusted")
    context["untrusted"] = bool(untrusted)
    if untrusted:
        binary_path = untrusted["binary_path"]
        context["untrusted_name"] = untrusted["name"]
        context["untrusted_path"] = binary_path
        context["untrusted_binary"] = os.path.basename(binary_path)

    if context.get("runtime") == "auto":
        nvidia = is_state("containerd.nvidia.available")
        context["runtime"] = "nvidia-container-runtime" if nvidia else "runc"

    render(template_config, os.path.join(CONFIG_DIRECTORY, CONFIG_FILE), context)
    set_state("containerd.restart")
@when("containerd.installed")
@when("containerd.juju-proxy.changed")
@when_not("endpoint.containerd.departed")
def proxy_changed():
    """
    Apply new proxy settings.

    Writes the systemd drop-in when any proxy value is set, removes it
    otherwise. The settings are cached and the changed flag is cleared in
    every case, so the handler does not run again on each hook while no
    drop-in exists; systemd is only reloaded and containerd only restarted
    when the drop-in was actually written or removed.

    :return: None
    """
    # Create "dumb" context based on Config
    # to avoid triggering config.changed.
    context = check_for_juju_https_proxy(config)

    service_file = "proxy.conf"
    service_directory = "/etc/systemd/system/containerd.service.d"
    service_path = os.path.join(service_directory, service_file)

    restart_needed = True
    if context.get("http_proxy") or context.get("https_proxy") or context.get("no_proxy"):
        os.makedirs(service_directory, exist_ok=True)
        log("Proxy changed, writing new file to {}".format(service_path))
        render(service_file, service_path, context)
    else:
        try:
            log("Proxy cleaned, removing file {}".format(service_path))
            os.remove(service_path)
        except FileNotFoundError:
            # Nothing was removed: we don't need to restart the daemon.
            restart_needed = False

    # Record what has been applied so the change is not detected again.
    DB.set("config-cache", context)
    remove_state("containerd.juju-proxy.changed")

    if restart_needed:
        check_call(["systemctl", "daemon-reload"])
        set_state("containerd.restart")
@when("containerd.restart")
@when_not("endpoint.containerd.departed")
def restart_containerd():
    """
    Restart the containerd service.

    The restart flag is only cleared on success; after a failure a message
    is logged and the handler is retried on the next hook.

    :return: None
    """
    status.maintenance("Restarting containerd")

    restarted = host.service_restart("containerd.service")
    if not restarted:
        log("Failed to restart containerd; will retry")
        return
    remove_state("containerd.restart")
@when("containerd.ready")
@when("endpoint.containerd.joined")
@when_not("endpoint.containerd.departed")
def publish_config():
    """
    Send the runtime configuration to the principal charm.

    :return: None
    """
    endpoint = endpoint_from_flag("endpoint.containerd.joined")
    runtime_config = {
        "socket": "unix:///var/run/containerd/containerd.sock",
        "runtime": "remote",  # TODO handle in k8s worker.
        "nvidia_enabled": is_state("containerd.nvidia.available"),
    }
    endpoint.set_config(**runtime_config)
@when("endpoint.untrusted.available")
@when_not("untrusted.configured")
@when_not("endpoint.containerd.departed")
def untrusted_available():
    """
    Store the untrusted container runtime config and re-render.

    Nothing happens until the received config carries a "name".

    :return: None
    """
    untrusted_runtime = endpoint_from_flag("endpoint.untrusted.available")
    received = dict(untrusted_runtime.get_config())

    if "name" in received:
        DB.set("untrusted", received)
        config_changed()
        set_state("untrusted.configured")
    # Otherwise: try again until config is available.
@when("endpoint.untrusted.departed")
def untrusted_departed():
    """
    Forget the untrusted container runtime and re-render the config.

    :return: None
    """
    # Drop the stored runtime and persist that before rendering.
    DB.unset("untrusted")
    DB.flush()

    config_changed()
    remove_state("untrusted.configured")
@when("endpoint.docker-registry.ready")
@when_not("containerd.registry.configured")
def configure_registry():
    """
    Store the related docker registry and re-render the config.

    Basic-auth credentials and TLS file paths are added to the stored entry
    when the registry provides them.

    :return: None
    """
    registry = endpoint_from_flag("endpoint.docker-registry.ready")

    docker_registry = {}
    docker_registry["host"] = strip_url(registry.registry_netloc)
    docker_registry["url"] = registry.registry_netloc

    # Handle auth data.
    if registry.has_auth_basic():
        docker_registry.update(
            username=registry.basic_user,
            password=registry.basic_password,
        )

    # Handle TLS data.
    if registry.has_tls():
        # Ensure the CA that signed our registry cert is trusted.
        host.install_ca_cert(registry.tls_ca, name="juju-docker-registry")
        docker_registry.update(
            ca=str(ca_crt_path),
            key=str(server_key_path),
            cert=str(server_crt_path),
        )

    DB.set("registry", docker_registry)
    config_changed()
    set_state("containerd.registry.configured")
@when("endpoint.docker-registry.changed", "containerd.registry.configured")
def reconfigure_registry():
    """
    Clear the registry-configured flag when the registry relation changes.

    With the flag cleared, the registry configuration handler can run again.

    :return: None
    """
    remove_state("containerd.registry.configured")
@when("endpoint.containerd.reconfigure")
@when_not("endpoint.containerd.departed")
def container_runtime_relation_changed():
    """
    Re-render the config with any new config from the endpoint.

    The endpoint is then asked to handle the remote config.

    :return: None
    """
    config_changed()

    runtime_endpoint = endpoint_from_flag("endpoint.containerd.reconfigure")
    runtime_endpoint.handle_remote_config()
@when("containerd.registry.configured")
@when_not("endpoint.docker-registry.joined")
def remove_registry():
    """
    Drop the stored registry once the registry relation is gone.

    The config is re-rendered and the registry-configured flag cleared
    whether or not a registry entry was stored.

    :return: None
    """
    stored_registry = DB.get("registry", None)

    if stored_registry:
        # Remove from DB.
        DB.unset("registry")
        DB.flush()

        # Remove auth-related data.
        log("Disabling auth for docker registry: {}.".format(stored_registry["url"]))

    config_changed()
    remove_state("containerd.registry.configured")