#!/usr/local/sbin/charm-env python3 # Copyright 2015 The Kubernetes Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import base64 import json import os import re import socket import traceback import yaml from itertools import filterfalse from shutil import move, copyfile from pathlib import Path from subprocess import check_call, call from subprocess import check_output from subprocess import CalledProcessError from urllib.request import Request, urlopen import charms.coordinator from charms.layer import snap from charms.leadership import leader_get, leader_set from charms.reactive import hook from charms.reactive import remove_state, clear_flag from charms.reactive import get_flags, set_state, set_flag from charms.reactive import is_state, is_flag_set, get_unset_flags from charms.reactive import endpoint_from_flag, endpoint_from_name from charms.reactive import when, when_any, when_not, when_none from charms.reactive import register_trigger from charms.reactive import data_changed, any_file_changed from charms.layer import tls_client from charms.layer import vaultlocker from charms.layer import vault_kv from charmhelpers.core import hookenv from charmhelpers.core import host from charmhelpers.core import unitdata from charmhelpers.core.host import restart_on_change from charmhelpers.core.host import ( service_pause, service_resume, service_running, service_stop, ) from charmhelpers.core.templating import render from charmhelpers.contrib.charmsupport import nrpe from 
charmhelpers.contrib.storage.linux.ceph import CephBrokerRq from charms.layer import kubernetes_control_plane from charms.layer import kubernetes_common from charms.layer.kubernetes_common import kubeclientconfig_path from charms.layer.kubernetes_common import migrate_resource_checksums from charms.layer.kubernetes_common import check_resources_for_upgrade_needed from charms.layer.kubernetes_common import ( calculate_and_store_resource_checksums, ) # noqa from charms.layer.kubernetes_common import arch from charms.layer.kubernetes_common import service_restart from charms.layer.kubernetes_common import get_ingress_address from charms.layer.kubernetes_common import get_ingress_address6 from charms.layer.kubernetes_common import create_kubeconfig from charms.layer.kubernetes_common import get_service_ip from charms.layer.kubernetes_common import configure_kubernetes_service from charms.layer.kubernetes_common import cloud_config_path from charms.layer.kubernetes_common import encryption_config_path from charms.layer.kubernetes_common import write_gcp_snap_config from charms.layer.kubernetes_common import generate_openstack_cloud_config from charms.layer.kubernetes_common import write_azure_snap_config from charms.layer.kubernetes_common import configure_kube_proxy from charms.layer.kubernetes_common import kubeproxyconfig_path from charms.layer.kubernetes_common import get_version from charms.layer.kubernetes_common import retry from charms.layer.kubernetes_common import ca_crt_path from charms.layer.kubernetes_common import server_crt_path from charms.layer.kubernetes_common import server_key_path from charms.layer.kubernetes_common import client_crt_path from charms.layer.kubernetes_common import client_key_path from charms.layer.kubernetes_common import kubectl, kubectl_manifest, kubectl_success from charms.layer.kubernetes_common import _get_vmware_uuid from charms.layer.kubernetes_common import get_node_name from charms.layer.kubernetes_common import 
get_sandbox_image_uri
from charms.layer.kubernetes_common import kubelet_kubeconfig_path
from charms.layer.kubernetes_node_base import LabelMaker
from charms.layer.nagios import install_nagios_plugin_from_file
from charms.layer.nagios import remove_nagios_plugin

# Override the default nagios shortname regex to allow periods, which we
# need because our bin names contain them (e.g. 'snap.foo.daemon'). The
# default regex in charmhelpers doesn't allow periods, but nagios itself does.
nrpe.Check.shortname_re = r"[\.A-Za-z0-9-_]+$"

# Snaps delivered as charm resources.  This same list drives the resource
# checksum bookkeeping (upgrade detection) and snap cohort management below.
snap_resources = [
    "kubectl",
    "kube-apiserver",
    "kube-controller-manager",
    "kube-scheduler",
    "cdk-addons",
    "kube-proxy",
    "kubelet",
]

# Snap daemons that make up the control plane on this unit; used for
# service pause/resume around series upgrades and for hacluster/systemd
# management (unit names are "snap.<name>.daemon").
control_plane_services = [
    "kube-apiserver",
    "kube-controller-manager",
    "kube-scheduler",
    "kube-proxy",
    "kubelet",
]

# Cohorts are tracked for exactly the snaps we ship as resources.
cohort_snaps = snap_resources

# Make snap-installed binaries (e.g. /snap/bin/kubectl) resolvable from
# hook execution context.
os.environ["PATH"] += os.pathsep + os.path.join(os.sep, "snap", "bin")

# Unit-local key/value store used for resource checksums and upgrade
# bookkeeping.
db = unitdata.kv()
# NOTE(review): these key prefixes keep the charm's legacy name so data
# stored before the rename to kubernetes-control-plane stays readable.
checksum_prefix = "kubernetes-master.resource-checksums."
configure_prefix = "kubernetes-master.prev_args."
# Filesystem locations for rendered config shared with the k8s services.
keystone_root = "/root/cdk/keystone"
keystone_policy_path = os.path.join(keystone_root, "keystone-policy.yaml")
kubecontrollermanagerconfig_path = "/root/cdk/kubecontrollermanagerconfig"
kubeschedulerconfig_path = "/root/cdk/kubeschedulerconfig"
cdk_addons_kubectl_config_path = "/root/cdk/cdk_addons_kubectl_config"
kubernetes_logs = "/var/log/kubernetes/"
aws_iam_webhook = "/root/cdk/aws-iam-webhook.yaml"

# Artifacts for the charm-managed authentication webhook and its systemd unit.
auth_webhook_root = "/root/cdk/auth-webhook"
auth_webhook_conf = os.path.join(auth_webhook_root, "auth-webhook-conf.yaml")
auth_webhook_exe = os.path.join(auth_webhook_root, "auth-webhook.py")
auth_webhook_svc_name = "cdk.master.auth-webhook"
auth_webhook_svc = "/etc/systemd/system/{}.service".format(auth_webhook_svc_name)

# Go-style cipher suite names for the "intermediate" compatibility level.
tls_ciphers_intermediate = [
    # https://wiki.mozilla.org/Security/Server_Side_TLS
    # https://ssl-config.mozilla.org/#server=go&config=intermediate
    "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256",
    "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256",
    "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384",
    "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384",
    "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305",
    "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305",
]

# Cloud endpoint transitions (in either direction) mark the relevant
# "<cloud>.changed" flag so dependent handlers re-run.
register_trigger(
    when="endpoint.aws.ready",
    set_flag="kubernetes-control-plane.aws.changed",  # when set
)
register_trigger(
    when_not="endpoint.aws.ready",  # when cleared
    set_flag="kubernetes-control-plane.aws.changed",
)
register_trigger(
    when="endpoint.azure.ready",
    set_flag="kubernetes-control-plane.azure.changed",  # when set
)
register_trigger(
    when_not="endpoint.azure.ready",  # when cleared
    set_flag="kubernetes-control-plane.azure.changed",
)
register_trigger(
    when="endpoint.gcp.ready",
    set_flag="kubernetes-control-plane.gcp.changed",  # when set
)
register_trigger(
    when_not="endpoint.gcp.ready",  # when cleared
    set_flag="kubernetes-control-plane.gcp.changed",
)
# Keystone coming or going requires cdk-addons to be reconfigured.
register_trigger(
    when="keystone-credentials.available", set_flag="cdk-addons.reconfigure"
)
register_trigger(
    when_not="keystone-credentials.available", set_flag="cdk-addons.reconfigure"
)
# Any cloud ".changed" flag forces a cdk-addons reconfigure.
register_trigger(
    when="kubernetes-control-plane.aws.changed", set_flag="cdk-addons.reconfigure"
)
register_trigger(
    when="kubernetes-control-plane.azure.changed", set_flag="cdk-addons.reconfigure"
)
register_trigger(
    when="kubernetes-control-plane.gcp.changed", set_flag="cdk-addons.reconfigure"
)
register_trigger(
    when="kubernetes-control-plane.openstack.changed", set_flag="cdk-addons.reconfigure"
)
# Losing CNI invalidates the started state of the control plane components.
register_trigger(
    when_not="cni.available", clear_flag="kubernetes-control-plane.components.started"
)
# New auth requests from workers re-run authentication setup.
register_trigger(
    when="kube-control.requests.changed", clear_flag="authentication.setup"
)
# The apiserver cannot be "running" if it is no longer "configured".
register_trigger(
    when_not="kubernetes-control-plane.apiserver.configured",
    clear_flag="kubernetes-control-plane.apiserver.running",
)
# A registry change must be re-applied to the kubelet and re-sent to workers.
register_trigger(
    when="config.changed.image-registry",
    clear_flag="kubernetes-control-plane.kubelet.configured",
)
register_trigger(
    when="config.changed.image-registry",
    clear_flag="kubernetes-control-plane.sent-registry",
)
register_trigger(
    when="config.changed.default-cni",
    clear_flag="kubernetes-control-plane.default-cni.configured",
)
# Ceph relation going away invalidates pool/permission state; availability
# transitions in either direction force an apiserver reconfigure.
register_trigger(
    when_not="ceph-client.connected",
    clear_flag="kubernetes-control-plane.ceph.pools.created",
)
register_trigger(
    when_not="ceph-client.connected",
    clear_flag="kubernetes-control-plane.ceph.permissions.requested",
)
register_trigger(
    when="ceph-client.available",
    clear_flag="kubernetes-control-plane.apiserver.configured",
)
register_trigger(
    when_not="ceph-client.available",
    clear_flag="kubernetes-control-plane.apiserver.configured",
)


def set_upgrade_needed(forced=False):
    """Flag that a snap upgrade is needed.

    The upgrade is additionally marked "specified" (i.e. allowed to proceed
    without operator action) on first install, when require-manual-upgrade
    is disabled, or when ``forced`` is True.
    """
    set_state("kubernetes-control-plane.upgrade-needed")
    config = hookenv.config()
    previous_channel = config.previous("channel")
    require_manual = config.get("require-manual-upgrade")
    hookenv.log("set upgrade needed")
    if previous_channel is None or not require_manual or forced:
        hookenv.log("forcing upgrade")
        set_state("kubernetes-control-plane.upgrade-specified")


@when("config.changed.channel")
def channel_changed():
    # A channel change always implies the snaps may need refreshing.
    set_upgrade_needed()
def maybe_install_kubelet():
    """Install the kubelet snap if it is not already present."""
    if not snap.is_installed("kubelet"):
        channel = hookenv.config("channel")
        hookenv.status_set("maintenance", "Installing kubelet snap")
        snap.install("kubelet", channel=channel, classic=True)
        # Refresh checksums so the new snap isn't flagged as an upgrade.
        calculate_and_store_resource_checksums(checksum_prefix, snap_resources)


def maybe_install_kube_proxy():
    """Install the kube-proxy snap if it is not already present."""
    if not snap.is_installed("kube-proxy"):
        channel = hookenv.config("channel")
        hookenv.status_set("maintenance", "Installing kube-proxy snap")
        snap.install("kube-proxy", channel=channel, classic=True)
        calculate_and_store_resource_checksums(checksum_prefix, snap_resources)


@hook("install")
def fresh_install():
    # fresh installs should always send the unique cluster tag to cdk-addons
    set_state("kubernetes-control-plane.cdk-addons.unique-cluster-tag")


@hook("upgrade-charm")
def check_for_upgrade_needed():
    """An upgrade charm event was triggered by Juju, react to that here.

    Order matters here: flag renames happen first, then auth-file migration,
    then resource checksum checks, then leader defaults and state resets.
    """
    hookenv.status_set("maintenance", "Checking resources")
    is_leader = is_state("leadership.is_leader")

    # migrate to inclusive flags
    old, new = "kubernetes-master", "kubernetes-control-plane"  # wokeignore:rule=master
    for flag in get_flags():
        if flag.startswith(old):
            new_flag = flag.replace(old, new, 1)
            clear_flag(flag)
            set_flag(new_flag)

    # migrate to new flags
    if is_state("kubernetes-control-plane.restarted-for-cloud"):
        remove_state("kubernetes-control-plane.restarted-for-cloud")
        set_state("kubernetes-control-plane.cloud.ready")
    if is_state("kubernetes-control-plane.cloud-request-sent"):
        # minor change, just for consistency
        remove_state("kubernetes-control-plane.cloud-request-sent")
        set_state("kubernetes-control-plane.cloud.request-sent")
    if is_flag_set("kubernetes-control-plane.snaps.installed"):
        # consistent with layer-kubernetes-node-base
        remove_state("kubernetes-control-plane.snaps.installed")
        set_state("kubernetes-node.snaps.installed")

    # ceph-storage.configured flag no longer exists
    remove_state("ceph-storage.configured")
    # kubernetes-control-plane.ceph.configured flag no longer exists
    remove_state("kubernetes-control-plane.ceph.configured")

    maybe_install_kubelet()
    maybe_install_kube_proxy()
    update_certificates()
    switch_auth_mode(forced=True)

    # File-based auth is gone in 1.19; ensure any entries in basic_auth.csv are
    # added to known_tokens.csv, and any known_tokens entries are created as
    # secrets.
    if not is_flag_set("kubernetes-control-plane.basic-auth.migrated"):
        if kubernetes_control_plane.migrate_auth_file(
            kubernetes_control_plane.AUTH_BASIC_FILE
        ):
            set_flag("kubernetes-control-plane.basic-auth.migrated")
        else:
            hookenv.log(
                "Unable to migrate {} to {}".format(
                    kubernetes_control_plane.AUTH_BASIC_FILE,
                    kubernetes_control_plane.AUTH_TOKENS_FILE,
                )
            )
    if not is_flag_set("kubernetes-control-plane.token-auth.migrated"):
        register_auth_webhook()
        add_rbac_roles()
        if kubernetes_control_plane.migrate_auth_file(
            kubernetes_control_plane.AUTH_TOKENS_FILE
        ):
            set_flag("kubernetes-control-plane.token-auth.migrated")
        else:
            hookenv.log(
                "Unable to migrate {} to Kubernetes secrets".format(
                    kubernetes_control_plane.AUTH_TOKENS_FILE
                )
            )
        set_state("reconfigure.authentication.setup")
        remove_state("authentication.setup")

    if not db.get("snap.resources.fingerprint.initialised"):
        # We are here on an upgrade from non-rolling control plane
        # Since this upgrade might also include resource updates eg
        # juju upgrade-charm kubernetes-control-plane --resource kube-any=my.snap
        # we take no risk and forcibly upgrade the snaps.
        # Forcibly means we do not prompt the user to call the upgrade action.
        set_upgrade_needed(forced=True)

    migrate_resource_checksums(checksum_prefix, snap_resources)
    if check_resources_for_upgrade_needed(checksum_prefix, snap_resources):
        set_upgrade_needed()

    # Set the auto storage backend to etcd2.
    auto_storage_backend = leader_get("auto_storage_backend")
    if not auto_storage_backend and is_leader:
        leader_set(auto_storage_backend="etcd2")

    # Pin the DNS provider the cluster was previously using so a charm
    # upgrade does not silently switch providers.
    if is_leader and not leader_get("auto_dns_provider"):
        was_kube_dns = hookenv.config().previous("enable-kube-dns")
        if was_kube_dns is True:
            leader_set(auto_dns_provider="kube-dns")
        elif was_kube_dns is False:
            leader_set(auto_dns_provider="none")

    if is_flag_set("nrpe-external-master.available"):
        update_nrpe_config()

    # Force re-runs of handlers whose outputs may change across charm versions.
    remove_state("kubernetes-control-plane.system-monitoring-rbac-role.applied")
    remove_state("kubernetes-control-plane.kubelet.configured")
    remove_state("kubernetes-control-plane.default-cni.configured")
    remove_state("kubernetes-control-plane.sent-registry")
    remove_state("kubernetes-control-plane.ceph.permissions.requested")

    # Remove services from hacluster and leave to systemd while
    # hacluster is not ready to accept order and colocation constraints
    if is_flag_set("ha.connected"):
        hacluster = endpoint_from_flag("ha.connected")
        for service in control_plane_services:
            daemon = "snap.{}.daemon".format(service)
            hacluster.remove_systemd_service(service, daemon)


@hook("pre-series-upgrade")
def pre_series_upgrade():
    """Stop the kubernetes control plane services"""
    for service in control_plane_services:
        service_pause("snap.%s.daemon" % service)


@hook("post-series-upgrade")
def post_series_upgrade():
    """Resume control plane services after an OS series upgrade."""
    for service in control_plane_services:
        service_resume("snap.%s.daemon" % service)
    # set ourselves up to restart
    remove_state("kubernetes-control-plane.components.started")


@hook("leader-elected")
def leader_elected():
    # A new leader must redo authentication setup from its own point of view.
    clear_flag("authentication.setup")
def add_rbac_roles():
    """Update the known_tokens file with proper groups.

    Rewrites /root/cdk/known_tokens.csv in place (via a .backup copy),
    mapping legacy user names to their RBAC identities.  Note the
    side effects of the rewrite: comment lines are dropped, and legacy
    "kubelet,kubelet" rows are removed entirely.

    DEPRECATED: Once known_tokens are migrated, group data will be stored
    in K8s secrets. Do not use this function after migrating to authn
    with secrets.
    """
    if is_flag_set("kubernetes-control-plane.token-auth.migrated"):
        hookenv.log("Known tokens have migrated to secrets. Skipping group changes")
        return
    tokens_fname = "/root/cdk/known_tokens.csv"
    tokens_backup_fname = "/root/cdk/known_tokens.csv.backup"
    move(tokens_fname, tokens_backup_fname)
    with open(tokens_fname, "w") as ftokens:
        with open(tokens_backup_fname, "r") as stream:
            for line in stream:
                if line.startswith("#"):
                    continue
                record = line.strip().split(",")
                try:
                    # valid line looks like: token,username,user,groups
                    if record[2] == "admin" and len(record) == 3:
                        # Admin rows without a group get system:masters.
                        towrite = '{0},{1},{2},"{3}"\n'.format(
                            record[0], record[1], record[2], "system:masters"
                        )
                        ftokens.write(towrite)
                        continue
                    if record[2] == "kube_proxy":
                        towrite = "{0},{1},{2}\n".format(
                            record[0], "system:kube-proxy", "kube-proxy"
                        )
                        ftokens.write(towrite)
                        continue
                    if record[2] == "kube_controller_manager":
                        towrite = "{0},{1},{2}\n".format(
                            record[0],
                            "system:kube-controller-manager",
                            "kube-controller-manager",
                        )
                        ftokens.write(towrite)
                        continue
                    if record[2] == "kubelet" and record[1] == "kubelet":
                        # Drop the legacy shared-kubelet entry.
                        continue
                except IndexError:
                    msg = "Skipping invalid line from {}: {}".format(
                        tokens_backup_fname, line
                    )
                    hookenv.log(msg, level=hookenv.DEBUG)
                    continue
                else:
                    # No branch matched (each match continues above):
                    # keep the row unchanged.
                    ftokens.write("{}".format(line))


@when("kubernetes-control-plane.upgrade-specified")
def do_upgrade():
    """Perform the operator-approved (or forced) snap upgrade."""
    install_snaps()
    remove_state("kubernetes-control-plane.upgrade-needed")
    remove_state("kubernetes-control-plane.upgrade-specified")
def install_snaps():
    """Install or refresh every control-plane snap from the configured channel."""
    channel = hookenv.config("channel")
    hookenv.status_set("maintenance", "Installing core snap")
    snap.install("core")
    # (snap name, needs classic confinement) in install order.
    snap_plan = [
        ("kubectl", True),
        ("kube-apiserver", False),
        ("kube-controller-manager", False),
        ("kube-scheduler", False),
        ("cdk-addons", False),
        ("kubelet", True),
        ("kube-proxy", True),
    ]
    for snap_name, classic in snap_plan:
        hookenv.status_set("maintenance", "Installing {} snap".format(snap_name))
        if classic:
            snap.install(snap_name, channel=channel, classic=True)
        else:
            snap.install(snap_name, channel=channel)
    # Record resource checksums so later upgrade checks can detect changes.
    calculate_and_store_resource_checksums(checksum_prefix, snap_resources)
    db.set("snap.resources.fingerprint.initialised", True)
    set_state("kubernetes-node.snaps.installed")
    remove_state("kubernetes-control-plane.components.started")


@when("kubernetes-node.snaps.installed", "leadership.is_leader")
@when_not("leadership.set.cohort_keys")
def create_or_update_cohort_keys():
    """Create one snap cohort key per snap and publish them via leadership."""
    keys_by_snap = {}
    for snap_name in cohort_snaps:
        try:
            keys_by_snap[snap_name] = snap.create_cohort_snapshot(snap_name)
        except CalledProcessError:
            # Snap store outages prevent keys from being created; log it
            # and retry later. LP:1956608
            hookenv.log(
                "Failed to create cohort for {}; will retry".format(snap_name),
                level=hookenv.INFO,
            )
            return
    leader_set(cohort_keys=json.dumps(keys_by_snap))
    hookenv.log("Snap cohort keys have been created.", level=hookenv.INFO)

    # Prime revision info so we can detect changes later
    current_revs = kubernetes_control_plane.get_snap_revs(cohort_snaps)
    data_changed("leader-cohort-revs", current_revs)
    hookenv.log(
        "Tracking cohort revisions: {}".format(current_revs), level=hookenv.DEBUG
    )


@when(
    "kubernetes-node.snaps.installed",
    "leadership.is_leader",
    "leadership.set.cohort_keys",
)
def check_cohort_updates():
    """Invalidate published cohort keys when snap revisions have changed."""
    current_revs = kubernetes_control_plane.get_snap_revs(cohort_snaps)
    if not current_revs:
        return
    if data_changed("leader-cohort-revs", current_revs):
        leader_set(cohort_keys=None)
        hookenv.log("Snap cohort revisions have changed.", level=hookenv.INFO)
@when("kubernetes-node.snaps.installed", "leadership.set.cohort_keys")
@when_none("coordinator.granted.cohort", "coordinator.requested.cohort")
def safely_join_cohort():
    """Coordinate the rollout of snap refreshes.

    When cohort keys change, grab a lock so that only 1 unit in the
    application joins the new cohort at a time. This allows us to roll
    out snap refreshes without risking all units going down at once.
    """
    cohort_keys = leader_get("cohort_keys")
    # NB: initial data-changed is always true
    if data_changed("leader-cohorts", cohort_keys):
        # New keys: forget prior join/sent state and queue for the lock.
        clear_flag("kubernetes-control-plane.cohorts.joined")
        clear_flag("kubernetes-control-plane.cohorts.sent")
        charms.coordinator.acquire("cohort")


@when(
    "kubernetes-node.snaps.installed",
    "leadership.set.cohort_keys",
    "coordinator.granted.cohort",
)
@when_not("kubernetes-control-plane.cohorts.joined")
def join_or_update_cohorts():
    """Join or update a cohort snapshot.

    All units of this application (leader and followers) need to refresh
    their installed snaps to the current cohort snapshot.
    """
    cohort_keys = json.loads(leader_get("cohort_keys"))
    for snapname in cohort_snaps:
        cohort_key = cohort_keys[snapname]
        if snap.is_installed(snapname):
            # we also manage workers' cohorts
            hookenv.status_set("maintenance", "Joining snap cohort.")
            snap.join_cohort_snapshot(snapname, cohort_key)
    set_flag("kubernetes-control-plane.cohorts.joined")
    hookenv.log("{} has joined the snap cohort".format(hookenv.local_unit()))
@when(
    "kubernetes-node.snaps.installed",
    "leadership.set.cohort_keys",
    "kubernetes-control-plane.cohorts.joined",
    "kube-control.connected",
)
@when_not("kubernetes-control-plane.cohorts.sent")
def send_cohorts():
    """Send cohort information to workers.

    If we have peers, wait until all peers are updated before sending.
    Otherwise, we're a single unit k8s-cp and can fire when connected.
    """
    cohort_keys = json.loads(leader_get("cohort_keys"))
    kube_control = endpoint_from_flag("kube-control.connected")
    kube_cps = endpoint_from_flag("kube-masters.connected")  # wokeignore:rule=master

    # If we have peers, tell them we've joined the cohort. This is needed so
    # we don't tell workers about cohorts until all control planes are in-sync.
    goal_peers = len(list(hookenv.expected_peer_units()))
    if goal_peers > 0:
        if kube_cps:
            # tell peers about the cohort keys
            kube_cps.set_cohort_keys(cohort_keys)
        else:
            msg = "Waiting for {} peers before setting the cohort.".format(goal_peers)
            hookenv.log(msg, level=hookenv.DEBUG)
            return
        if is_flag_set("kube-masters.cohorts.ready"):
            # tell workers about the cohort keys
            kube_control.set_cohort_keys(cohort_keys)
            hookenv.log(
                "{} (peer) sent cohort keys to workers".format(hookenv.local_unit())
            )
        else:
            msg = "Waiting for k8s-cps to agree on cohorts."
            hookenv.log(msg, level=hookenv.DEBUG)
            return
    else:
        # tell workers about the cohort keys
        kube_control.set_cohort_keys(cohort_keys)
        hookenv.log(
            "{} (single) sent cohort keys to workers".format(hookenv.local_unit())
        )
    set_flag("kubernetes-control-plane.cohorts.sent")


@when("etcd.available")
@when("config.changed.enable-metrics")
def enable_metric_changed():
    """
    Trigger an api server update.

    :return: None
    """
    clear_flag("kubernetes-control-plane.apiserver.configured")
    if is_state("leadership.is_leader"):
        configure_cdk_addons()


@when("config.changed.client_password", "leadership.is_leader")
def password_changed():
    """Handle password change by reconfiguring authentication."""
    remove_state("authentication.setup")


@when("config.changed.storage-backend")
def storage_backend_changed():
    # A different storage backend requires restarting the components.
    remove_state("kubernetes-control-plane.components.started")
""" basic_auth = "/root/cdk/basic_auth.csv" known_tokens = "/root/cdk/known_tokens.csv" service_key = "/root/cdk/serviceaccount.key" os.makedirs("/root/cdk", exist_ok=True) hookenv.status_set("maintenance", "Rendering authentication templates.") keys = [basic_auth, known_tokens, service_key] # Try first to fetch data from an old leadership broadcast. if not get_keys_from_leader(keys) or is_state("reconfigure.authentication.setup"): kubernetes_control_plane.deprecate_auth_file(basic_auth) set_flag("kubernetes-control-plane.basic-auth.migrated") kubernetes_control_plane.deprecate_auth_file(known_tokens) set_flag("kubernetes-control-plane.token-auth.migrated") # Generate the default service account token key if not os.path.isfile(service_key): cmd = ["openssl", "genrsa", "-out", service_key, "2048"] check_call(cmd) remove_state("reconfigure.authentication.setup") # Write the admin token every time we setup authn to ensure we honor a # configured password. client_pass = hookenv.config("client_password") or get_token("admin") setup_tokens(client_pass, "admin", "admin", "system:masters") create_tokens_and_sign_auth_requests() # send auth files to followers via leadership data leader_data = {} for f in [basic_auth, known_tokens, service_key]: try: with open(f, "r") as fp: leader_data[f] = fp.read() except FileNotFoundError: pass # this is slightly opaque, but we are sending file contents under its file # path as a key. # eg: # {'/root/cdk/serviceaccount.key': 'RSA:2471731...'} leader_set(leader_data) remove_state("kubernetes-control-plane.components.started") remove_state("kube-control.requests.changed") set_state("authentication.setup") @when_not("leadership.is_leader") def setup_non_leader_authentication(): basic_auth = "/root/cdk/basic_auth.csv" known_tokens = "/root/cdk/known_tokens.csv" service_key = "/root/cdk/serviceaccount.key" # Starting with 1.19, we don't use csv auth files; handle changing secrets. 
@when_not("leadership.is_leader")
def setup_non_leader_authentication():
    """Sync authentication material from the leader on follower units."""
    basic_auth = "/root/cdk/basic_auth.csv"
    known_tokens = "/root/cdk/known_tokens.csv"
    service_key = "/root/cdk/serviceaccount.key"

    # Starting with 1.19, we don't use csv auth files; handle changing secrets.
    secrets = {
        "admin": get_token("admin"),
        "kube-controller-manager": get_token("system:kube-controller-manager"),
        "kube-proxy": get_token("system:kube-proxy"),
        "kube-scheduler": get_token("system:kube-scheduler"),
    }
    if data_changed("secrets-data", secrets):
        # Tokens changed: rebuild kubeconfigs and restart components.
        set_flag("kubernetes-control-plane.token-auth.migrated")
        build_kubeconfig()
        remove_state("kubernetes-control-plane.components.started")

    keys = [basic_auth, known_tokens, service_key]
    # Pre-secrets, the source of truth for non-leaders is the leader.
    # Therefore we overwrite_local with whatever the leader has.
    if not get_keys_from_leader(keys, overwrite_local=True):
        # the keys were not retrieved. Non-leaders have to retry.
        return

    if any_file_changed(keys):
        remove_state("kubernetes-control-plane.components.started")

    # Clear stale creds from the kube-control relation so that the leader can
    # assume full control of them.
    kube_control = endpoint_from_flag("kube-control.connected")
    if kube_control:
        kube_control.clear_creds()

    remove_state("kube-control.requests.changed")
    set_state("authentication.setup")
""" # This races with other codepaths, and seems to require being created first # This block may be extracted later, but for now seems to work as intended os.makedirs("/root/cdk", exist_ok=True) for k in keys: # If the path does not exist, assume we need it if not os.path.exists(k) or overwrite_local: # Fetch data from leadership broadcast contents = leader_get(k) # Default to logging the warning and wait for leader data to be set if contents is None: hookenv.log("Missing content for file {}".format(k)) return False # Write out the file and move on to the next item with open(k, "w+") as fp: fp.write(contents) fp.write("\n") return True @when("kubernetes-node.snaps.installed") def set_app_version(): """Declare the application version to juju""" version = check_output(["kube-apiserver", "--version"]) hookenv.application_version_set(version.split(b" v")[-1].rstrip()) @hookenv.atstart def check_vault_pending(): try: goal_state = hookenv.goal_state() except NotImplementedError: goal_state = {} vault_kv_goal = "vault-kv" in goal_state.get("relations", {}) vault_kv_connected = is_state("vault-kv.connected") vault_kv_related = vault_kv_goal or vault_kv_connected vault_kv_ready = is_state("layer.vault-kv.ready") if vault_kv_related and not vault_kv_ready: set_flag("kubernetes-control-plane.vault-kv.pending") else: clear_flag("kubernetes-control-plane.vault-kv.pending") @hookenv.atexit def set_final_status(): """Set the final status of the charm as we leave hook execution""" try: goal_state = hookenv.goal_state() except NotImplementedError: goal_state = {} if is_flag_set("upgrade.series.in-progress"): hookenv.status_set("blocked", "Series upgrade in progress") return if not is_flag_set("certificates.available"): if "certificates" in goal_state.get("relations", {}): hookenv.status_set("waiting", "Waiting for certificates authority.") else: hookenv.status_set("blocked", "Missing relation to certificate authority.") return if 
is_flag_set("kubernetes-control-plane.secure-storage.failed"): hookenv.status_set( "blocked", "Failed to configure encryption; " "secrets are unencrypted or inaccessible", ) return elif is_flag_set("kubernetes-control-plane.secure-storage.created"): if not encryption_config_path().exists(): hookenv.status_set( "blocked", "VaultLocker containing encryption config unavailable" ) return vsphere_joined = is_state("endpoint.vsphere.joined") azure_joined = is_state("endpoint.azure.joined") cloud_blocked = is_state("kubernetes-control-plane.cloud.blocked") if vsphere_joined and cloud_blocked: hookenv.status_set( "blocked", "vSphere integration requires K8s 1.12 or greater" ) return if azure_joined and cloud_blocked: hookenv.status_set("blocked", "Azure integration requires K8s 1.11 or greater") return if not is_flag_set("kubernetes.cni-plugins.installed"): hookenv.status_set("blocked", "Missing CNI resource") return if is_state("kubernetes-control-plane.cloud.pending"): hookenv.status_set("waiting", "Waiting for cloud integration") return if "kube-api-endpoint" in goal_state.get("relations", {}): if not is_state("kube-api-endpoint.available"): hookenv.status_set("waiting", "Waiting for kube-api-endpoint relation") return for lb_endpoint in ("loadbalancer-internal", "loadbalancer-external"): if lb_endpoint in goal_state.get("relations", {}): lb_provider = endpoint_from_name(lb_endpoint) if not lb_provider.has_response: hookenv.status_set("waiting", "Waiting for " + lb_endpoint) return if not is_state("kube-control.connected"): if "kube-control" in goal_state.get("relations", {}): status = "waiting" else: status = "blocked" hookenv.status_set(status, "Waiting for workers.") return ks = endpoint_from_flag("keystone-credentials.available") if ks and ks.api_version() == "2": msg = "Keystone auth v2 detected. v3 is required." 
hookenv.status_set("blocked", msg) return upgrade_needed = is_state("kubernetes-control-plane.upgrade-needed") upgrade_specified = is_state("kubernetes-control-plane.upgrade-specified") if upgrade_needed and not upgrade_specified: msg = "Needs manual upgrade, run the upgrade action" hookenv.status_set("blocked", msg) return try: get_dns_provider() except InvalidDnsProvider as e: if e.value == "core-dns": msg = "dns-provider=core-dns requires k8s 1.14+" else: msg = "dns-provider=%s is invalid" % e.value hookenv.status_set("blocked", msg) return if is_state("kubernetes-control-plane.vault-kv.pending"): hookenv.status_set( "waiting", "Waiting for encryption info from Vault to secure secrets" ) return if is_state("kubernetes-control-plane.had-service-cidr-expanded"): hookenv.status_set( "waiting", "Waiting to retry updates for service-cidr expansion" ) return if not is_state("etcd.available"): if "etcd" in goal_state.get("relations", {}): status = "waiting" else: status = "blocked" hookenv.status_set(status, "Waiting for etcd") return if not is_state("cni.available"): if "cni" in goal_state.get("relations", {}): status = "waiting" else: status = "blocked" hookenv.status_set(status, "Waiting for CNI plugins to become available") return if not is_state("tls_client.certs.saved"): hookenv.status_set("waiting", "Waiting for certificates") return if not is_flag_set("kubernetes-control-plane.auth-webhook-service.started"): hookenv.status_set("waiting", "Waiting for auth-webhook service to start") return if not is_flag_set("kubernetes-control-plane.apiserver.configured"): hookenv.status_set("waiting", "Waiting for API server to be configured") return if not is_flag_set("kubernetes-control-plane.apiserver.running"): hookenv.status_set("waiting", "Waiting for API server to start") return authentication_setup = is_state("authentication.setup") if not authentication_setup: hookenv.status_set("waiting", "Waiting for crypto keys.") return if not 
is_flag_set("kubernetes-control-plane.auth-webhook-tokens.setup"): hookenv.status_set("waiting", "Waiting for auth-webhook tokens") return if is_state("kubernetes-control-plane.components.started"): # All services should be up and running at this point. Double-check... failing_services = control_plane_services_down() if len(failing_services) != 0: msg = "Stopped services: {}".format(",".join(failing_services)) hookenv.status_set("blocked", msg) if is_flag_set("ha.connected"): hookenv.log("Disabling node to pass resources to other nodes") cmd = "crm -w -F node standby" call(cmd.split()) for service in failing_services: heal_handler = HEAL_HANDLER[service] for flag in heal_handler["clear_flags"]: clear_flag(flag) heal_handler["run"]() set_flag("kubernetes-control-plane.components.failed") return else: if is_flag_set("kubernetes-control-plane.components.failed"): if is_flag_set("ha.connected"): hookenv.log("Enabling node again to receive resources") cmd = "crm -w -F node online" call(cmd.split()) clear_flag("kubernetes-control-plane.components.failed") else: # if we don't have components starting, we're waiting for that and # shouldn't fall through to Kubernetes control plane running. hookenv.status_set( "maintenance", "Waiting for control plane components to start" ) return # Note that after this point, kubernetes-control-plane.components.started is # always True. 
is_leader = is_state("leadership.is_leader") addons_configured = is_state("cdk-addons.configured") if is_leader and not addons_configured: hookenv.status_set("waiting", "Waiting to retry addon deployment") return if is_leader and not is_state( "kubernetes-control-plane.system-monitoring-rbac-role.applied" ): msg = "Waiting to retry applying system:monitoring RBAC role" hookenv.status_set("waiting", msg) return try: unready = get_kube_system_pods_not_running() except FailedToGetPodStatus: hookenv.status_set("waiting", "Waiting for kube-system pods to start") return if unready: plural = "s" if len(unready) > 1 else "" msg = "Waiting for {} kube-system pod{} to start" msg = msg.format(len(unready), plural) hookenv.status_set("waiting", msg) return service_cidr = kubernetes_control_plane.service_cidr() if hookenv.config("service-cidr") != service_cidr: msg = "WARN: cannot change service-cidr, still using " + service_cidr hookenv.status_set("active", msg) return gpu_available = is_state("kube-control.gpu.available") gpu_enabled = is_state("kubernetes-control-plane.gpu.enabled") if gpu_available and not gpu_enabled: msg = 'GPUs available. Set allow-privileged="auto" to enable.' 
hookenv.status_set("active", msg) return if is_flag_set("ceph-storage.available"): hookenv.status_set( "blocked", "ceph-storage relation deprecated, use ceph-client instead" ) return if is_flag_set("ceph-client.connected") and not is_flag_set( "ceph-client.available" ): hookenv.status_set("waiting", "Waiting for Ceph to provide a key.") return if ( is_leader and ks and is_flag_set("kubernetes-control-plane.keystone-policy-error") ): hookenv.status_set("blocked", "Invalid keystone policy file.") return if ( is_leader and ks and not is_flag_set("kubernetes-control-plane.keystone-policy-handled") ): hookenv.status_set("waiting", "Waiting to apply keystone policy file.") return if hookenv.config("enable-metrics") and not hookenv.config( "api-aggregation-extension" ): hookenv.status_set( "blocked", "metrics service will be unreachable without api-aggregation-extension.", ) return hookenv.status_set("active", "Kubernetes control-plane running.") def control_plane_services_down(): """Ensure control plane services are up and running. 
    Return: list of failing services"""
    return list(
        filterfalse(kubernetes_control_plane.check_service, control_plane_services)
    )


def add_systemd_file_limit():
    """Raise the open-file limit for the kube-apiserver snap service.

    Writes a systemd drop-in (file-limit.conf) under the service's .d
    directory. NOTE(review): the drop-in is only written if the file does
    not already exist, so a changed limit would not be re-applied; also the
    file is written without a trailing newline — confirm systemd accepts it.
    """
    directory = "/etc/systemd/system/snap.kube-apiserver.daemon.service.d"
    if not os.path.isdir(directory):
        os.makedirs(directory)
    file_name = "file-limit.conf"
    path = os.path.join(directory, file_name)
    if not os.path.isfile(path):
        with open(path, "w") as f:
            f.write("[Service]\n")
            f.write("LimitNOFILE=65535")


def add_systemd_restart_always():
    """Install an always-restart systemd drop-in for each control-plane snap
    service, selecting an older template when systemd < 230 (xenial)."""
    template = "templates/service-always-restart.systemd-latest.conf"

    try:
        # Get the systemd version
        cmd = ["systemd", "--version"]
        output = check_output(cmd).decode("UTF-8")
        line = output.splitlines()[0]
        words = line.split()
        assert words[0] == "systemd"
        systemd_version = int(words[1])

        # Check for old version (for xenial support)
        if systemd_version < 230:
            template = "templates/service-always-restart.systemd-229.conf"
    except Exception:
        # Best-effort: fall back to the latest template rather than failing.
        traceback.print_exc()
        hookenv.log(
            "Failed to detect systemd version, using latest template", level="ERROR"
        )

    for service in control_plane_services:
        dest_dir = "/etc/systemd/system/snap.{}.daemon.service.d".format(service)
        os.makedirs(dest_dir, exist_ok=True)
        copyfile(template, "{}/always-restart.conf".format(dest_dir))


def add_systemd_file_watcher():
    """Setup systemd file-watcher service.

    This service watches these files for changes:

        /root/cdk/known_tokens.csv
        /root/cdk/serviceaccount.key

    If a file is changed, the service uses juju-run to invoke a script in a
    hook context on this unit. If this unit is the leader, the script will
    call leader-set to distribute the contents of these files to the
    non-leaders so they can sync their local copies to match.
    """
    render(
        "cdk.master.leader.file-watcher.sh",
        "/usr/local/sbin/cdk.master.leader.file-watcher.sh",
        {},
        perms=0o777,
    )
    render(
        "cdk.master.leader.file-watcher.service",
        "/etc/systemd/system/cdk.master.leader.file-watcher.service",
        {"unit": hookenv.local_unit()},
        perms=0o644,
    )
    render(
        "cdk.master.leader.file-watcher.path",
        "/etc/systemd/system/cdk.master.leader.file-watcher.path",
        {},
        perms=0o644,
    )
    service_resume("cdk.master.leader.file-watcher.path")


@when("etcd.available", "tls_client.certs.saved")
@restart_on_change(
    {
        auth_webhook_conf: [auth_webhook_svc_name],
        auth_webhook_exe: [auth_webhook_svc_name],
        auth_webhook_svc: [auth_webhook_svc_name],
    }
)
def register_auth_webhook():
    """Render auth webhook templates and start the related service."""
    Path(auth_webhook_root).mkdir(exist_ok=True)

    # For 'api_ver', match the api version of the authentication.k8s.io TokenReview
    # that k8s-apiserver will be sending:
    # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18
    context = {
        "api_ver": "v1beta1",
        "charm_dir": hookenv.charm_dir(),
        "host": get_ingress_address(
            "kube-api-endpoint", ignore_addresses=[hookenv.config("ha-cluster-vip")]
        ),
        "pidfile": "{}.pid".format(auth_webhook_svc_name),
        "logfile": "{}.log".format(auth_webhook_svc_name),
        "port": 5000,
        "root_dir": auth_webhook_root,
    }

    # Optional upstream authenticators: AWS IAM, Keystone, and a
    # user-configured endpoint. Each is None unless its source is present.
    context["aws_iam_endpoint"] = None
    if endpoint_from_flag("endpoint.aws-iam.ready"):
        aws_webhook = Path(aws_iam_webhook)
        if aws_webhook.exists():
            aws_yaml = yaml.safe_load(aws_webhook.read_text())
            try:
                context["aws_iam_endpoint"] = aws_yaml["clusters"][0]["cluster"][
                    "server"
                ]
            except (KeyError, TypeError):
                # Malformed webhook config; log and continue without AWS IAM.
                hookenv.log(
                    "Unable to find server in AWS IAM webhook: {}".format(aws_yaml)
                )
                pass

    context["keystone_endpoint"] = None
    if endpoint_from_flag("keystone-credentials.available"):
        ks_webhook = Path(keystone_root) / "webhook.yaml"
        if ks_webhook.exists():
            ks_yaml = yaml.safe_load(ks_webhook.read_text())
            try:
                context["keystone_endpoint"] = ks_yaml["clusters"][0]["cluster"][
                    "server"
                ]
            except (KeyError, TypeError):
                # Malformed webhook config; log and continue without Keystone.
                hookenv.log(
                    "Unable to find server in Keystone webhook: {}".format(ks_yaml)
                )
                pass

    context["custom_authn_endpoint"] = None
    custom_authn = hookenv.config("authn-webhook-endpoint")
    if custom_authn:
        context["custom_authn_endpoint"] = custom_authn

    k8s_log_path = Path(kubernetes_logs)
    k8s_log_path.mkdir(parents=True, exist_ok=True)  # ensure log path exists
    render("cdk.master.auth-webhook-conf.yaml", auth_webhook_conf, context)
    render("cdk.master.auth-webhook.py", auth_webhook_exe, context)
    render(
        "cdk.master.auth-webhook.logrotate", "/etc/logrotate.d/auth-webhook", context
    )
    # Move existing log files from ${auth_webhook_root} to /var/log/kubernetes/
    for log_file in Path(auth_webhook_root).glob("auth-webhook.log*"):
        # all historical log files (.log, .log.1 and .log.3.tgz)
        new_log_file = k8s_log_path / ("cdk.master." + log_file.name)
        if not new_log_file.exists():
            move(str(log_file), str(new_log_file))

    # Set the number of gunicorn workers based on our core count. (2*cores)+1 is
    # recommended: https://docs.gunicorn.org/en/stable/design.html#how-many-workers
    try:
        cores = int(check_output(["nproc"]).decode("utf-8").strip())
    except CalledProcessError:
        # Our default architecture is 2-cores for k8s-cp units
        cores = 2
    else:
        # Put an upper bound on cores; more than 12ish workers is overkill
        cores = 6 if cores > 6 else cores
    context["num_workers"] = cores * 2 + 1
    render("cdk.master.auth-webhook.service", auth_webhook_svc, context)
    if any_file_changed([auth_webhook_svc]):
        # if the service file has changed (or is new),
        # we have to inform systemd about it
        check_call(["systemctl", "daemon-reload"])

    if not is_flag_set("kubernetes-control-plane.auth-webhook-service.started"):
        if service_resume(auth_webhook_svc_name):
            set_flag("kubernetes-control-plane.auth-webhook-service.started")
            # Force the apiserver to be reconfigured against the new webhook.
            clear_flag("kubernetes-control-plane.apiserver.configured")
        else:
            hookenv.status_set(
                "maintenance", "Waiting for {} to start.".format(auth_webhook_svc_name)
            )
            hookenv.log("{} failed to start; will retry".format(auth_webhook_svc_name))


@when(
    "kubernetes-control-plane.apiserver.running",
    "kubernetes-control-plane.auth-webhook-service.started",
    "authentication.setup",
)
@when_not("kubernetes-control-plane.auth-webhook-tokens.setup")
def setup_auth_webhook_tokens():
    """Reconfigure authentication to setup auth-webhook tokens.

    If authentication has been setup with a non-auth-webhook configuration,
    convert it to use auth-webhook tokens instead. Alternatively, if the
    auth-webhook setup failed, this will also ensure that it is retried.
    """
    # Even if the apiserver is configured, it may not be fully started. Only
    # proceed if we can get secrets.
    if not kubectl_success("get", "secrets"):
        hookenv.log("Secrets are not yet available; will retry")
        return
    if create_tokens_and_sign_auth_requests():
        # Force setup_leader_authentication to be re-run.
        remove_state("authentication.setup")


@when(
    "etcd.available",
    "tls_client.certs.saved",
    "authentication.setup",
    "leadership.set.auto_storage_backend",
    "leadership.set.cluster_tag",
    "cni.available",
)
@when_not(
    "kubernetes-control-plane.components.started",
    "kubernetes-control-plane.cloud.pending",
    "kubernetes-control-plane.cloud.blocked",
    "kubernetes-control-plane.vault-kv.pending",
    "tls_client.certs.changed",
    "tls_client.ca.written",
    "upgrade.series.in-progress",
)
def start_control_plane():
    """Run the Kubernetes control-plane components."""
    hookenv.status_set(
        "maintenance", "Configuring the Kubernetes control plane services."
    )

    # Without Vault-backed secure storage, write a plain identity-provider
    # EncryptionConfig so the apiserver has a config file to point at.
    if not is_state("kubernetes-control-plane.vault-kv.pending") and not is_state(
        "kubernetes-control-plane.secure-storage.created"
    ):
        encryption_config_path().parent.mkdir(parents=True, exist_ok=True)
        host.write_file(
            path=str(encryption_config_path()),
            perms=0o600,
            content=yaml.safe_dump(
                {
                    "kind": "EncryptionConfig",
                    "apiVersion": "v1",
                    "resources": [
                        {"resources": ["secrets"], "providers": [{"identity": {}}]}
                    ],
                }
            ),
        )

    kubernetes_control_plane.freeze_service_cidr()

    etcd = endpoint_from_flag("etcd.available")
    if not etcd.get_connection_string():
        # etcd is not returning a connection string. This happens when
        # the control-plane unit disconnects from etcd and is ready to terminate.
        # No point in trying to start control-plane services and fail. Just return.
        return

    # TODO: Make sure below relation is handled on change
    # https://github.com/kubernetes/kubernetes/issues/43461
    handle_etcd_relation(etcd)

    # Set up additional systemd services
    add_systemd_restart_always()
    add_systemd_file_limit()
    add_systemd_file_watcher()
    add_systemd_iptables_patch()
    check_call(["systemctl", "daemon-reload"])

    # Add CLI options to all components
    clear_flag("kubernetes-control-plane.apiserver.configured")
    configure_controller_manager()
    configure_scheduler()

    # kube-proxy
    cluster_cidr = kubernetes_common.cluster_cidr()
    if kubernetes_common.is_ipv6(cluster_cidr):
        kubernetes_common.enable_ipv6_forwarding()
    local_address = get_ingress_address("kube-api-endpoint")
    local_server = "https://{0}:{1}".format(local_address, 6443)
    configure_kube_proxy(configure_prefix, [local_server], cluster_cidr)
    service_restart("snap.kube-proxy.daemon")

    set_state("kubernetes-control-plane.components.started")
    hookenv.open_port(6443)


@when("config.changed.proxy-extra-args")
def proxy_args_changed():
    # Changed proxy args require the components to be reconfigured/restarted.
    clear_flag("kubernetes-control-plane.components.started")
    clear_flag("config.changed.proxy-extra-args")


@when("tls_client.certs.changed")
def certs_changed():
    # New certs: bounce the auth webhook and reconfigure the components.
    if service_running(auth_webhook_svc_name):
        service_restart(auth_webhook_svc_name)
    clear_flag("kubernetes-control-plane.components.started")
    clear_flag("tls_client.certs.changed")


@when("tls_client.ca.written")
def ca_written():
    """React to a freshly written CA by forcing reconfiguration and, on the
    leader, flagging that addons must be restarted for the new CA."""
    clear_flag("kubernetes-control-plane.components.started")
    if is_state("leadership.is_leader"):
        if leader_get("kubernetes-master-addons-ca-in-use"):
            leader_set({"kubernetes-master-addons-restart-for-ca": True})
    clear_flag("tls_client.ca.written")
    clear_flag("kubernetes-control-plane.kubelet.configured")


@when("etcd.available")
def etcd_data_change(etcd):
    """Etcd scale events block control-plane reconfiguration due to the
    kubernetes-control-plane.components.started state.

    We need a way to handle these events consistently only when the
    number of etcd units has actually changed"""

    # key off of the connection string
    connection_string = etcd.get_connection_string()

    # If the connection string changes, remove the started state to trigger
    # handling of the control-plane components
    if data_changed("etcd-connect", connection_string):
        remove_state("kubernetes-control-plane.components.started")

    # If the cert info changes, remove the started state to trigger
    # handling of the control-plane components
    if data_changed("etcd-certs", etcd.get_client_credentials()):
        clear_flag("kubernetes-control-plane.components.started")

    # We are the leader and the auto_storage_backend is not set meaning
    # this is the first time we connect to etcd.
    auto_storage_backend = leader_get("auto_storage_backend")
    is_leader = is_state("leadership.is_leader")
    if is_leader and not auto_storage_backend:
        if etcd.get_version().startswith("3."):
            leader_set(auto_storage_backend="etcd3")
        else:
            leader_set(auto_storage_backend="etcd2")


def get_dns_info():
    """Describe the cluster DNS provider.

    Returns a 4-tuple (dns_ready, dns_ip, dns_port, dns_domain); the last
    three are None when DNS is disabled or the provider is not ready yet.
    """
    dns_provider = endpoint_from_flag("dns-provider.available")
    try:
        goal_state_rels = hookenv.goal_state().get("relations", {})
    except NotImplementedError:
        # Older Juju without goal-state support.
        goal_state_rels = {}
    dns_provider_missing = not dns_provider and "dns-provider" not in goal_state_rels
    dns_provider_pending = not dns_provider and "dns-provider" in goal_state_rels
    try:
        dns_disabled_cfg = get_dns_provider() == "none"
    except InvalidDnsProvider:
        dns_disabled_cfg = False
    if dns_provider_missing and dns_disabled_cfg:
        # DNS deliberately disabled: ready, but no address/port/domain.
        return True, None, None, None
    elif dns_provider_pending:
        return False, None, None, None
    elif dns_provider:
        details = dns_provider.details()
        return True, details["sdn-ip"], details["port"], details["domain"]
    else:
        # No relation: fall back to the charm-managed DNS addon.
        try:
            dns_provider = get_dns_provider()
        except InvalidDnsProvider:
            hookenv.log(traceback.format_exc())
            return False, None, None, None
        dns_domain = hookenv.config("dns_domain")
        dns_ip = None
        try:
            dns_ip = kubernetes_control_plane.get_dns_ip()
        except CalledProcessError:
            hookenv.log("DNS addon service not ready yet")
            return False, None, None, None
        return True, dns_ip, 53, dns_domain


@when("kube-control.connected")
@when("cdk-addons.configured")
def send_cluster_dns_detail(kube_control):
    """Send cluster DNS info"""
    dns_ready, dns_ip, dns_port, dns_domain = get_dns_info()
    if dns_ready:
        kube_control.set_dns(dns_port, dns_domain, dns_ip, dns_ip is not None)


def create_tokens_and_sign_auth_requests():
    """Create tokens for CK users and services.

    Returns True when all required tokens exist and all worker auth
    requests were signed; False otherwise (callers retry later).
    """
    clear_flag("kubernetes-control-plane.auth-webhook-tokens.setup")

    # NB: This may be called before kube-apiserver is up when bootstrapping new
    # clusters with auth-webhook. In this case, setup_tokens will be a no-op.
    # We will re-enter this function once control plane services are available to
    # create proper secrets.
    controller_manager_token = get_token("system:kube-controller-manager")
    if not controller_manager_token:
        setup_tokens(None, "system:kube-controller-manager", "kube-controller-manager")

    proxy_token = get_token("system:kube-proxy")
    if not proxy_token:
        setup_tokens(None, "system:kube-proxy", "kube-proxy")
        proxy_token = get_token("system:kube-proxy")

    scheduler_token = get_token("system:kube-scheduler")
    if not scheduler_token:
        setup_tokens(None, "system:kube-scheduler", "system:kube-scheduler")

    client_token = get_token("admin")
    if not client_token:
        setup_tokens(None, "admin", "admin", "system:masters")
        client_token = get_token("admin")

    monitoring_token = get_token("system:monitoring")
    if not monitoring_token:
        setup_tokens(None, "system:monitoring", "system:monitoring")

    if not (proxy_token and client_token):
        # When bootstrapping a new cluster, we may not have all our secrets yet.
        # Do not let the kubelets start without all the needed tokens.
        hookenv.log(
            "Missing required tokens for kubelet startup; will retry", hookenv.WARNING
        )
        return False

    kube_control = endpoint_from_flag("kube-control.connected")
    requests = kube_control.auth_user() if kube_control else []
    any_failed = False
    for request in requests:
        username = request[1]["user"]
        group = request[1]["group"]
        if not username or not group:
            continue
        kubelet_token = get_token(username)
        if not kubelet_token:
            # Username is expected to be of the form system:node:<node-name>.
            # The user ID (request[0]) is the worker unit; while not used
            # today, we store it in case it becomes useful to map a secret
            # to a unit in the future.
            userid = request[0]
            setup_tokens(None, username, userid, group)
            kubelet_token = get_token(username)
        if not kubelet_token:
            hookenv.log(
                "Failed to create token for {}; will retry".format(username),
                hookenv.WARNING,
            )
            any_failed = True
            continue
        kube_control.sign_auth_request(
            request[0], username, kubelet_token, proxy_token, client_token
        )
    if not any_failed:
        set_flag("kubernetes-control-plane.auth-webhook-tokens.setup")
        return True
    else:
        return False


@when("kube-api-endpoint.available")
def push_service_data():
    """Send configuration to the load balancer, and close access to the
    public interface.
    """
    kube_api = endpoint_from_flag("kube-api-endpoint.available")
    endpoints = kubernetes_control_plane.get_endpoints_from_config()
    if endpoints:
        addresses = [e[0] for e in endpoints]
        kube_api.configure(
            kubernetes_control_plane.STANDARD_API_PORT, addresses, addresses
        )
    else:
        # no manually configured LBs, so rely on the interface layer
        # to use the ingress address for each relation
        kube_api.configure(kubernetes_control_plane.STANDARD_API_PORT)


@when("leadership.is_leader")
@when_any(
    "endpoint.loadbalancer-internal.available",
    "endpoint.loadbalancer-external.available",
)
def request_load_balancers():
    """Request LBs from the related provider(s)."""
    for lb_type in ("internal", "external"):
        lb_provider = endpoint_from_name("loadbalancer-" + lb_type)
        if not lb_provider.is_available:
            continue
        req = lb_provider.get_request("api-server-" + lb_type)
        req.protocol = req.protocols.tcp
        # External LBs listen on the external API port; both forward to the
        # standard (internal) API port on the units.
        ext_api_port = kubernetes_control_plane.EXTERNAL_API_PORT
        int_api_port = kubernetes_control_plane.STANDARD_API_PORT
        api_port = ext_api_port if lb_type == "external" else int_api_port
        req.port_mapping = {api_port: int_api_port}
        req.public = lb_type == "external"
        if not req.health_checks:
            req.add_health_check(
                protocol=req.protocols.http,
                port=int_api_port,
                path="/livez",
            )
        lb_provider.send_request(req)


@when("kube-control.connected")
def send_api_urls():
    """Publish internal API URLs over kube-control (newer interface only)."""
    kube_control = endpoint_from_name("kube-control")
    if not hasattr(kube_control, "set_api_endpoints"):
        # built with an old version of the kube-control interface
        # the old kube-api-endpoint relation must be used instead
        return
    endpoints = kubernetes_control_plane.get_internal_api_endpoints()
    if not endpoints:
        return
    kube_control.set_api_endpoints(kubernetes_control_plane.get_api_urls(endpoints))


def has_external_cloud_provider():
    # True when an external-cloud-provider relation exists.
    return bool(hookenv.relations().get("external-cloud-provider"))


@when("kube-control.connected")
def send_xcp_flag():
    """Tell workers whether an external cloud provider is in play."""
    has_xcp = has_external_cloud_provider()
    kube_control = endpoint_from_name("kube-control")
    kube_control.set_has_xcp(has_xcp)


@when("certificates.available", "cni.available")
def send_data():
    """Send the data that is required to create a server certificate for
    this server."""
    # Use the public ip of this unit as the Common Name for the certificate.
    common_name = hookenv.unit_public_ip()

    # Get the SDN gateways based on the service CIDRs.
    k8s_service_ips = kubernetes_control_plane.get_kubernetes_service_ips()

    cluster_cidr = kubernetes_common.cluster_cidr()
    bind_ips = kubernetes_common.get_bind_addrs(
        ipv4=kubernetes_common.is_ipv4(cluster_cidr),
        ipv6=kubernetes_common.is_ipv6(cluster_cidr),
    )

    # Get ingress address (this is probably already covered by bind_ips,
    # but list it explicitly as well just in case it's not).
    old_ingress_ip = get_ingress_address("kube-api-endpoint")
    new_ingress_ip = get_ingress_address("kube-control")

    local_endpoint = kubernetes_control_plane.get_local_api_endpoint()[0][0]

    domain = hookenv.config("dns_domain")
    # Create SANs that the tls layer will add to the server cert.
    sans = (
        [
            # The CN field is checked as a hostname, so if it's an IP, it
            # won't match unless also included in the SANs as an IP field.
            common_name,
            local_endpoint,
            old_ingress_ip,
            new_ingress_ip,
            socket.gethostname(),
            socket.getfqdn(),
            "kubernetes",
            "kubernetes.{0}".format(domain),
            "kubernetes.default",
            "kubernetes.default.svc",
            "kubernetes.default.svc.{0}".format(domain),
        ]
        + k8s_service_ips
        + bind_ips
    )
    sans.extend(e[0] for e in kubernetes_control_plane.get_internal_api_endpoints())
    sans.extend(e[0] for e in kubernetes_control_plane.get_external_api_endpoints())

    # maybe they have extra names they want as SANs
    extra_sans = hookenv.config("extra_sans")
    if extra_sans and not extra_sans == "":
        sans.extend(extra_sans.split())

    # Request a server cert with this information.
    tls_client.request_server_cert(
        common_name,
        sorted(set(sans)),
        crt_path=server_crt_path,
        key_path=server_key_path,
    )

    # Request a client cert for kubelet.
    tls_client.request_client_cert(
        "system:kube-apiserver", crt_path=client_crt_path, key_path=client_key_path
    )


@when(
    "config.changed.extra_sans", "certificates.available", "kube-api-endpoint.available"
)
def update_certificates():
    # NOTE: This handler may be called by another function. Two relationships
    # are required, otherwise the send_data function fails.
    # (until the relations are available)
    missing_relations = get_unset_flags(
        "certificates.available", "kube-api-endpoint.available"
    )
    if missing_relations:
        hookenv.log(
            "Missing relations: '{}'".format(", ".join(missing_relations)),
            hookenv.ERROR,
        )
        return

    # Using the config.changed.extra_sans flag to catch changes.
    # IP changes will take ~5 minutes or so to propagate, but
    # it will update.
    send_data()
    clear_flag("config.changed.extra_sans")


@when(
    "kubernetes-control-plane.components.started",
    "leadership.is_leader",
    "cdk-addons.reconfigure",
)
def reconfigure_cdk_addons():
    # Explicit reconfigure request: run the full addon configuration.
    configure_cdk_addons()


@when(
    "kubernetes-control-plane.components.started",
    "leadership.is_leader",
    "leadership.set.cluster_tag",
)
@when_not("upgrade.series.in-progress")
def configure_cdk_addons():
    """Configure CDK addons"""
    remove_state("cdk-addons.reconfigure")
    remove_state("cdk-addons.configured")
    remove_state("kubernetes-control-plane.aws.changed")
    remove_state("kubernetes-control-plane.azure.changed")
    remove_state("kubernetes-control-plane.gcp.changed")
    remove_state("kubernetes-control-plane.openstack.changed")
    load_gpu_plugin = hookenv.config("enable-nvidia-plugin").lower()
    gpuEnable = (
        get_version("kube-apiserver") >= (1, 9)
        and load_gpu_plugin == "auto"
        and is_state("kubernetes-control-plane.gpu.enabled")
    )
    registry = hookenv.config("image-registry")
    dbEnabled = str(hookenv.config("enable-dashboard-addons")).lower()
    try:
        dnsProvider = get_dns_provider()
    except InvalidDnsProvider:
        hookenv.log(traceback.format_exc())
        return
    metricsEnabled = str(hookenv.config("enable-metrics")).lower()
    default_storage = ""
    ceph = {}
    ceph_ep = endpoint_from_flag("ceph-client.available")
    cephfs_mounter = hookenv.config("cephfs-mounter")
    cephEnabled = "false"
    cephFsEnabled = "false"
    if ceph_ep and ceph_ep.key and ceph_ep.mon_hosts():
        kubernetes_control_plane.install_ceph_common()
        ceph_fsid = kubernetes_control_plane.get_ceph_fsid()
        if ceph_fsid:
            cephEnabled = "true"
            b64_ceph_key = base64.b64encode(ceph_ep.key.encode("utf-8"))
            ceph["admin_key"] = b64_ceph_key.decode("ascii")
            ceph["fsid"] = ceph_fsid
            ceph["kubernetes_key"] = b64_ceph_key.decode("ascii")
            ceph["mon_hosts"] = " ".join(ceph_ep.mon_hosts())
            default_storage = hookenv.config("default-storage")
            if kubernetes_control_plane.query_cephfs_enabled():
                cephFsEnabled = "true"
                ceph["fsname"] = kubernetes_control_plane.get_cephfs_fsname() or ""

    keystone = {}
    ks = endpoint_from_flag("keystone-credentials.available")
    if ks:
        keystoneEnabled = "true"
        keystone["cert"] = "/root/cdk/server.crt"
        keystone["key"] = "/root/cdk/server.key"
        keystone["url"] = "{}://{}:{}/v{}".format(
            ks.credentials_protocol(),
            ks.credentials_host(),
            ks.credentials_port(),
            ks.api_version(),
        )
        keystone["keystone-ca"] = hookenv.config("keystone-ssl-ca")
    else:
        keystoneEnabled = "false"

    enable_aws = str(is_flag_set("endpoint.aws.ready")).lower()
    enable_azure = str(is_flag_set("endpoint.azure.ready")).lower()
    enable_gcp = str(is_flag_set("endpoint.gcp.ready")).lower()
    enable_openstack = str(is_flag_set("endpoint.openstack.ready")).lower()
    openstack = endpoint_from_flag("endpoint.openstack.ready")

    if is_state("kubernetes-control-plane.cdk-addons.unique-cluster-tag"):
        cluster_tag = leader_get("cluster_tag")
    else:
        # allow for older upgraded charms to control when they start sending
        # the unique cluster tag to cdk-addons
        cluster_tag = "kubernetes"

    # All values are passed to the cdk-addons snap as key=value strings.
    args = [
        "kubeconfig=" + cdk_addons_kubectl_config_path,
        "arch=" + arch(),
        "dns-domain=" + hookenv.config("dns_domain"),
        "registry=" + registry,
        "enable-dashboard=" + dbEnabled,
        "enable-metrics=" + metricsEnabled,
        "enable-gpu=" + str(gpuEnable).lower(),
        "enable-ceph=" + cephEnabled,
        "enable-cephfs=" + cephFsEnabled,
        "cephfs-mounter=" + cephfs_mounter,
        "ceph-admin-key=" + (ceph.get("admin_key", "")),
        "ceph-fsid=" + (ceph.get("fsid", "")),
        "ceph-fsname=" + (ceph.get("fsname", "")),
        "ceph-kubernetes-key=" + (ceph.get("admin_key", "")),
        'ceph-mon-hosts="' + (ceph.get("mon_hosts", "")) + '"',
        "ceph-user=" + hookenv.application_name(),
        "default-storage=" + default_storage,
        "enable-keystone=" + keystoneEnabled,
        "keystone-cert-file=" + keystone.get("cert", ""),
        "keystone-key-file=" + keystone.get("key", ""),
        "keystone-server-url=" + keystone.get("url", ""),
        "keystone-server-ca=" + keystone.get("keystone-ca", ""),
        "dashboard-auth=token",
        "enable-aws=" + enable_aws,
        "enable-azure=" + enable_azure,
        "enable-gcp=" + enable_gcp,
        "enable-openstack=" + enable_openstack,
        "cluster-tag=" + cluster_tag,
    ]
    if openstack:
        args.extend(
            [
                "openstack-cloud-conf="
                + base64.b64encode(
                    generate_openstack_cloud_config().encode("utf-8")
                ).decode("utf-8"),
                "openstack-endpoint-ca=" + (openstack.endpoint_tls_ca or ""),
            ]
        )
    if get_version("kube-apiserver") >= (1, 14):
        args.append("dns-provider=" + dnsProvider)
    else:
        # Pre-1.14 cdk-addons only understands the enable-kube-dns toggle.
        enableKubeDNS = dnsProvider == "kube-dns"
        args.append("enable-kube-dns=" + str(enableKubeDNS).lower())
    check_call(["snap", "set", "cdk-addons"] + args)
    if not addons_ready():
        remove_state("cdk-addons.configured")
        return

    set_state("cdk-addons.configured")
    leader_set({"kubernetes-master-addons-ca-in-use": True})
    if ks:
        leader_set({"keystone-cdk-addons-configured": True})
    else:
        leader_set({"keystone-cdk-addons-configured": None})


@retry(times=3, delay_secs=20)
def addons_ready():
    """
    Test if the add ons got installed

    Returns: True is the addons got applied
    """
    try:
        check_call(["cdk-addons.apply"])
        return True
    except CalledProcessError:
        hookenv.log("Addons are not ready yet.")
        return False


@when("ceph-client.connected")
@when_not("kubernetes-control-plane.ceph.pool.created")
def ceph_storage_pool():
    """Once Ceph relation is ready, we need to add storage pools.

    :return: None
    """
    hookenv.log("Creating Ceph pools.")
    ceph_client = endpoint_from_flag("ceph-client.connected")
    pools = ["xfs-pool", "ext4-pool"]

    for pool in pools:
        hookenv.status_set("maintenance", "Creating {} pool.".format(pool))
        try:
            ceph_client.create_pool(name=pool, replicas=3)
        except Exception as e:
            # NOTE(review): a failed pool creation sets blocked status, but
            # the created flag below is still set, so this is not retried.
            hookenv.status_set("blocked", "Error creating {} pool: {}.".format(pool, e))

    set_state("kubernetes-control-plane.ceph.pool.created")


@when("nrpe-external-master.available")
@when_not("nrpe-external-master.initial-config")
def initial_nrpe_config():
    # First contact with the NRPE master: record it and push full config.
    set_state("nrpe-external-master.initial-config")
    update_nrpe_config()


@when("config.changed.authorization-mode")
def switch_auth_mode(forced=False):
    """Toggle RBAC-related flags when the authorization-mode config changes.

    :param forced: re-apply flag handling even if the mode value is unchanged.
    """
    config = hookenv.config()
    mode = config.get("authorization-mode")

    if data_changed("auth-mode", mode) or forced:
        # manage flags to handle rbac related resources
        if mode and "rbac" in mode.lower():
            remove_state("kubernetes-control-plane.remove.rbac")
            set_state("kubernetes-control-plane.create.rbac")
        else:
            remove_state("kubernetes-control-plane.create.rbac")
            set_state("kubernetes-control-plane.remove.rbac")

        # set ourselves up to restart since auth mode has changed
        remove_state("kubernetes-control-plane.components.started")


@when("leadership.is_leader", "kubernetes-control-plane.components.started")
@when_not("kubernetes-control-plane.pod-security-policy.applied")
def create_pod_security_policy_resources():
    """Apply the configured (or default) pod security policy manifest."""
    pod_security_policy_path = "/root/cdk/pod-security-policy.yaml"
    pod_security_policy = hookenv.config("pod-security-policy")
    if pod_security_policy:
        hookenv.log("Using configuration defined on pod-security-policy option")
        write_file_with_autogenerated_header(
            pod_security_policy_path, pod_security_policy
        )
    else:
        hookenv.log("Using the default rbac-pod-security-policy template")
        render("rbac-pod-security-policy.yaml", pod_security_policy_path, {})

    hookenv.log("Creating pod security policy resources.")
    if kubectl_manifest("apply", pod_security_policy_path):
        set_state("kubernetes-control-plane.pod-security-policy.applied")
    else:
        msg = "Failed to apply {}, will retry.".format(pod_security_policy_path)
        hookenv.log(msg)


@when(
    "leadership.is_leader",
    "kubernetes-control-plane.components.started",
    "kubernetes-control-plane.create.rbac",
)
def create_rbac_resources():
    """Apply the proxy-related RBAC manifest when RBAC mode is enabled."""
    rbac_proxy_path = "/root/cdk/rbac-proxy.yaml"

    # NB: when metrics and logs are retrieved by proxy, the 'user' is the
    # common name of the cert used to authenticate the proxied request.
    # The CN for /root/cdk/client.crt is 'system:kube-apiserver'
    # (see the send_data handler, above).
    proxy_users = ["client", "system:kube-apiserver"]

    context = {"juju_application": hookenv.service_name(), "proxy_users": proxy_users}
    render("rbac-proxy.yaml", rbac_proxy_path, context)

    hookenv.log("Creating proxy-related RBAC resources.")
    if kubectl_manifest("apply", rbac_proxy_path):
        remove_state("kubernetes-control-plane.create.rbac")
    else:
        msg = "Failed to apply {}, will retry.".format(rbac_proxy_path)
        hookenv.log(msg)


@when("leadership.is_leader", "kubernetes-control-plane.components.started")
@when_not("kubernetes-control-plane.system-monitoring-rbac-role.applied")
def apply_system_monitoring_rbac_role():
    """Render and apply the system:monitoring RBAC role; retried on failure."""
    try:
        hookenv.status_set("maintenance", "Applying system:monitoring RBAC role")
        path = "/root/cdk/system-monitoring-rbac-role.yaml"
        render("system-monitoring-rbac-role.yaml", path, {})
        kubectl("apply", "-f", path)
        set_state("kubernetes-control-plane.system-monitoring-rbac-role.applied")
    except Exception:
        # Flag stays unset, so the reactive framework re-invokes this handler.
        hookenv.log(traceback.format_exc())
        hookenv.log("Waiting to retry applying system:monitoring RBAC role")
        return


@when(
    "leadership.is_leader",
    "kubernetes-control-plane.components.started",
    "kubernetes-control-plane.remove.rbac",
)
def remove_rbac_resources():
    """Delete the proxy-related RBAC resources when RBAC mode is disabled."""
    rbac_proxy_path = "/root/cdk/rbac-proxy.yaml"
    if os.path.isfile(rbac_proxy_path):
        hookenv.log("Removing proxy-related RBAC resources.")
        if kubectl_manifest("delete", rbac_proxy_path):
os.remove(rbac_proxy_path) remove_state("kubernetes-control-plane.remove.rbac") else: msg = "Failed to delete {}, will retry.".format(rbac_proxy_path) hookenv.log(msg) else: # if we dont have the yaml, there's nothing for us to do remove_state("kubernetes-control-plane.remove.rbac") @when("kubernetes-control-plane.components.started") @when("nrpe-external-master.available") @when_any("config.changed.nagios_context", "config.changed.nagios_servicegroups") def update_nrpe_config(): services = ["snap.{}.daemon".format(s) for s in control_plane_services] services += [auth_webhook_svc_name] plugin = install_nagios_plugin_from_file( "templates/nagios_plugin.py", "check_k8s_master.py" ) hostname = nrpe.get_nagios_hostname() current_unit = nrpe.get_nagios_unit_name() nrpe_setup = nrpe.NRPE(hostname=hostname) nrpe.add_init_service_checks(nrpe_setup, services, current_unit) nrpe_setup.add_check( "k8s-api-server", "Verify that the Kubernetes API server is accessible", str(plugin), ) nrpe_setup.write() @when_not("nrpe-external-master.available") @when("nrpe-external-master.initial-config") def remove_nrpe_config(): # List of systemd services for which the checks will be removed services = ["snap.{}.daemon".format(s) for s in control_plane_services] remove_nagios_plugin("check_k8s_master.py") # The current nrpe-external interface doesn't handle a lot of logic, # use the charm-helpers code for now. 
def is_privileged():
    """Return boolean indicating whether or not to set allow-privileged=true."""
    privileged = hookenv.config("allow-privileged").lower()
    if privileged == "auto":
        # 'auto' enables privileged mode only when a feature that requires it
        # (GPU, ceph, or openstack integration) is active.
        return (
            is_state("kubernetes-control-plane.gpu.enabled")
            or is_state("ceph-client.available")
            or is_state("endpoint.openstack.joined")
        )
    else:
        return privileged == "true"


@when("config.changed.allow-privileged")
@when("kubernetes-control-plane.components.started")
def on_config_allow_privileged_change():
    """React to changed 'allow-privileged' config value."""
    remove_state("kubernetes-control-plane.components.started")
    remove_state("config.changed.allow-privileged")


@when_any(
    "config.changed.api-extra-args",
    "config.changed.audit-policy",
    "config.changed.audit-webhook-config",
    "config.changed.enable-keystone-authorization",
    "config.changed.service-cidr",
)
@when("kubernetes-control-plane.components.started")
@when("leadership.set.auto_storage_backend")
@when("etcd.available")
def reconfigure_apiserver():
    """Trigger kube-apiserver reconfiguration on relevant config changes."""
    clear_flag("kubernetes-control-plane.apiserver.configured")


@when("config.changed.controller-manager-extra-args")
@when("kubernetes-control-plane.components.started")
def on_config_controller_manager_extra_args_change():
    """Re-render controller-manager config when its extra args change."""
    configure_controller_manager()


@when("config.changed.scheduler-extra-args")
@when("kubernetes-control-plane.components.started")
def on_config_scheduler_extra_args_change():
    """Re-render scheduler config when its extra args change."""
    configure_scheduler()


@when("kube-control.gpu.available")
@when("kubernetes-control-plane.components.started")
@when_not("kubernetes-control-plane.gpu.enabled")
def on_gpu_available(kube_control):
    """The remote side (kubernetes-worker) is gpu-enabled.

    We need to run in privileged mode.

    """
    kube_version = get_version("kube-apiserver")
    config = hookenv.config()
    # Pre-1.9 apiservers cannot do GPU without privileged mode; honor an
    # explicit operator opt-out.
    if config["allow-privileged"].lower() == "false" and kube_version < (1, 9):
        return

    remove_state("kubernetes-control-plane.components.started")
    set_state("kubernetes-control-plane.gpu.enabled")
""" kube_version = get_version("kube-apiserver") config = hookenv.config() if config["allow-privileged"].lower() == "false" and kube_version < (1, 9): return remove_state("kubernetes-control-plane.components.started") set_state("kubernetes-control-plane.gpu.enabled") @when("kubernetes-control-plane.gpu.enabled") @when("kubernetes-control-plane.components.started") @when_not("kubernetes-control-plane.privileged") def gpu_with_no_privileged(): """We were in gpu mode, but the operator has set allow-privileged="false", so we can't run in gpu mode anymore. """ if get_version("kube-apiserver") < (1, 9): remove_state("kubernetes-control-plane.gpu.enabled") @when("kube-control.connected") @when_not("kube-control.gpu.available") @when("kubernetes-control-plane.gpu.enabled") @when("kubernetes-control-plane.components.started") def gpu_departed(kube_control): """We were in gpu mode, but the workers informed us there is no gpu support anymore. """ remove_state("kubernetes-control-plane.gpu.enabled") @hook("stop") def shutdown(): """Stop the kubernetes control-plane services""" for service in control_plane_services: service_stop("snap.%s.daemon" % service) @when( "certificates.ca.available", "certificates.client.cert.available", "authentication.setup", ) def build_kubeconfig(): """Gather the relevant data for Kubernetes configuration objects and create a config object with that information.""" local_endpoint = kubernetes_control_plane.get_local_api_endpoint() internal_endpoints = kubernetes_control_plane.get_internal_api_endpoints() external_endpoints = kubernetes_control_plane.get_external_api_endpoints() # Do we have everything we need? 
if ca_crt_path.exists() and internal_endpoints and external_endpoints: local_url = kubernetes_control_plane.get_api_url(local_endpoint) internal_url = kubernetes_control_plane.get_api_url(internal_endpoints) external_url = kubernetes_control_plane.get_api_url(external_endpoints) client_pass = get_token("admin") if not client_pass: # If we made it this far without a password, we're bootstrapping a new # cluster. Create a new token so we can build an admin kubeconfig. The # auth-webhook service will ack this value from the kubeconfig file, # allowing us to continue until the control-plane is started and a proper # secret can be created. client_pass = ( hookenv.config("client_password") or kubernetes_control_plane.token_generator() ) client_pass = "admin::{}".format(client_pass) # drop keystone helper script? ks = endpoint_from_flag("keystone-credentials.available") if ks: script_filename = "kube-keystone.sh" keystone_path = os.path.join(os.sep, "home", "ubuntu", script_filename) context = { "protocol": ks.credentials_protocol(), "address": ks.credentials_host(), "port": ks.credentials_port(), "version": ks.api_version(), } render(script_filename, keystone_path, context) elif is_state("leadership.set.keystone-cdk-addons-configured"): # if addons are configured, we're going to do keystone # just not yet because we don't have creds hookenv.log("Keystone endpoint not found, will retry.") cluster_id = None aws_iam = endpoint_from_flag("endpoint.aws-iam.available") if aws_iam: cluster_id = aws_iam.get_cluster_id() # Create an absolute path for the kubeconfig file. kubeconfig_path = os.path.join(os.sep, "home", "ubuntu", "config") # Create the kubeconfig on this system so users can access the cluster. 
hookenv.log("Writing kubeconfig file.") if ks: create_kubeconfig( kubeconfig_path, external_url, ca_crt_path, user="admin", token=client_pass, keystone=True, aws_iam_cluster_id=cluster_id, ) else: create_kubeconfig( kubeconfig_path, external_url, ca_crt_path, user="admin", token=client_pass, aws_iam_cluster_id=cluster_id, ) # Make the config file readable by the ubuntu users so juju scp works. cmd = ["chown", "ubuntu:ubuntu", kubeconfig_path] check_call(cmd) # make a kubeconfig for root / the charm create_kubeconfig( kubeclientconfig_path, local_url, ca_crt_path, user="admin", token=client_pass, ) # Create kubernetes configuration in the default location for ubuntu. create_kubeconfig( "/home/ubuntu/.kube/config", internal_url, ca_crt_path, user="admin", token=client_pass, ) # Make the config dir readable by the ubuntu user check_call(["chown", "-R", "ubuntu:ubuntu", "/home/ubuntu/.kube"]) # make a kubeconfig for cdk-addons create_kubeconfig( cdk_addons_kubectl_config_path, local_url, ca_crt_path, user="admin", token=client_pass, ) # make a kubeconfig for our services proxy_token = get_token("system:kube-proxy") if proxy_token: create_kubeconfig( kubeproxyconfig_path, local_url, ca_crt_path, token=proxy_token, user="kube-proxy", ) controller_manager_token = get_token("system:kube-controller-manager") if controller_manager_token: create_kubeconfig( kubecontrollermanagerconfig_path, local_url, ca_crt_path, token=controller_manager_token, user="kube-controller-manager", ) scheduler_token = get_token("system:kube-scheduler") if scheduler_token: create_kubeconfig( kubeschedulerconfig_path, local_url, ca_crt_path, token=scheduler_token, user="kube-scheduler", ) cni = endpoint_from_name("cni") if cni: cni.notify_kubeconfig_changed() def handle_etcd_relation(reldata): """Save the client credentials and set appropriate daemon flags when etcd declares itself as available""" # Define where the etcd tls files will be kept. 
etcd_dir = "/root/cdk/etcd" # Create paths to the etcd client ca, key, and cert file locations. ca = os.path.join(etcd_dir, "client-ca.pem") key = os.path.join(etcd_dir, "client-key.pem") cert = os.path.join(etcd_dir, "client-cert.pem") # Save the client credentials (in relation data) to the paths provided. reldata.save_client_credentials(key, cert, ca) def remove_if_exists(path): try: os.remove(path) except FileNotFoundError: pass def write_file_with_autogenerated_header(path, contents): with open(path, "w") as f: header = "# Autogenerated by kubernetes-control-plane charm" f.write(header + "\n" + contents) @when( "etcd.available", "cni.available", "kubernetes-control-plane.auth-webhook-service.started", ) @when_not("kubernetes-control-plane.apiserver.configured") def configure_apiserver(): etcd_connection_string = endpoint_from_flag( "etcd.available" ).get_connection_string() if not etcd_connection_string: # etcd is not returning a connection string. This happens when # the control-plane unit disconnects from etcd and is ready to terminate. # No point in trying to start control-plane services and fail. Just return. 
return # Update unit db service-cidr was_service_cidr_expanded = kubernetes_control_plane.is_service_cidr_expansion() kubernetes_control_plane.freeze_service_cidr() cluster_cidr = kubernetes_common.cluster_cidr() service_cidr = kubernetes_control_plane.service_cidr() api_opts = {} if is_privileged(): api_opts["allow-privileged"] = "true" set_state("kubernetes-control-plane.privileged") else: api_opts["allow-privileged"] = "false" remove_state("kubernetes-control-plane.privileged") # Handle static options for now api_opts["service-cluster-ip-range"] = service_cidr feature_gates = [] if kubernetes_common.is_dual_stack(cluster_cidr): feature_gates.append("IPv6DualStack=true") api_opts["min-request-timeout"] = "300" api_opts["v"] = "4" api_opts["tls-cert-file"] = str(server_crt_path) api_opts["tls-private-key-file"] = str(server_key_path) api_opts["tls-cipher-suites"] = ",".join(tls_ciphers_intermediate) api_opts["kubelet-certificate-authority"] = str(ca_crt_path) api_opts["kubelet-client-certificate"] = str(client_crt_path) api_opts["kubelet-client-key"] = str(client_key_path) api_opts["logtostderr"] = "true" api_opts["storage-backend"] = getStorageBackend() api_opts["profiling"] = "false" api_opts["anonymous-auth"] = "false" api_opts["authentication-token-webhook-cache-ttl"] = "1m0s" api_opts["authentication-token-webhook-config-file"] = auth_webhook_conf api_opts["service-account-issuer"] = "https://kubernetes.default.svc" api_opts["service-account-signing-key-file"] = "/root/cdk/serviceaccount.key" api_opts["service-account-key-file"] = "/root/cdk/serviceaccount.key" api_opts[ "kubelet-preferred-address-types" ] = "InternalIP,Hostname,InternalDNS,ExternalDNS,ExternalIP" api_opts["encryption-provider-config"] = str(encryption_config_path()) if kubernetes_common.is_ipv6(cluster_cidr): api_opts["bind-address"] = "::" if kubernetes_common.is_ipv6_preferred(cluster_cidr): api_opts["advertise-address"] = get_ingress_address6("kube-control") else: 
api_opts["advertise-address"] = get_ingress_address("kube-control") etcd_dir = "/root/cdk/etcd" etcd_ca = os.path.join(etcd_dir, "client-ca.pem") etcd_key = os.path.join(etcd_dir, "client-key.pem") etcd_cert = os.path.join(etcd_dir, "client-cert.pem") api_opts["etcd-cafile"] = etcd_ca api_opts["etcd-keyfile"] = etcd_key api_opts["etcd-certfile"] = etcd_cert api_opts["etcd-servers"] = etcd_connection_string # In Kubernetes 1.10 and later, some admission plugins are enabled by # default. The current list of default plugins can be found at # https://bit.ly/2meP9XT, listed under the '--enable-admission-plugins' # option. # # The list below need only include the plugins we want to enable # in addition to the defaults. admission_plugins = [ "PersistentVolumeLabel", "PodSecurityPolicy", "NodeRestriction", ] auth_mode = hookenv.config("authorization-mode") ks = endpoint_from_flag("keystone-credentials.available") if ks: ks_ip = get_service_ip("k8s-keystone-auth-service", errors_fatal=False) if ks_ip: os.makedirs(keystone_root, exist_ok=True) keystone_webhook = keystone_root + "/webhook.yaml" context = {} context["keystone_service_cluster_ip"] = ks_ip render("keystone-api-server-webhook.yaml", keystone_webhook, context) if hookenv.config("enable-keystone-authorization"): # if user wants authorization, enable it if "Webhook" not in auth_mode: auth_mode += ",Webhook" api_opts["authorization-webhook-config-file"] = keystone_webhook # noqa set_state("keystone.apiserver.configured") else: hookenv.log("Unable to find k8s-keystone-auth-service. Will retry") # Note that we can get into a nasty state here # if the user has specified webhook and they're relying on # keystone auth to handle that, the api server will fail to # start because we push it Webhook and no webhook config. # We can't generate the config because we can't talk to the # apiserver to get the ip of the service to put into the # webhook template. A chicken and egg problem. 
To fix this, # remove Webhook if keystone is related and trying to come # up until we can find the service IP. if "Webhook" in auth_mode: auth_mode = ",".join( [i for i in auth_mode.split(",") if i != "Webhook"] ) remove_state("keystone.apiserver.configured") elif is_state("leadership.set.keystone-cdk-addons-configured"): hookenv.log("Keystone endpoint not found, will retry.") api_opts["authorization-mode"] = auth_mode api_opts["enable-admission-plugins"] = ",".join(admission_plugins) kube_version = get_version("kube-apiserver") if kube_version > (1, 6) and hookenv.config("api-aggregation-extension"): api_opts["requestheader-client-ca-file"] = str(ca_crt_path) api_opts["requestheader-allowed-names"] = "system:kube-apiserver,client" api_opts["requestheader-extra-headers-prefix"] = "X-Remote-Extra-" api_opts["requestheader-group-headers"] = "X-Remote-Group" api_opts["requestheader-username-headers"] = "X-Remote-User" api_opts["proxy-client-cert-file"] = str(client_crt_path) api_opts["proxy-client-key-file"] = str(client_key_path) api_opts["enable-aggregator-routing"] = "true" api_opts["client-ca-file"] = str(ca_crt_path) api_cloud_config_path = cloud_config_path("kube-apiserver") if has_external_cloud_provider(): api_opts["cloud-provider"] = "external" elif is_state("endpoint.aws.ready"): api_opts["cloud-provider"] = "aws" feature_gates.append("CSIMigrationAWS=false") elif is_state("endpoint.gcp.ready"): api_opts["cloud-provider"] = "gce" api_opts["cloud-config"] = str(api_cloud_config_path) feature_gates.append("CSIMigrationGCE=false") elif is_state("endpoint.vsphere.ready") and get_version("kube-apiserver") >= ( 1, 12, ): api_opts["cloud-provider"] = "vsphere" api_opts["cloud-config"] = str(api_cloud_config_path) elif is_state("endpoint.azure.ready"): api_opts["cloud-provider"] = "azure" api_opts["cloud-config"] = str(api_cloud_config_path) feature_gates.append("CSIMigrationAzureDisk=false") api_opts["feature-gates"] = ",".join(feature_gates) audit_root = 
"/root/cdk/audit" os.makedirs(audit_root, exist_ok=True) audit_log_path = audit_root + "/audit.log" api_opts["audit-log-path"] = audit_log_path api_opts["audit-log-maxage"] = "30" api_opts["audit-log-maxsize"] = "100" api_opts["audit-log-maxbackup"] = "10" audit_policy_path = audit_root + "/audit-policy.yaml" audit_policy = hookenv.config("audit-policy") if audit_policy: write_file_with_autogenerated_header(audit_policy_path, audit_policy) api_opts["audit-policy-file"] = audit_policy_path else: remove_if_exists(audit_policy_path) audit_webhook_config_path = audit_root + "/audit-webhook-config.yaml" audit_webhook_config = hookenv.config("audit-webhook-config") if audit_webhook_config: write_file_with_autogenerated_header( audit_webhook_config_path, audit_webhook_config ) api_opts["audit-webhook-config-file"] = audit_webhook_config_path else: remove_if_exists(audit_webhook_config_path) configure_kubernetes_service( configure_prefix, "kube-apiserver", api_opts, "api-extra-args" ) service_restart("snap.kube-apiserver.daemon") if was_service_cidr_expanded and is_state("leadership.is_leader"): set_flag("kubernetes-control-plane.had-service-cidr-expanded") set_flag("kubernetes-control-plane.apiserver.configured") if kubernetes_control_plane.check_service("kube-apiserver"): set_flag("kubernetes-control-plane.apiserver.running") @when("kubernetes-control-plane.apiserver.configured") @when_not("kubernetes-control-plane.apiserver.running") def check_apiserver(): if kubernetes_control_plane.check_service("kube-apiserver"): set_flag("kubernetes-control-plane.apiserver.running") @when( "kubernetes-control-plane.had-service-cidr-expanded", "kubernetes-control-plane.apiserver.configured", "leadership.is_leader", ) def update_for_service_cidr_expansion(): # We just restarted the API server, so there's a decent chance it's # not up yet. Keep trying to get the svcs list until we can; get_svcs # has a built-in retry and delay, so this should try for around 30s. 
def _wait_for_svc_ip(): for attempt in range(10): svcs = get_svcs() if svcs: svc_ip = { svc["metadata"]["name"]: svc["spec"]["clusterIP"] for svc in svcs["items"] }.get("kubernetes") if svc_ip: return svc_ip else: return None hookenv.log("service-cidr expansion: Waiting for API service") # First network is the default, which is used for the API service's address. # This logic will likely need to change once dual-stack services are # supported: https://bit.ly/2YlbxOx expected_service_ip = kubernetes_control_plane.get_kubernetes_service_ips()[0] actual_service_ip = _wait_for_svc_ip() if not actual_service_ip: hookenv.log("service-cidr expansion: Timed out waiting for API service") return try: if actual_service_ip != expected_service_ip: hookenv.log("service-cidr expansion: Deleting service kubernetes") kubectl("delete", "service", "kubernetes") actual_service_ip = _wait_for_svc_ip() if not actual_service_ip: # we might need another restart to get the service recreated hookenv.log( "service-cidr expansion: Timed out waiting for " "the service to return; restarting API server" ) clear_flag("kubernetes-control-plane.apiserver.configured") return if actual_service_ip != expected_service_ip: raise ValueError( "Unexpected service IP: {} != {}".format( actual_service_ip, expected_service_ip ) ) # Restart the cdk-addons # Get deployments/daemonsets/statefulsets hookenv.log("service-cidr expansion: Restart the cdk-addons") output = kubectl( "get", "daemonset,deployment,statefulset", "-o", "json", "--all-namespaces", "-l", "cdk-restart-on-ca-change=true", ).decode("UTF-8") deployments = json.loads(output)["items"] # Now restart the addons for deployment in deployments: kind = deployment["kind"] namespace = deployment["metadata"]["namespace"] name = deployment["metadata"]["name"] hookenv.log("Restarting addon: {0} {1} {2}".format(kind, namespace, name)) kubectl("rollout", "restart", kind + "/" + name, "-n", namespace) except CalledProcessError: # the kubectl calls already log 
def configure_controller_manager():
    """Render kube-controller-manager options and restart its snap service."""
    controller_opts = {}
    cluster_cidr = kubernetes_common.cluster_cidr()
    service_cidr = kubernetes_control_plane.service_cidr()

    # Default to 3 minute resync. TODO: Make this configurable?
    controller_opts["min-resync-period"] = "3m"
    controller_opts["v"] = "2"
    controller_opts["root-ca-file"] = str(ca_crt_path)
    controller_opts["logtostderr"] = "true"
    controller_opts["kubeconfig"] = kubecontrollermanagerconfig_path
    controller_opts["authorization-kubeconfig"] = kubecontrollermanagerconfig_path
    controller_opts["authentication-kubeconfig"] = kubecontrollermanagerconfig_path
    controller_opts["use-service-account-credentials"] = "true"
    controller_opts["service-account-private-key-file"] = "/root/cdk/serviceaccount.key"
    controller_opts["tls-cert-file"] = str(server_crt_path)
    controller_opts["tls-private-key-file"] = str(server_key_path)
    controller_opts["cluster-name"] = leader_get("cluster_tag")
    controller_opts["terminated-pod-gc-threshold"] = "12500"
    controller_opts["profiling"] = "false"
    controller_opts["service-cluster-ip-range"] = service_cidr
    controller_opts["cluster-cidr"] = cluster_cidr
    feature_gates = ["RotateKubeletServerCertificate=true"]
    if kubernetes_common.is_dual_stack(cluster_cidr):
        feature_gates.append("IPv6DualStack=true")
        net_ipv6 = kubernetes_common.get_ipv6_network(cluster_cidr)
        if net_ipv6:
            controller_opts["node-cidr-mask-size-ipv6"] = net_ipv6.prefixlen

    cm_cloud_config_path = cloud_config_path("kube-controller-manager")
    if has_external_cloud_provider():
        controller_opts["cloud-provider"] = "external"
    elif is_state("endpoint.aws.ready"):
        controller_opts["cloud-provider"] = "aws"
        feature_gates.append("CSIMigrationAWS=false")
    elif is_state("endpoint.gcp.ready"):
        controller_opts["cloud-provider"] = "gce"
        controller_opts["cloud-config"] = str(cm_cloud_config_path)
        feature_gates.append("CSIMigrationGCE=false")
    elif is_state("endpoint.vsphere.ready") and get_version("kube-apiserver") >= (
        1,
        12,
    ):
        controller_opts["cloud-provider"] = "vsphere"
        controller_opts["cloud-config"] = str(cm_cloud_config_path)
    elif is_state("endpoint.azure.ready"):
        controller_opts["cloud-provider"] = "azure"
        controller_opts["cloud-config"] = str(cm_cloud_config_path)
        feature_gates.append("CSIMigrationAzureDisk=false")

    controller_opts["feature-gates"] = ",".join(feature_gates)

    configure_kubernetes_service(
        configure_prefix,
        "kube-controller-manager",
        controller_opts,
        "controller-manager-extra-args",
    )
    service_restart("snap.kube-controller-manager.daemon")
controller_opts["cloud-config"] = str(cm_cloud_config_path) feature_gates.append("CSIMigrationGCE=false") elif is_state("endpoint.vsphere.ready") and get_version("kube-apiserver") >= ( 1, 12, ): controller_opts["cloud-provider"] = "vsphere" controller_opts["cloud-config"] = str(cm_cloud_config_path) elif is_state("endpoint.azure.ready"): controller_opts["cloud-provider"] = "azure" controller_opts["cloud-config"] = str(cm_cloud_config_path) feature_gates.append("CSIMigrationAzureDisk=false") controller_opts["feature-gates"] = ",".join(feature_gates) configure_kubernetes_service( configure_prefix, "kube-controller-manager", controller_opts, "controller-manager-extra-args", ) service_restart("snap.kube-controller-manager.daemon") def configure_scheduler(): kube_scheduler_config_path = "/root/cdk/kube-scheduler-config.yaml" scheduler_opts = {} scheduler_opts["v"] = "2" scheduler_opts["logtostderr"] = "true" scheduler_opts["profiling"] = "false" scheduler_opts["config"] = kube_scheduler_config_path feature_gates = [] if is_state("endpoint.aws.ready"): feature_gates.append("CSIMigrationAWS=false") elif is_state("endpoint.gcp.ready"): feature_gates.append("CSIMigrationGCE=false") elif is_state("endpoint.azure.ready"): feature_gates.append("CSIMigrationAzureDisk=false") scheduler_opts["feature-gates"] = ",".join(feature_gates) scheduler_ver = get_version("kube-scheduler") if scheduler_ver >= (1, 23): api_ver = "v1beta2" elif scheduler_ver >= (1, 19): api_ver = "v1beta1" elif scheduler_ver >= (1, 18): api_ver = "v1alpha2" else: api_ver = "v1alpha1" host.write_file( path=kube_scheduler_config_path, perms=0o600, content=yaml.safe_dump( { "apiVersion": "kubescheduler.config.k8s.io/{}".format(api_ver), "kind": "KubeSchedulerConfiguration", "clientConnection": {"kubeconfig": kubeschedulerconfig_path}, } ), ) configure_kubernetes_service( configure_prefix, "kube-scheduler", scheduler_opts, "scheduler-extra-args" ) service_restart("snap.kube-scheduler.daemon") def 
def setup_tokens(token, username, user, groups=None):
    """Create a token for kubernetes authentication.

    Create a new secret if known_tokens have been migrated. Otherwise,
    add an entry to the 'known_tokens.csv' file.
    """
    if not token:
        token = kubernetes_control_plane.token_generator()
    if is_flag_set("kubernetes-control-plane.token-auth.migrated"):
        # We need the apiserver before we can create secrets.
        if is_flag_set("kubernetes-control-plane.apiserver.configured"):
            kubernetes_control_plane.create_secret(token, username, user, groups)
        else:
            hookenv.log("Delaying secret creation until the apiserver is configured.")
    else:
        kubernetes_control_plane.create_known_token(token, username, user, groups)


def get_token(username):
    """Fetch a token for the given username.

    Grab a token from the given user's secret if known_tokens have been
    migrated. Otherwise, fetch it from the 'known_tokens.csv' file.
    """
    if is_flag_set("kubernetes-control-plane.token-auth.migrated"):
        return kubernetes_common.get_secret_password(username)
    else:
        return kubernetes_control_plane.get_csv_password("known_tokens.csv", username)


def set_token(password, save_salt):
    """Store a token so it can be recalled later by token_generator.

    param: password - the password to be stored
    param: save_salt - the key to store the value of the token."""
    db.set(save_salt, password)
    return db.get(save_salt)
@retry(times=3, delay_secs=1)
def get_pods(namespace="default"):
    """Return the parsed `kubectl get po` JSON for *namespace*, or None
    if the kubectl call fails."""
    try:
        output = kubectl(
            "get", "po", "-n", namespace, "-o", "json", "--request-timeout", "10s"
        ).decode("UTF-8")
        result = json.loads(output)
    except CalledProcessError:
        hookenv.log("failed to get {} pod status".format(namespace))
        return None
    return result


@retry(times=3, delay_secs=1)
def get_svcs(namespace="default"):
    """Return the parsed `kubectl get svc` JSON for *namespace*, or None
    if the kubectl call fails."""
    try:
        output = kubectl(
            "get", "svc", "-n", namespace, "-o", "json", "--request-timeout", "10s"
        ).decode("UTF-8")
        result = json.loads(output)
    except CalledProcessError:
        hookenv.log("failed to get {} service status".format(namespace))
        return None
    return result


class FailedToGetPodStatus(Exception):
    # Raised when pod status cannot be determined, e.g. while the
    # apiserver is down.
    pass


def get_kube_system_pods_not_running():
    """Check pod status in the kube-system namespace.

    Throws FailedToGetPodStatus if unable to determine pod status. This can
    occur when the api server is not currently running.

    On success, returns a list of pods that are not currently running
    or an empty list if all are running."""
    result = get_pods("kube-system")
    if result is None:
        raise FailedToGetPodStatus

    hookenv.log(
        "Checking system pods status: {}".format(
            ", ".join(
                "=".join([pod["metadata"]["name"], pod["status"]["phase"]])
                for pod in result["items"]
            )
        )
    )

    # Pods that are Running or Evicted (which should re-spawn) are
    # considered running
    not_running = [
        pod
        for pod in result["items"]
        if pod["status"]["phase"] != "Running"
        and pod["status"].get("reason", "") != "Evicted"
    ]

    pending = [pod for pod in result["items"] if pod["status"]["phase"] == "Pending"]
    any_pending = len(pending) > 0
    if is_state("endpoint.gcp.ready") and any_pending:
        poke_network_unavailable()

    # Fix: the original had a redundant early `return not_running` inside the
    # branch above followed by an identical return here; a single return path
    # is equivalent and removes the dead duplicate.
    return not_running
def poke_network_unavailable():
    """
    Work around https://github.com/kubernetes/kubernetes/issues/44254 by
    manually poking the status into the API server to tell the nodes they have
    a network route.

    This is needed because kubelet sets the NetworkUnavailable flag and expects
    the network plugin to clear it, which only kubenet does. There is some
    discussion about refactoring the affected code but nothing has happened
    in a while.

    """
    internal_endpoints = kubernetes_control_plane.get_internal_api_endpoints()
    internal_url = kubernetes_control_plane.get_api_url(internal_endpoints)
    client_token = get_token("admin")
    http_header = ("Authorization", "Bearer {}".format(client_token))

    try:
        output = kubectl("get", "nodes", "-o", "json").decode("utf-8")
        nodes = json.loads(output)["items"]
    except CalledProcessError:
        hookenv.log("failed to get kube-system nodes")
        return
    except (KeyError, json.JSONDecodeError) as e:
        hookenv.log(
            "failed to parse kube-system node status " "({}): {}".format(e, output),
            hookenv.ERROR,
        )
        return

    for node in nodes:
        node_name = node["metadata"]["name"]
        url = "{}/api/v1/nodes/{}/status".format(internal_url, node_name)
        req = Request(url)
        req.add_header(*http_header)
        with urlopen(req) as response:
            code = response.getcode()
            body = response.read().decode("utf8")
        if code != 200:
            hookenv.log(
                "failed to get node status from {} [{}]: {}".format(url, code, body),
                hookenv.ERROR,
            )
            return
        try:
            node_info = json.loads(body)
            conditions = node_info["status"]["conditions"]
            i = [c["type"] for c in conditions].index("NetworkUnavailable")
            if conditions[i]["status"] == "True":
                hookenv.log("Clearing NetworkUnavailable from {}".format(node_name))
                # Overwrite the condition wholesale; the apiserver accepts a
                # PUT of the full node status object.
                conditions[i] = {
                    "type": "NetworkUnavailable",
                    "status": "False",
                    "reason": "RouteCreated",
                    "message": "Manually set through k8s api",
                }
                req = Request(
                    url,
                    method="PUT",
                    data=json.dumps(node_info).encode("utf8"),
                    headers={"Content-Type": "application/json"},
                )
                req.add_header(*http_header)
                with urlopen(req) as response:
                    code = response.getcode()
                    body = response.read().decode("utf8")
                if code not in (200, 201, 202):
                    hookenv.log(
                        "failed to update node status [{}]: {}".format(code, body),
                        hookenv.ERROR,
                    )
                    return
        except (json.JSONDecodeError, KeyError):
            hookenv.log("failed to parse node status: {}".format(body), hookenv.ERROR)
            return
""" internal_endpoints = kubernetes_control_plane.get_internal_api_endpoints() internal_url = kubernetes_control_plane.get_api_url(internal_endpoints) client_token = get_token("admin") http_header = ("Authorization", "Bearer {}".format(client_token)) try: output = kubectl("get", "nodes", "-o", "json").decode("utf-8") nodes = json.loads(output)["items"] except CalledProcessError: hookenv.log("failed to get kube-system nodes") return except (KeyError, json.JSONDecodeError) as e: hookenv.log( "failed to parse kube-system node status " "({}): {}".format(e, output), hookenv.ERROR, ) return for node in nodes: node_name = node["metadata"]["name"] url = "{}/api/v1/nodes/{}/status".format(internal_url, node_name) req = Request(url) req.add_header(*http_header) with urlopen(req) as response: code = response.getcode() body = response.read().decode("utf8") if code != 200: hookenv.log( "failed to get node status from {} [{}]: {}".format(url, code, body), hookenv.ERROR, ) return try: node_info = json.loads(body) conditions = node_info["status"]["conditions"] i = [c["type"] for c in conditions].index("NetworkUnavailable") if conditions[i]["status"] == "True": hookenv.log("Clearing NetworkUnavailable from {}".format(node_name)) conditions[i] = { "type": "NetworkUnavailable", "status": "False", "reason": "RouteCreated", "message": "Manually set through k8s api", } req = Request( url, method="PUT", data=json.dumps(node_info).encode("utf8"), headers={"Content-Type": "application/json"}, ) req.add_header(*http_header) with urlopen(req) as response: code = response.getcode() body = response.read().decode("utf8") if code not in (200, 201, 202): hookenv.log( "failed to update node status [{}]: {}".format(code, body), hookenv.ERROR, ) return except (json.JSONDecodeError, KeyError): hookenv.log("failed to parse node status: {}".format(body), hookenv.ERROR) return def apiserverVersion(): cmd = "kube-apiserver --version".split() version_string = check_output(cmd).decode("utf-8") return 
def touch(fname):
    """Update *fname*'s timestamps, creating the file if it does not exist."""
    try:
        os.utime(fname, None)
    except OSError:
        open(fname, "a").close()


def getStorageBackend():
    """Return the configured storage backend, resolving 'auto' from the
    leader-elected value."""
    storage_backend = hookenv.config("storage-backend")
    if storage_backend == "auto":
        storage_backend = leader_get("auto_storage_backend")
    return storage_backend


@when("leadership.is_leader")
@when_not("leadership.set.cluster_tag")
def create_cluster_tag():
    """Generate and persist a unique cluster tag (leader only)."""
    cluster_tag = "kubernetes-{}".format(
        kubernetes_control_plane.token_generator().lower()
    )
    leader_set(cluster_tag=cluster_tag)


@when("leadership.set.cluster_tag", "kube-control.connected")
def send_cluster_tag():
    """Publish the cluster tag to workers over the kube-control relation."""
    cluster_tag = leader_get("cluster_tag")
    kube_control = endpoint_from_flag("kube-control.connected")
    kube_control.set_cluster_tag(cluster_tag)


@when_not("kube-control.connected")
def clear_cluster_tag_sent():
    """Forget that the cluster tag was sent once the relation is gone."""
    remove_state("kubernetes-control-plane.cluster-tag-sent")


@when_any(
    "endpoint.aws.joined",
    "endpoint.gcp.joined",
    "endpoint.openstack.joined",
    "endpoint.vsphere.joined",
    "endpoint.azure.joined",
)
@when_not("kubernetes-control-plane.cloud.ready")
def set_cloud_pending():
    """Mark cloud integration as pending; block when the k8s version is too
    old for the joined cloud (vsphere needs 1.12+, azure needs 1.11+)."""
    k8s_version = get_version("kube-apiserver")
    k8s_1_11 = k8s_version >= (1, 11)
    k8s_1_12 = k8s_version >= (1, 12)
    vsphere_joined = is_state("endpoint.vsphere.joined")
    azure_joined = is_state("endpoint.azure.joined")
    if (vsphere_joined and not k8s_1_12) or (azure_joined and not k8s_1_11):
        set_state("kubernetes-control-plane.cloud.blocked")
    else:
        remove_state("kubernetes-control-plane.cloud.blocked")
    set_state("kubernetes-control-plane.cloud.pending")


@when_any("endpoint.aws.joined", "endpoint.gcp.joined", "endpoint.azure.joined")
@when("leadership.set.cluster_tag")
@when_not("kubernetes-control-plane.cloud.request-sent")
def request_integration():
    """Ask the joined cloud integrator charm for the tags and management
    permissions Kubernetes needs on this instance."""
    hookenv.status_set("maintenance", "requesting cloud integration")
    cluster_tag = leader_get("cluster_tag")
    if is_state("endpoint.aws.joined"):
        cloud = endpoint_from_flag("endpoint.aws.joined")
        cloud.tag_instance(
            {
                "kubernetes.io/cluster/{}".format(cluster_tag): "owned",
                "k8s.io/role/master": "true",  # wokeignore:rule=master
            }
        )
        cloud.tag_instance_security_group(
            {
                "kubernetes.io/cluster/{}".format(cluster_tag): "owned",
            }
        )
        cloud.tag_instance_subnet(
            {
                "kubernetes.io/cluster/{}".format(cluster_tag): "owned",
            }
        )
        cloud.enable_object_storage_management(["kubernetes-*"])
        cloud.enable_load_balancer_management()
    elif is_state("endpoint.gcp.joined"):
        cloud = endpoint_from_flag("endpoint.gcp.joined")
        cloud.label_instance(
            {
                "k8s-io-cluster-name": cluster_tag,
                "k8s-io-role-master": "master",  # wokeignore:rule=master
            }
        )
        cloud.enable_object_storage_management()
        cloud.enable_security_management()
    elif is_state("endpoint.azure.joined"):
        cloud = endpoint_from_flag("endpoint.azure.joined")
        cloud.tag_instance(
            {
                "k8s-io-cluster-name": cluster_tag,
                "k8s-io-role-master": "master",  # wokeignore:rule=master
            }
        )
        cloud.enable_object_storage_management()
        cloud.enable_security_management()
        cloud.enable_loadbalancer_management()
    # Management requests common to every supported cloud.
    cloud.enable_instance_inspection()
    cloud.enable_network_management()
    cloud.enable_dns_management()
    cloud.enable_block_storage_management()
    set_state("kubernetes-control-plane.cloud.request-sent")
"kubernetes.io/cluster/{}".format(cluster_tag): "owned", "k8s.io/role/master": "true", # wokeignore:rule=master } ) cloud.tag_instance_security_group( { "kubernetes.io/cluster/{}".format(cluster_tag): "owned", } ) cloud.tag_instance_subnet( { "kubernetes.io/cluster/{}".format(cluster_tag): "owned", } ) cloud.enable_object_storage_management(["kubernetes-*"]) cloud.enable_load_balancer_management() elif is_state("endpoint.gcp.joined"): cloud = endpoint_from_flag("endpoint.gcp.joined") cloud.label_instance( { "k8s-io-cluster-name": cluster_tag, "k8s-io-role-master": "master", # wokeignore:rule=master } ) cloud.enable_object_storage_management() cloud.enable_security_management() elif is_state("endpoint.azure.joined"): cloud = endpoint_from_flag("endpoint.azure.joined") cloud.tag_instance( { "k8s-io-cluster-name": cluster_tag, "k8s-io-role-master": "master", # wokeignore:rule=master } ) cloud.enable_object_storage_management() cloud.enable_security_management() cloud.enable_loadbalancer_management() cloud.enable_instance_inspection() cloud.enable_network_management() cloud.enable_dns_management() cloud.enable_block_storage_management() set_state("kubernetes-control-plane.cloud.request-sent") @when_none( "endpoint.aws.joined", "endpoint.gcp.joined", "endpoint.openstack.joined", "endpoint.vsphere.joined", "endpoint.azure.joined", ) @when_any( "kubernetes-control-plane.cloud.pending", "kubernetes-control-plane.cloud.request-sent", "kubernetes-control-plane.cloud.blocked", "kubernetes-control-plane.cloud.ready", ) def clear_cloud_flags(): remove_state("kubernetes-control-plane.cloud.pending") remove_state("kubernetes-control-plane.cloud.request-sent") remove_state("kubernetes-control-plane.cloud.blocked") remove_state("kubernetes-control-plane.cloud.ready") clear_flag("kubernetes-control-plane.apiserver.configured") clear_flag("kubernetes-control-plane.kubelet.configured") _kick_controller_manager() @when_any( "endpoint.aws.ready", "endpoint.gcp.ready", 
"endpoint.openstack.ready", "endpoint.vsphere.ready", "endpoint.azure.ready", ) @when_not( "kubernetes-control-plane.cloud.blocked", "kubernetes-control-plane.cloud.ready" ) def cloud_ready(): if is_state("endpoint.gcp.ready"): write_gcp_snap_config("kube-apiserver") write_gcp_snap_config("kube-controller-manager") write_gcp_snap_config("kubelet") elif is_state("endpoint.vsphere.ready"): _write_vsphere_snap_config("kube-apiserver") _write_vsphere_snap_config("kube-controller-manager") elif is_state("endpoint.azure.ready"): write_azure_snap_config("kube-apiserver") write_azure_snap_config("kube-controller-manager") write_azure_snap_config("kubelet") remove_state("kubernetes-control-plane.cloud.pending") set_state("kubernetes-control-plane.cloud.ready") remove_state("kubernetes-control-plane.components.started") # force restart @when("kubernetes-control-plane.cloud.ready") @when_any( "endpoint.openstack.ready.changed", "endpoint.vsphere.ready.changed", "endpoint.azure.ready.changed", ) def update_cloud_config(): """Signal that cloud config has changed. Some clouds (openstack, vsphere) support runtime config that needs to be reflected in the k8s cloud config files when changed. Manage flags to ensure this happens. 
""" if is_state("endpoint.openstack.ready.changed"): remove_state("endpoint.openstack.ready.changed") set_state("kubernetes-control-plane.openstack.changed") if is_state("endpoint.vsphere.ready.changed"): remove_state("kubernetes-control-plane.cloud.ready") remove_state("endpoint.vsphere.ready.changed") if is_state("endpoint.azure.ready.changed"): remove_state("kubernetes-control-plane.cloud.ready") remove_state("endpoint.azure.ready.changed") def _cdk_addons_template_path(): return Path("/snap/cdk-addons/current/templates") def _write_vsphere_snap_config(component): # vsphere requires additional cloud config vsphere = endpoint_from_flag("endpoint.vsphere.ready") # NB: vsphere provider will ask kube-apiserver and -controller-manager to # find a uuid from sysfs unless a global config value is set. Our strict # snaps cannot read sysfs, so let's do it in the charm. An invalid uuid is # not fatal for storage, but it will muddy the logs; try to get it right. uuid = _get_vmware_uuid() comp_cloud_config_path = cloud_config_path(component) comp_cloud_config_path.write_text( "\n".join( [ "[Global]", "insecure-flag = true", 'datacenters = "{}"'.format(vsphere.datacenter), 'vm-uuid = "VMware-{}"'.format(uuid), '[VirtualCenter "{}"]'.format(vsphere.vsphere_ip), 'user = "{}"'.format(vsphere.user), 'password = "{}"'.format(vsphere.password), "[Workspace]", 'server = "{}"'.format(vsphere.vsphere_ip), 'datacenter = "{}"'.format(vsphere.datacenter), 'default-datastore = "{}"'.format(vsphere.datastore), 'folder = "{}"'.format(vsphere.folder), 'resourcepool-path = "{}"'.format(vsphere.respool_path), "[Disk]", 'scsicontrollertype = "pvscsi"', ] ) ) @when("config.changed.keystone-policy") @when("kubernetes-control-plane.keystone-policy-handled") def regen_keystone_policy(): clear_flag("kubernetes-control-plane.keystone-policy-handled") @when( "keystone-credentials.available", "leadership.is_leader", "kubernetes-control-plane.apiserver.configured", ) 
@when_not("kubernetes-control-plane.keystone-policy-handled") def generate_keystone_configmap(): keystone_policy = hookenv.config("keystone-policy") if keystone_policy: os.makedirs(keystone_root, exist_ok=True) write_file_with_autogenerated_header(keystone_policy_path, keystone_policy) if kubectl_manifest("apply", keystone_policy_path): set_flag("kubernetes-control-plane.keystone-policy-handled") clear_flag("kubernetes-control-plane.keystone-policy-error") else: set_flag("kubernetes-control-plane.keystone-policy-error") else: # a missing policy configmap will crashloop the pods, but... # what do we do in this situation. We could just do nothing, # but that isn't cool for the user so we surface an error # and wait for them to fix it. set_flag("kubernetes-control-plane.keystone-policy-error") # note that information is surfaced to the user in the code above where we # write status. It will notify the user we are waiting on the policy file # to apply if the keystone-credentials.available flag is set, but # kubernetes-control-plane.keystone-policy-handled is not set. @when("leadership.is_leader", "kubernetes-control-plane.keystone-policy-handled") @when_not("keystone-credentials.available") def remove_keystone(): clear_flag("kubernetes-control-plane.apiserver.configured") if not os.path.exists(keystone_policy_path): clear_flag("kubernetes-control-plane.keystone-policy-handled") elif kubectl_manifest("delete", keystone_policy_path): os.remove(keystone_policy_path) clear_flag("kubernetes-control-plane.keystone-policy-handled") @when("keystone-credentials.connected") def setup_keystone_user(): # This seems silly, but until we request a user from keystone # we don't get information about the keystone server... 
    ks = endpoint_from_flag("keystone-credentials.connected")
    ks.request_credentials("k8s")


def _kick_controller_manager():
    """Re-render controller-manager config, but only once components started."""
    if is_flag_set("kubernetes-control-plane.components.started"):
        configure_controller_manager()


@when(
    "keystone.credentials.configured", "leadership.set.keystone-cdk-addons-configured"
)
@when_not("keystone.apiserver.configured")
def keystone_kick_apiserver():
    """Force an apiserver reconfigure once keystone credentials are in place."""
    clear_flag("kubernetes-control-plane.apiserver.configured")


@when(
    "keystone-credentials.available",
    "certificates.ca.available",
    "certificates.client.cert.available",
    "authentication.setup",
    "etcd.available",
    "leadership.set.keystone-cdk-addons-configured",
)
def keystone_config():
    """React to keystone endpoint details changing.

    When host/proto/port/version differ from what was last seen, rebuild
    kubeconfig, re-apply the keystone policy configmap, and trigger an
    apiserver reconfigure. build_kubeconfig is defined elsewhere in this file.
    """
    # first, we have to have the service set up before we can render this stuff
    ks = endpoint_from_flag("keystone-credentials.available")
    data = {
        "host": ks.credentials_host(),
        "proto": ks.credentials_protocol(),
        "port": ks.credentials_port(),
        "version": ks.api_version(),
    }
    if data_changed("keystone", data):
        remove_state("keystone.credentials.configured")
        clear_flag("kubernetes-control-plane.apiserver.configured")
        build_kubeconfig()
        generate_keystone_configmap()
        set_state("keystone.credentials.configured")


@when("layer.vault-kv.app-kv.set.encryption_key", "layer.vaultlocker.ready")
@when_not("kubernetes-control-plane.secure-storage.created")
def create_secure_storage():
    """Create an encrypted loop mount for the apiserver encryption config."""
    encryption_conf_dir = encryption_config_path().parent
    encryption_conf_dir.mkdir(mode=0o700, parents=True, exist_ok=True)
    try:
        vaultlocker.create_encrypted_loop_mount(encryption_conf_dir)
    except vaultlocker.VaultLockerError:
        # One common cause of this would be deploying on lxd.
        # Should this be more fatal?
        hookenv.log(
            "Unable to create encrypted mount for storing encryption config.\n"
            "{}".format(traceback.format_exc()),
            level=hookenv.ERROR,
        )
        set_flag("kubernetes-control-plane.secure-storage.failed")
        clear_flag("kubernetes-control-plane.secure-storage.created")
    else:
        # TODO: If Vault isn't available, it's probably still better to encrypt
        # anyway and store the key in plaintext and leadership than to just
        # give up on encryption entirely.
        _write_encryption_config()
        # prevent an unnecessary service restart on this
        # unit since we've already handled the change
        clear_flag("layer.vault-kv.app-kv.changed.encryption_key")
        # mark secure storage as ready
        set_flag("kubernetes-control-plane.secure-storage.created")
        clear_flag("kubernetes-control-plane.secure-storage.failed")
        # restart to regen config
        clear_flag("kubernetes-control-plane.apiserver.configured")


@when_not("layer.vaultlocker.ready")
@when("kubernetes-control-plane.secure-storage.created")
def revert_secure_storage():
    """Drop secure-storage flags if vaultlocker is no longer available."""
    clear_flag("kubernetes-control-plane.secure-storage.created")
    clear_flag("kubernetes-control-plane.secure-storage.failed")
    clear_flag("kubernetes-control-plane.apiserver.configured")


@when("leadership.is_leader", "layer.vault-kv.ready")
@when_not("layer.vault-kv.app-kv.set.encryption_key")
def generate_encryption_key():
    """Leader-only: generate and store the secrets encryption key in Vault."""
    app_kv = vault_kv.VaultAppKV()
    app_kv["encryption_key"] = kubernetes_control_plane.token_generator(32)


@when(
    "layer.vault-kv.app-kv.changed.encryption_key",
    "kubernetes-control-plane.secure-storage.created",
)
def restart_apiserver_for_encryption_key():
    """Trigger an apiserver reconfigure when the encryption key changes."""
    clear_flag("kubernetes-control-plane.apiserver.configured")
    clear_flag("layer.vault-kv.app-kv.changed.encryption_key")


def _write_encryption_config():
    """Render the EncryptionConfig file from the Vault-held key (0600 perms)."""
    app_kv = vault_kv.VaultAppKV()
    encryption_config_path().parent.mkdir(parents=True, exist_ok=True)
    secret = app_kv["encryption_key"]
    # kube-apiserver expects the aescbc key material base64-encoded
    secret = base64.b64encode(secret.encode("utf8")).decode("utf8")
    host.write_file(
        path=str(encryption_config_path()),
        perms=0o600,
        content=yaml.safe_dump(
            {
"kind": "EncryptionConfig", "apiVersion": "v1", "resources": [ { "resources": ["secrets"], "providers": [ { "aescbc": { "keys": [ { "name": "key1", "secret": secret, } ], } }, {"identity": {}}, ], } ], } ), ) @when_any("config.changed.pod-security-policy") def pod_security_policy_config_changed(): clear_flag("kubernetes-control-plane.pod-security-policy.applied") @when_any("config.changed.ha-cluster-vip", "config.changed.ha-cluster-dns") def haconfig_changed(): clear_flag("hacluster-configured") @when("ha.connected", "kubernetes-control-plane.components.started") @when_not("hacluster-configured") def configure_hacluster(): # get a new cert if is_flag_set("certificates.available"): send_data() # update workers if is_flag_set("kube-control.connected"): send_api_urls() if is_flag_set("kube-api-endpoint.available"): push_service_data() set_flag("hacluster-configured") @when_not("ha.connected") @when("hacluster-configured") def remove_hacluster(): # get a new cert if is_flag_set("certificates.available"): send_data() # update workers if is_flag_set("kube-control.connected"): send_api_urls() if is_flag_set("kube-api-endpoint.available"): push_service_data() clear_flag("hacluster-configured") class InvalidDnsProvider(Exception): def __init__(self, value): self.value = value def get_dns_provider(): valid_dns_providers = ["auto", "core-dns", "kube-dns", "none"] if get_version("kube-apiserver") < (1, 14): valid_dns_providers.remove("core-dns") dns_provider = hookenv.config("dns-provider").lower() if dns_provider not in valid_dns_providers: raise InvalidDnsProvider(dns_provider) if dns_provider == "auto": dns_provider = leader_get("auto_dns_provider") # On new deployments, the first time this is called, auto_dns_provider # hasn't been set yet. We need to make a choice now. if not dns_provider: if "core-dns" in valid_dns_providers: dns_provider = "core-dns" else: dns_provider = "kube-dns" # LP: 1833089. 
Followers end up here when setting final status; ensure only # leaders call leader_set. if is_state("leadership.is_leader"): leader_set(auto_dns_provider=dns_provider) return dns_provider @when("kube-control.connected") @when_not("kubernetes-control-plane.sent-registry") def send_registry_location(): registry_location = hookenv.config("image-registry") kube_control = endpoint_from_flag("kube-control.connected") # Send registry to workers kube_control.set_registry_location(registry_location) # Construct and send the sandbox image (pause container) to our runtime runtime = endpoint_from_flag("endpoint.container-runtime.available") if not runtime: hookenv.log( "Container runtime not yet available, will retry setting sandbox image" ) return uri = get_sandbox_image_uri(registry_location) runtime.set_config(sandbox_image=uri) set_flag("kubernetes-control-plane.sent-registry") @when( "leadership.is_leader", "leadership.set.kubernetes-master-addons-restart-for-ca", "kubernetes-control-plane.components.started", ) def restart_addons_for_ca(): try: # Get deployments/daemonsets/statefulsets output = kubectl( "get", "daemonset,deployment,statefulset", "-o", "json", "--all-namespaces", "-l", "cdk-restart-on-ca-change=true", ).decode("UTF-8") deployments = json.loads(output)["items"] # Get ServiceAccounts service_account_names = set( ( deployment["metadata"]["namespace"], deployment["spec"]["template"]["spec"].get( "serviceAccountName", "default" ), ) for deployment in deployments ) service_accounts = [] for namespace, name in service_account_names: output = kubectl( "get", "ServiceAccount", name, "-o", "json", "-n", namespace ).decode("UTF-8") service_account = json.loads(output) service_accounts.append(service_account) # Get ServiceAccount secrets secret_names = set() for service_account in service_accounts: namespace = service_account["metadata"]["namespace"] for secret in service_account["secrets"]: secret_names.add((namespace, secret["name"])) secrets = [] for namespace, 
name in secret_names:
            output = kubectl(
                "get", "Secret", name, "-o", "json", "-n", namespace
            ).decode("UTF-8")
            secret = json.loads(output)
            secrets.append(secret)

        # Check secrets have updated CA
        with open(ca_crt_path, "rb") as f:
            ca = f.read()
        # Secret data values are base64-encoded; compare in encoded form
        encoded_ca = base64.b64encode(ca).decode("UTF-8")
        mismatched_secrets = [
            secret for secret in secrets if secret["data"]["ca.crt"] != encoded_ca
        ]
        if mismatched_secrets:
            hookenv.log(
                "ServiceAccount secrets do not have correct ca.crt: "
                + ",".join(secret["metadata"]["name"] for secret in mismatched_secrets)
            )
            hookenv.log("Waiting to retry restarting addons")
            return

        # Now restart the addons
        for deployment in deployments:
            kind = deployment["kind"]
            namespace = deployment["metadata"]["namespace"]
            name = deployment["metadata"]["name"]
            hookenv.log("Restarting addon: %s %s %s" % (kind, namespace, name))
            kubectl("rollout", "restart", kind + "/" + name, "-n", namespace)

        # clear the leadership flag so this handler stops firing
        leader_set({"kubernetes-master-addons-restart-for-ca": None})
    except Exception:
        # best-effort: log and retry on a later hook invocation
        hookenv.log(traceback.format_exc())
        hookenv.log("Waiting to retry restarting addons")


def add_systemd_iptables_patch():
    """Install and start a systemd service that patches kube-proxy iptables."""
    source = "templates/kube-proxy-iptables-fix.sh"
    dest = "/usr/local/bin/kube-proxy-iptables-fix.sh"
    copyfile(source, dest)
    os.chmod(dest, 0o775)

    template = "templates/service-iptables-fix.service"
    dest_dir = "/etc/systemd/system"
    os.makedirs(dest_dir, exist_ok=True)
    service_name = "kube-proxy-iptables-fix.service"
    copyfile(template, "{}/{}".format(dest_dir, service_name))

    check_call(["systemctl", "daemon-reload"])

    # enable and run the service
    service_resume(service_name)


@when(
    "leadership.is_leader",
    "kubernetes-control-plane.components.started",
    "endpoint.prometheus.joined",
    "certificates.ca.available",
)
def register_prometheus_jobs():
    """Register scrape jobs (from templates/prometheus) on each relation.

    get_token is defined elsewhere in this file.
    """
    prometheus = endpoint_from_flag("endpoint.prometheus.joined")
    tls = endpoint_from_flag("certificates.ca.available")
    monitoring_token = get_token("system:monitoring")

    for relation in prometheus.relations:
        endpoints = \
kubernetes_control_plane.get_internal_api_endpoints(relation)
        if not endpoints:
            continue
        # advertise only the first endpoint to prometheus
        address, port = endpoints[0]

        templates_dir = Path("templates")
        for job_file in Path("templates/prometheus").glob("*.yaml.j2"):
            prometheus.register_job(
                relation=relation,
                job_name=job_file.name.split(".")[0],
                job_data=yaml.safe_load(
                    render(
                        source=str(job_file.relative_to(templates_dir)),
                        target=None,  # don't write file, just return data
                        context={
                            "k8s_api_address": address,
                            "k8s_api_port": port,
                            "k8s_token": monitoring_token,
                        },
                    )
                ),
                ca_cert=tls.root_ca_cert,
            )


def detect_telegraf():
    """Return True if a telegraf subordinate charm is deployed on this machine."""
    # Telegraf uses the implicit juju-info relation, which makes it difficult
    # to tell if it's related. The "best" option is to look for the subordinate
    # charm on disk.
    for charm_dir in Path("/var/lib/juju/agents").glob("unit-*/charm"):
        metadata = yaml.safe_load((charm_dir / "metadata.yaml").read_text())
        if "telegraf" in metadata["name"]:
            return True
    else:
        # for/else: loop finished without finding telegraf (or found no units)
        return False


@when(
    "leadership.is_leader",
    "kubernetes-control-plane.components.started",
    "endpoint.grafana.joined",
)
def register_grafana_dashboards():
    """Register conditional and autoload Grafana dashboards over the relation."""
    grafana = endpoint_from_flag("endpoint.grafana.joined")

    # load conditional dashboards
    dash_dir = Path("templates/grafana/conditional")
    if is_flag_set("endpoint.prometheus.joined"):
        dashboard = (dash_dir / "prometheus.json").read_text()
        grafana.register_dashboard("prometheus", json.loads(dashboard))
    if detect_telegraf():
        dashboard = (dash_dir / "telegraf.json").read_text()
        grafana.register_dashboard("telegraf", json.loads(dashboard))

    # load automatic dashboards
    dash_dir = Path("templates/grafana/autoload")
    for dash_file in dash_dir.glob("*.json"):
        dashboard = dash_file.read_text()
        grafana.register_dashboard(dash_file.stem, json.loads(dashboard))


@when("endpoint.aws-iam.ready")
@when_not("kubernetes-control-plane.aws-iam.configured")
def enable_aws_iam_webhook():
    # if etcd isn't available yet, we'll set this up later
    # when we start the api server.
if is_flag_set("etcd.available"): # call the other things we need to update clear_flag("kubernetes-control-plane.apiserver.configured") build_kubeconfig() set_flag("kubernetes-control-plane.aws-iam.configured") @when("kubernetes-control-plane.components.started", "endpoint.aws-iam.available") def api_server_started(): aws_iam = endpoint_from_flag("endpoint.aws-iam.available") if aws_iam: aws_iam.set_api_server_status(True) @when_not("kubernetes-control-plane.components.started") @when("endpoint.aws-iam.available") def api_server_stopped(): aws_iam = endpoint_from_flag("endpoint.aws-iam.available") if aws_iam: aws_iam.set_api_server_status(False) @when("kube-control.connected") def send_default_cni(): """Send the value of the default-cni config to the kube-control relation. This allows kubernetes-worker to use the same config value as well. """ default_cni = hookenv.config("default-cni") kube_control = endpoint_from_flag("kube-control.connected") kube_control.set_default_cni(default_cni) @when("config.changed.default-cni") def default_cni_changed(): remove_state("kubernetes-control-plane.components.started") @when( "kubernetes-control-plane.components.started", "kubernetes-control-plane.apiserver.configured", "endpoint.container-runtime.available", ) @when_not("kubernetes-control-plane.kubelet.configured") def configure_kubelet(): uid = hookenv.local_unit() username = "system:node:{}".format(get_node_name().lower()) group = "system:nodes" token = get_token(username) if not token: setup_tokens(None, username, uid, group) token = get_token(username) if not token: hookenv.log( "Failed to create token for {}; will retry".format(username), hookenv.WARNING, ) return has_xcp = has_external_cloud_provider() local_endpoint = kubernetes_control_plane.get_local_api_endpoint() local_url = kubernetes_control_plane.get_api_url(local_endpoint) create_kubeconfig( kubelet_kubeconfig_path, local_url, ca_crt_path, token=token, user="kubelet" ) dns_ready, dns_ip, dns_port, dns_domain = 
get_dns_info()
    if not dns_ready:
        hookenv.log("DNS not ready, waiting to configure Kubelet")
        return
    # record the DNS settings used so watch_dns_for_changes can diff them
    dns_info = [dns_ip, dns_port, dns_domain]
    db.set("kubernetes-master.kubelet.dns-used", dns_info)
    registry = hookenv.config("image-registry")
    taints = hookenv.config("register-with-taints").split()
    kubernetes_common.configure_kubelet(
        dns_domain, dns_ip, registry, taints=taints, has_xcp=has_xcp
    )
    service_restart("snap.kubelet.daemon")
    set_state("node.label-config-required")
    set_flag("kubernetes-control-plane.kubelet.configured")


@when(
    "node.label-config-required",
    "kubernetes-control-plane.kubelet.configured",
    "kubernetes-control-plane.apiserver.configured",
    "authentication.setup",
)
def apply_node_labels():
    """Apply configured node labels; retried until LabelMaker succeeds.

    LabelMaker is imported/defined elsewhere in this file.
    """
    # Label configuration complete.
    label_maker = LabelMaker(kubeclientconfig_path)
    try:
        label_maker.apply_node_labels()
    except LabelMaker.NodeLabelError:
        # keep node.label-config-required set so this handler runs again
        return
    remove_state("node.label-config-required")


@when_any("config.changed.kubelet-extra-args", "config.changed.kubelet-extra-config")
def reconfigure_kubelet():
    # LP bug #1826833, always delete the state file when extra config changes
    # since CPU manager doesn't support offlining and onlining of CPUs at runtime.
cpu_manager_state = "/var/lib/kubelet/cpu_manager_state" if os.path.isfile(cpu_manager_state): hookenv.log("Removing file: " + cpu_manager_state) os.remove(cpu_manager_state) clear_flag("kubernetes-control-plane.kubelet.configured") @when("kubernetes-control-plane.kubelet.configured") def watch_dns_for_changes(): dns_ready, dns_ip, dns_port, dns_domain = get_dns_info() dns_info = [dns_ip, dns_port, dns_domain] previous_dns_info = db.get("kubernetes-master.kubelet.dns-used") dns_changed = dns_info != previous_dns_info if dns_ready and dns_changed: hookenv.log("DNS info has changed, will reconfigure Kubelet") clear_flag("kubernetes-control-plane.kubelet.configured") @when("cni.available") @when_not("kubernetes-control-plane.default-cni.configured") def configure_default_cni(): default_cni = hookenv.config("default-cni") kubernetes_common.configure_default_cni(default_cni) set_flag("kubernetes-control-plane.default-cni.configured") @when("ceph-client.available") @when_not("kubernetes-control-plane.ceph.permissions.requested") def request_ceph_permissions(): ceph_client = endpoint_from_flag("ceph-client.available") request = ceph_client.get_current_request() or CephBrokerRq() # Permissions needed for Ceph CSI # https://github.com/ceph/ceph-csi/blob/v3.6.0/docs/capabilities.md permissions = [ "mon", "profile rbd, allow r", "mds", "allow rw", "mgr", "allow rw", "osd", "profile rbd, allow rw tag cephfs metadata=*", ] client_name = hookenv.application_name() request.add_op( {"op": "set-key-permissions", "permissions": permissions, "client": client_name} ) ceph_client.send_request_if_needed(request) set_flag("kubernetes-control-plane.ceph.permissions.requested") HEAL_HANDLER = { "kube-apiserver": { "run": configure_apiserver, "clear_flags": [ "kubernetes-control-plane.apiserver.configured", "kubernetes-control-plane.apiserver.running", ], }, "kube-controller-manager": {"run": configure_controller_manager, "clear_flags": []}, "kube-scheduler": {"run": configure_scheduler, 
"clear_flags": []}, "kube-proxy": { "run": start_control_plane, "clear_flags": ["kubernetes-control-plane.components.started"], }, "kubelet": {"run": reconfigure_kubelet, "clear_flags": []}, }