Charmed-Kubernetes/etcd/reactive/etcd.py

#!/usr/bin/python3

from charms import layer

from charms.layer import snap

from charms.reactive import endpoint_from_flag
from charms.reactive import when
from charms.reactive import when_any
from charms.reactive import when_not
from charms.reactive import is_state
from charms.reactive import set_state
from charms.reactive import is_flag_set
from charms.reactive import remove_state
from charms.reactive import set_flag
from charms.reactive import clear_flag
from charms.reactive import hook
from charms.reactive import register_trigger
from charms.reactive.helpers import data_changed

from charmhelpers.core.templating import render

from charmhelpers.core.hookenv import config
from charmhelpers.core.hookenv import log
from charmhelpers.core.hookenv import DEBUG

from charmhelpers.core.hookenv import leader_set
from charmhelpers.core.hookenv import leader_get
from charmhelpers.core.hookenv import storage_get

from charmhelpers.core.hookenv import application_version_set
from charmhelpers.core.hookenv import open_port
from charmhelpers.core.hookenv import close_port
from charmhelpers.core.host import write_file
from charmhelpers.core import hookenv
from charmhelpers.core import host
from charmhelpers.contrib.charmsupport import nrpe

from charms.layer import status

from etcdctl import EtcdCtl
from etcdctl import get_connection_string
from etcd_databag import EtcdDatabag
from etcd_lib import (
    get_ingress_address,
    get_ingress_addresses,
    render_grafana_dashboard,
)

from shlex import split
from subprocess import check_call
from subprocess import check_output
from subprocess import CalledProcessError
from shutil import copyfile

import json
import os
import charms.leadership  # noqa
import socket
import time
import traceback
import yaml
import shutil
import random


# Layer Note:   the @when_not etcd.installed state checks are relating to
# a boundry that was superimposed by the etcd-24 release which added support
# for snaps. Snapped etcd is now the only supported mechanism by this charm.
# References to this state will be wiped sometime within the next 10 releases
# of the charm.


# Override the default nagios shortname regex to allow periods, which we
# need because our bin names contain them (e.g. 'snap.foo.daemon'). The
# default regex in charmhelpers doesn't allow periods, but nagios itself does.
nrpe.Check.shortname_re = r"[\.A-Za-z0-9-_]+$"

GRAFANA_DASHBOARD_NAME = "etcd"

register_trigger(when_not="endpoint.grafana.joined", clear_flag="grafana.configured")
register_trigger(
    when_not="endpoint.prometheus.joined", clear_flag="prometheus.configured"
)
register_trigger(when_not="endpoint.prometheus.joined", clear_flag="grafana.configured")


def get_target_etcd_channel():
    """
    Check whether or not etcd is already installed. i.e. we're
    going through an upgrade.  If so, leave the etcd version alone,
    if we're a new install, we can set the default channel here.

    If the user has specified a version, then just return that.

    :return: String snap channel
    """
    channel = hookenv.config("channel")
    if channel == "auto":
        if snap.is_installed("etcd"):
            return False
        else:
            return "3.4/stable"
    else:
        return channel


@when("etcd.installed")
def snap_upgrade_notice():
    status.blocked("Manual migration required. http://bit.ly/2oznAUZ")


@when_any("etcd.registered", "etcd.leader.configured")
@when_not("etcd.installed")
@when_not("upgrade.series.in-progress")
def check_cluster_health():
    """report on the cluster health every 5 minutes"""
    etcdctl = EtcdCtl()
    health = etcdctl.cluster_health()

    # Determine if the unit is healthy or unhealthy
    if "unhealthy" in health["status"]:
        unit_health = "UnHealthy"
    else:
        unit_health = "Healthy"

    # Determine units peer count, and surface 0 by default
    try:
        peers = len(etcdctl.member_list())
    except Exception:
        unit_health = "Errored"
        peers = 0

    bp = "{0} with {1} known peer{2}"
    status_message = bp.format(unit_health, peers, "s" if peers != 1 else "")

    if unit_health in ["UnHealthy", "Errored"]:
        status.blocked(status_message)
    else:
        status.active(status_message)


@when("snap.installed.etcd")
@when_not("etcd.installed")
def set_app_version():
    """Surface the etcd application version on juju status"""
    # note - the snap doesn't place an etcd alias on disk. This shall infer
    # the version from etcdctl, as the snap distributes both in lockstep.
    application_version_set(etcd_version())


@when_not("certificates.available")
def missing_relation_notice():
    status.blocked("Missing relation to certificate authority.")


@when("certificates.available")
def prepare_tls_certificates(tls):
    try:
        common_name = hookenv.unit_public_ip()
    except CalledProcessError as e:
        msg = "Public address not available yet"
        hookenv.log(msg, hookenv.WARNING)
        hookenv.log(e, hookenv.WARNING)
        return

    sans = set()
    sans.add(common_name)
    sans.update(get_ingress_addresses("db"))
    sans.update(get_ingress_addresses("cluster"))
    sans.add(socket.gethostname())

    # add cluster peers as alt names when present
    cluster = endpoint_from_flag("cluster.joined")
    if cluster:
        for ip in cluster.get_db_ingress_addresses():
            sans.add(ip)

    sans = sorted(sans)
    certificate_name = hookenv.local_unit().replace("/", "_")
    tls.request_server_cert(common_name, sans, certificate_name)


@hook("upgrade-charm")
def remove_states():
    # stale state cleanup (pre rev6)
    remove_state("etcd.tls.secured")
    remove_state("etcd.ssl.placed")
    remove_state("etcd.ssl.exported")
    remove_state("etcd.nrpe.configured")
    # force a config re-render in case template changed
    set_state("etcd.rerender-config")


@hook("pre-series-upgrade")
def pre_series_upgrade():
    bag = EtcdDatabag()
    host.service_pause(bag.etcd_daemon)
    status.blocked("Series upgrade in progress")


@hook("post-series-upgrade")
def post_series_upgrade():
    bag = EtcdDatabag()
    host.service_resume(bag.etcd_daemon)


@when("snap.installed.etcd")
@when("leadership.is_leader")
@when_any("config.changed.port", "config.changed.management_port")
@when_not("etcd.installed")
@when_not("upgrade.series.in-progress")
def leader_config_changed():
    """The leader executes the runtime configuration update for the cluster,
    as it is the controlling unit. Will render config, close and open ports and
    restart the etcd service."""
    configuration = hookenv.config()
    previous_port = configuration.previous("port")
    log("Previous port: {0}".format(previous_port))
    previous_mgmt_port = configuration.previous("management_port")
    log("Previous management port: {0}".format(previous_mgmt_port))

    if previous_port and previous_mgmt_port:
        bag = EtcdDatabag()
        etcdctl = EtcdCtl()
        members = etcdctl.member_list()
        # Iterate over all the members in the list.
        for unit_name in members:
            # Grab the previous peer url and replace the management port.
            peer_urls = members[unit_name]["peer_urls"]
            log("Previous peer url: {0}".format(peer_urls))
            old_port = ":{0}".format(previous_mgmt_port)
            new_port = ":{0}".format(configuration.get("management_port"))
            url = peer_urls.replace(old_port, new_port)
            # Update the member's peer_urls with the new ports.
            log(etcdctl.member_update(members[unit_name]["unit_id"], url))
        # Render just the leaders configuration with the new values.
        render_config()
        address = get_ingress_address("cluster")
        leader_set(
            {"leader_address": get_connection_string([address], bag.management_port)}
        )
        host.service_restart(bag.etcd_daemon)


@when("snap.installed.etcd")
@when_not("leadership.is_leader")
@when_any("config.changed.port", "config.changed.management_port")
@when_not("etcd.installed")
def follower_config_changed():
    """Follower units need to render the configuration file, close and open
    ports, and restart the etcd service."""
    set_state("etcd.rerender-config")


@when("snap.installed.etcd")
@when("config.changed.bind_to_all_interfaces")
@when_not("upgrade.series.in-progress")
def bind_to_all_interfaces_changed():
    set_state("etcd.rerender-config")


@when("snap.installed.etcd")
@when("config.changed.tls_cipher_suites")
@when_not("upgrade.series.in-progress")
def tls_cipher_suites_changed():
    set_state("etcd.rerender-config")


@when("etcd.rerender-config")
@when_not("upgrade.series.in-progress")
def rerender_config():
    """Config must be updated and service restarted"""
    bag = EtcdDatabag()
    log("Rendering config file for {0}".format(bag.unit_name))
    render_config()
    if host.service_running(bag.etcd_daemon):
        host.service_restart(bag.etcd_daemon)
    set_app_version()


@when("cluster.joined")
def set_db_ingress_address(cluster):
    """Send db ingress address to peers on the cluster relation"""
    address = get_ingress_address("db")
    cluster.set_db_ingress_address(address)


@when("db.connected")
@when("etcd.ssl.placed")
@when("cluster.joined")
def send_cluster_connection_details(cluster, db):
    """Need to set the cluster connection string and
    the client key and certificate on the relation object."""
    cert = read_tls_cert("client.crt")
    key = read_tls_cert("client.key")
    ca = read_tls_cert("ca.crt")
    etcdctl = EtcdCtl()

    # Set the key, cert, and ca on the db relation
    db.set_client_credentials(key, cert, ca)

    port = hookenv.config().get("port")
    # Get all the peers participating in the cluster relation.
    members = cluster.get_db_ingress_addresses()
    # Append our own address to the membership list, because peers dont self
    # actualize
    address = get_ingress_address("db")
    members.append(address)
    members.sort()
    # Create a connection string with all the members on the configured port.
    connection_string = get_connection_string(members, port)
    # Set the connection string on the db relation.
    db.set_connection_string(connection_string, version=etcdctl.version())


@when("db.connected")
@when("etcd.ssl.placed")
@when_not("cluster.joined")
def send_single_connection_details(db):
    """ """
    cert = read_tls_cert("client.crt")
    key = read_tls_cert("client.key")
    ca = read_tls_cert("ca.crt")

    etcdctl = EtcdCtl()

    # Set the key and cert on the db relation
    db.set_client_credentials(key, cert, ca)

    bag = EtcdDatabag()
    # Get all the peers participating in the cluster relation.
    address = get_ingress_address("db")
    members = [address]
    # Create a connection string with this member on the configured port.
    connection_string = get_connection_string(members, bag.port)
    # Set the connection string on the db relation.
    db.set_connection_string(connection_string, version=etcdctl.version())


@when("proxy.connected")
@when("etcd.ssl.placed")
@when_any("etcd.leader.configured", "cluster.joined")
def send_cluster_details(proxy):
    """Sends the peer cluster string to proxy units so they can join and act
    on behalf of the cluster."""
    cert = read_tls_cert("client.crt")
    key = read_tls_cert("client.key")
    ca = read_tls_cert("ca.crt")
    proxy.set_client_credentials(key, cert, ca)

    # format a list of cluster participants
    etcdctl = EtcdCtl()
    peers = etcdctl.member_list()
    cluster = []
    for peer in peers:
        thispeer = peers[peer]
        # Potential member doing registration. Default to skip
        if "peer_urls" not in thispeer.keys() or not thispeer["peer_urls"]:
            continue
        peer_string = "{}={}".format(thispeer["name"], thispeer["peer_urls"])
        cluster.append(peer_string)

    proxy.set_cluster_string(",".join(cluster))


@when("config.changed.channel")
def channel_changed():
    """Ensure that the config is updated if the channel changes."""
    set_state("etcd.rerender-config")


@when("config.changed.channel")
@when_not("etcd.installed")
def snap_install():
    channel = get_target_etcd_channel()
    snap.install("core")
    if channel:
        snap.install("etcd", channel=channel, classic=False)
        remove_state("etcd.ssl.exported")


@when("etcd.ssl.placed")
@when_not("snap.installed.etcd")
def install_etcd():
    """Attempt resource get on the "etcd" and "etcdctl" resources. If no
    resources are provided attempt to install from the archive only on the
    16.04 (xenial) series."""

    if is_state("etcd.installed"):
        msg = "Manual upgrade required. run-action snap-upgrade."
        status.blocked(msg)
        return

    status.maintenance("Installing etcd.")

    channel = get_target_etcd_channel()
    if channel:
        snap.install("etcd", channel=channel, classic=False)


@when("snap.installed.etcd")
@when_not("etcd.service-restart.configured")
@when_not("upgrade.series.in-progress")
def add_systemd_restart_always():
    template = "templates/service-always-restart.systemd-latest.conf"
    service = "snap.etcd.etcd"

    try:
        # Get the systemd version
        cmd = ["systemd", "--version"]
        output = check_output(cmd).decode("UTF-8")
        line = output.splitlines()[0]
        words = line.split()
        assert words[0] == "systemd"
        systemd_version = int(words[1])

        # Check for old version (for xenial support)
        if systemd_version < 230:
            template = "templates/service-always-restart.systemd-229.conf"
    except Exception:
        traceback.print_exc()
        hookenv.log(
            "Failed to detect systemd version, using latest template", level="ERROR"
        )

    dest_dir = "/etc/systemd/system/{}.service.d".format(service)
    os.makedirs(dest_dir, exist_ok=True)
    copyfile(template, "{}/always-restart.conf".format(dest_dir))
    check_call(["systemctl", "daemon-reload"])
    host.service_restart("{}.service".format(service))
    set_state("etcd.service-restart.configured")


@when("snap.installed.etcd")
@when("etcd.ssl.placed")
@when("cluster.joined")
@when_not("leadership.is_leader")
@when_not("etcd.registered")
@when_not("etcd.installed")
@when_not("upgrade.series.in-progress")
def register_node_with_leader(cluster):
    """
    Control flow mechanism to perform self registration with the leader.

    Before executing self registration, we must adhere to the nature of offline
    static turnup rules. If we find a GUID in the member list without peering
    information the unit will enter a race condition and must wait for a clean
    status output before we can progress to self registration.
    """
    etcdctl = EtcdCtl()
    bag = EtcdDatabag()
    leader_address = leader_get("leader_address")
    bag.leader_address = leader_address

    try:
        # Check if we are already registered. Unregister ourselves if we are so
        # we can register from scratch.
        peer_url = "https://%s:%s" % (bag.cluster_address, bag.management_port)
        members = etcdctl.member_list(leader_address)
        for _, member in members.items():
            if member["peer_urls"] == peer_url:
                log("Found member that matches our peer URL. Unregistering...")
                etcdctl.unregister(member["unit_id"], leader_address)

        # Now register.
        resp = etcdctl.register(bag.__dict__)
        bag.set_cluster(resp["cluster"])
    except EtcdCtl.CommandFailed:
        log("etcdctl.register failed, will retry")
        msg = "Waiting to retry etcd registration"
        status.waiting(msg)
        return

    render_config(bag)
    host.service_restart(bag.etcd_daemon)
    open_port(bag.port)
    set_state("etcd.registered")


@when("etcd.ssl.placed")
@when("leadership.is_leader")
@when_not("etcd.leader.configured")
@when_not("etcd.installed")
@when_not("upgrade.series.in-progress")
def initialize_new_leader():
    """Create an initial cluster string to bring up a single member cluster of
    etcd, and set the leadership data so the followers can join this one."""
    bag = EtcdDatabag()
    bag.token = bag.token
    bag.set_cluster_state("new")
    address = get_ingress_address("cluster")
    cluster_connection_string = get_connection_string([address], bag.management_port)
    bag.set_cluster("{}={}".format(bag.unit_name, cluster_connection_string))

    render_config(bag)
    host.service_restart(bag.etcd_daemon)

    # sorry, some hosts need this. The charm races with systemd and wins.
    time.sleep(2)

    # Check health status before we say we are good
    etcdctl = EtcdCtl()
    status = etcdctl.cluster_health()
    if "unhealthy" in status:
        status.blocked("Cluster not healthy.")
        return
    # We have a healthy leader, broadcast initial data-points for followers
    open_port(bag.port)
    leader_connection_string = get_connection_string([address], bag.port)
    leader_set({"leader_address": leader_connection_string, "cluster": bag.cluster})

    # set registered state since if we ever become a follower, we will not need
    # to re-register
    set_state("etcd.registered")

    # finish bootstrap delta and set configured state
    set_state("etcd.leader.configured")


@when("snap.installed.etcd")
@when("snap.refresh.set")
@when("leadership.is_leader")
def process_snapd_timer():
    """Set the snapd refresh timer on the leader so all cluster members
    (present and future) will refresh near the same time."""
    # Get the current snapd refresh timer; we know layer-snap has set this
    # when the 'snap.refresh.set' flag is present.
    timer = snap.get(snapname="core", key="refresh.timer").decode("utf-8").strip()
    if not timer:
        # The core snap timer is empty. This likely means a subordinate timer
        # reset ours. Try to set it back to a previously leader-set value,
        # falling back to config if needed. Luckily, this should only happen
        # during subordinate install, so this should remain stable afterward.
        timer = leader_get("snapd_refresh") or hookenv.config("snapd_refresh")
        snap.set_refresh_timer(timer)

        # Ensure we have the timer known by snapd (it may differ from config).
        timer = snap.get(snapname="core", key="refresh.timer").decode("utf-8").strip()

    # The first time through, data_changed will be true. Subsequent calls
    # should only update leader data if something changed.
    if data_changed("etcd_snapd_refresh", timer):
        log("setting snapd_refresh timer to: {}".format(timer))
        leader_set({"snapd_refresh": timer})


@when("snap.installed.etcd")
@when("snap.refresh.set")
@when("leadership.changed.snapd_refresh")
@when_not("leadership.is_leader")
def set_snapd_timer():
    """Set the snapd refresh.timer on non-leader cluster members."""
    # NB: This method should only be run when 'snap.refresh.set' is present.
    # Layer-snap will always set a core refresh.timer, which may not be the
    # same as our leader. Gating with 'snap.refresh.set' ensures layer-snap
    # has finished and we are free to set our config to the leader's timer.
    timer = leader_get("snapd_refresh") or ""  # None will cause error
    log("setting snapd_refresh timer to: {}".format(timer))
    snap.set_refresh_timer(timer)


@when(
    "tls_client.ca.saved",
    "tls_client.server.key.saved",
    "tls_client.server.certificate.saved",
    "tls_client.client.certificate.saved",
)
@when_not("etcd.ssl.placed")
def tls_state_control():
    """This state represents all the complexity of handling the TLS certs.
    instead of stacking decorators, this state condenses it into a single
    state we can gate on before progressing with secure setup. Also handles
    ensuring users of the system can access the TLS certificates"""

    bag = EtcdDatabag()
    if not os.path.isdir(bag.etcd_conf_dir):
        hookenv.log("Waiting for etcd conf creation.")
        return
    cmd = ["chown", "-R", "root:ubuntu", bag.etcd_conf_dir]
    check_call(cmd)
    set_state("etcd.ssl.placed")


@when("etcd.ssl.placed")
@when_any(
    "tls_client.ca.written",
    "tls_client.server.certificate.written",
    "tls_client.client.certificate.written",
)
@when_not("upgrade.series.in-progress")
def tls_update():
    """Handle changes to the TLS data by ensuring that the service is
    restarted.
    """
    # ensure config is updated with new certs and service restarted
    bag = EtcdDatabag()
    render_config(bag)
    host.service_restart(bag.etcd_daemon)

    # ensure that certs are re-echoed to the db relations
    remove_state("etcd.ssl.placed")
    remove_state("tls_client.ca.written")
    remove_state("tls_client.server.certificate.written")
    remove_state("tls_client.client.certificate.written")


@when("snap.installed.etcd")
@when_not("etcd.ssl.exported")
def render_default_user_ssl_exports():
    """Add secure credentials to default user environment configs,
    transparently adding TLS"""
    opts = layer.options("tls-client")

    ca_path = opts["ca_certificate_path"]
    client_crt = opts["client_certificate_path"]
    client_key = opts["client_key_path"]

    etcd_ver = etcd_version()
    if etcd_ver == "n/a":
        hookenv.log(
            "Unable to determine version format for etcd SSL config",
            level=hookenv.ERROR,
        )
        return
    major, minor, _ = etcd_ver.split(".")

    if int(major) >= 3 and int(minor) >= 3:
        evars = [
            "export ETCDCTL_KEY={}\n".format(client_key),
            "export ETCDCTL_CERT={}\n".format(client_crt),
            "export ETCDCTL_CACERT={}\n".format(ca_path),
        ]
    else:
        evars = [
            "export ETCDCTL_KEY_FILE={}\n".format(client_key),
            "export ETCDCTL_CERT_FILE={}\n".format(client_crt),
            "export ETCDCTL_CA_FILE={}\n".format(ca_path),
        ]

    with open("/home/ubuntu/.bash_aliases", "w") as fp:
        fp.writelines(evars)
    with open("/root/.bash_aliases", "w") as fp:
        fp.writelines(evars)

    set_state("etcd.ssl.exported")


def force_rejoin():
    """Wipe local data and rejoin new cluster formed by leader unit

    This action is required if leader unit performed snapshot restore. All
    other members must remove their local data and previous cluster
    identities and join newly formed, restored, cluster.
    """
    log("Wiping local storage and rejoining cluster")
    conf = EtcdDatabag()
    host.service_stop(conf.etcd_daemon)
    clear_flag("etcd.registered")
    etcd_data = os.path.join(conf.storage_path(), "member")
    if os.path.exists(etcd_data):
        shutil.rmtree(etcd_data)
    for _ in range(11):
        # We need randomized back-off timer because only one unit can be
        # joining at the same time
        time.sleep(random.randint(1, 10))
        register_node_with_leader(None)
        if is_flag_set("etcd.registered"):
            log("Successfully rejoined the cluster")
            break


@when("leadership.changed.force_rejoin")
@when_not("leadership.is_leader")
def force_rejoin_requested():
    force_rejoin()
    check_cluster_health()


@when("cluster-relation-broken")
def cluster_relation_broken(cluster=None):
    perform_self_unregistration()


@hook("stop")
def stop_hook():
    perform_self_unregistration(skip_exception=True)


def perform_self_unregistration(skip_exception=None):
    """Attempt self removal during unit teardown."""
    etcdctl = EtcdCtl()
    leader_address = leader_get("leader_address")
    unit_name = os.getenv("JUJU_UNIT_NAME").replace("/", "")
    members = etcdctl.member_list()
    # Self Unregistration
    loop = 0
    MAX_WAIT = 10
    while loop < MAX_WAIT:
        try:
            etcdctl.unregister(members[unit_name]["unit_id"], leader_address)
            break
        except EtcdCtl.CommandFailed as ex:
            # Randomized back-off timer to let cluster settle
            loop = loop + 1
            log("Trying to unregister self from the cluster failed, retrying...")
            if loop == MAX_WAIT:
                log(
                    "All tries for unregistration failed! Switching status to blocked..."
                )
                status.blocked("Unregistration failed for the departing unit/s.")
                if not skip_exception:
                    raise Exception("All tries for unregistration failed") from ex
            time.sleep(1)


@hook("data-storage-attached")
def format_and_mount_storage():
    """This allows users to request persistent volumes from the cloud provider
    for the purposes of disaster recovery."""
    set_state("data.volume.attached")
    # Query juju for the information about the block storage
    device_info = storage_get()
    block = device_info["location"]
    bag = EtcdDatabag()
    bag.cluster = leader_get("cluster")
    # the databag has behavior that keeps the path updated.
    # Reference the default path from layer_options.
    etcd_opts = layer.options("etcd")
    # Split the tail of the path to mount the volume 1 level before
    # the data directory.
    tail = os.path.split(bag.etcd_data_dir)[0]

    if volume_is_mounted(block):
        hookenv.log("Device is already attached to the system.")
        hookenv.log("Refusing to take action against {}".format(block))
        return

    # Format the device in non-interactive mode
    cmd = ["mkfs.ext4", device_info["location"], "-F"]
    hookenv.log("Creating filesystem on {}".format(device_info["location"]))
    hookenv.log("With command: {}".format(" ".join(cmd)))
    check_call(cmd)

    # halt etcd to perform the data-store migration
    host.service_stop(bag.etcd_daemon)

    os.makedirs(tail, exist_ok=True)
    mount_volume(block, tail)
    # handle first run during early-attach storage, pre-config-changed hook.
    os.makedirs(bag.etcd_data_dir, exist_ok=True)

    # Only attempt migration if directory exists
    if os.path.isdir(etcd_opts["etcd_data_dir"]):
        migrate_path = "{}/".format(etcd_opts["etcd_data_dir"])
        output_path = "{}/".format(bag.etcd_data_dir)
        cmd = ["rsync", "-azp", migrate_path, output_path]

        hookenv.log("Detected existing data, migrating to new location.")
        hookenv.log("With command: {}".format(" ".join(cmd)))

        check_call(cmd)

    with open("/etc/fstab", "r") as fp:
        contents = fp.readlines()

    found = 0
    # scan fstab for the device
    for line in contents:
        if block in line:
            found = found + 1

    # if device not in fstab, append so it persists through reboots
    if not found > 0:
        append = "{0} {1} ext4 defaults 0 0".format(block, tail)  # noqa
        with open("/etc/fstab", "a") as fp:
            fp.writelines([append])

    # Finally re-render the configuration and resume operation
    render_config(bag)
    host.service_restart(bag.etcd_daemon)


def read_tls_cert(cert):
    """Reads the contents of the layer-configured certificate path indicated
    by cert. Returns the utf-8 decoded contents of the file"""
    # Load the layer options for configured paths
    opts = layer.options("tls-client")

    # Retain a dict of the certificate paths
    cert_paths = {
        "ca.crt": opts["ca_certificate_path"],
        "server.crt": opts["server_certificate_path"],
        "server.key": opts["server_key_path"],
        "client.crt": opts["client_certificate_path"],
        "client.key": opts["client_key_path"],
    }

    # If requesting a cert we dont know about, raise a ValueError
    if cert not in cert_paths.keys():
        raise ValueError("No known certificate {}".format(cert))

    # Read the contents of the cert and return it in utf-8 encoded text
    with open(cert_paths[cert], "r") as fp:
        data = fp.read()
        return data


NPRE_EXTERNAL_RELATION = "nrpe-external-master"  # wokeignore:rule=master


@when(NPRE_EXTERNAL_RELATION + ".available")
@when_not(NPRE_EXTERNAL_RELATION + ".initial-config")
def initial_nrpe_config(nagios=None):
    set_state(NPRE_EXTERNAL_RELATION + ".initial-config")
    update_nrpe_config(nagios)


@when_any("config.changed.nagios_context", "config.changed.nagios_servicegroups")
def force_update_nrpe_config():
    remove_state("etcd.nrpe.configured")


@when("etcd.installed")
@when(NPRE_EXTERNAL_RELATION + ".available")
@when_not("etcd.nrpe.configured")
def update_nrpe_config(unused=None):
    # List of systemd services that will be checked
    services = ("snap.etcd.etcd",)

    # The current nrpe-external interface doesn't handle a lot of logic,
    # use the charm-helpers code for now.
    hostname = nrpe.get_nagios_hostname()
    current_unit = nrpe.get_nagios_unit_name()
    nrpe_setup = nrpe.NRPE(hostname=hostname, primary=False)
    # add our first check, to alert on service failure
    nrpe.add_init_service_checks(nrpe_setup, services, current_unit)

    # add the cron job to populate the cache for our second check
    # (we cache the output of 'etcdctl alarm list' to minimise overhead)
    with open("templates/check_etcd-alarms.cron") as fp:
        write_file(
            path="/etc/cron.d/check_etcd-alarms",
            content=fp.read().encode(),
            owner="root",
            perms=0o644,
        )

    # create an empty output file for the above
    write_file(
        path="/var/lib/nagios/etcd-alarm-list.txt",
        content="",
        owner="root",
        perms=0o644,
    )

    # install the NRPE script for the above
    with open("templates/check_etcd-alarms.py") as fp:
        write_file(
            path="/usr/lib/nagios/plugins/check_etcd-alarms.py",
            content=fp.read().encode(),
            owner="root",
            perms=0o755,
        )

    # define our second check, to alert on etcd alarm status
    nrpe_setup.add_check(
        "etcd-alarms",
        "Verify etcd has no raised alarms",
        "/usr/lib/nagios/plugins/check_etcd-alarms.py",
    )

    nrpe_setup.write()
    set_state("etcd.nrpe.configured")


@when_not(NPRE_EXTERNAL_RELATION + ".available")
@when(NPRE_EXTERNAL_RELATION + ".initial-config")
def remove_nrpe_config(nagios=None):
    remove_state(NPRE_EXTERNAL_RELATION + ".initial-config")

    # List of systemd services for which the checks will be removed
    services = ("snap.etcd.etcd",)

    # The current nrpe-external interface doesn't handle a lot of logic,
    # use the charm-helpers code for now.
    hostname = nrpe.get_nagios_hostname()
    nrpe_setup = nrpe.NRPE(hostname=hostname, primary=False)

    for service in services:
        nrpe_setup.remove_check(shortname=service)


@when("endpoint.prometheus.joined", "leadership.is_leader", "certificates.ca.available")
def register_prometheus_jobs():
    # This function is not guarded with `when_not("prometheus.configured")`
    # to account for possible changes of etcd units IP adresses and for when
    # etcd units are added/removed. Repeated calls to `prometheus.register_job()`
    # have no effect unless job_data changes.
    log("Registering Prometheus metrics collection.")
    prometheus = endpoint_from_flag("endpoint.prometheus.joined")
    cluster = endpoint_from_flag("cluster.joined")

    peer_ips = cluster.get_db_ingress_addresses() if cluster else []
    peer_ips.append(get_ingress_address("db"))
    targets = ["{}:{}".format(ip, config("port")) for ip in peer_ips]
    log("Configuring Prometheus scrape targets: {}".format(targets), DEBUG)
    prometheus.register_job(
        job_name="etcd",
        job_data={
            "scheme": "https",
            "static_configs": [
                {"targets": targets},
            ],
        },
    )
    set_flag("prometheus.configured")


@when("prometheus.configured", "endpoint.grafana.joined", "leadership.is_leader")
@when_not("grafana.configured")
def register_grafana_dashboard():
    log("Configuring grafana dashboard", level=hookenv.INFO)
    grafana = endpoint_from_flag("endpoint.grafana.joined")
    prometheus = endpoint_from_flag("endpoint.prometheus.joined")

    if not prometheus:
        log(
            "Prometheus relation not available. Skipping Grafana" " configuration.",
            hookenv.WARNING,
        )
        return

    if len(prometheus.relations) > 1:
        log(
            "Multiple prometheus relations detected. Default Grafana dashboard"
            " will configure only with one of them as datasource.",
            hookenv.WARNING,
        )

    datasource = prometheus.relations[0].application_name
    dashboard = render_grafana_dashboard(datasource)

    log(
        "Rendered Grafana dashboard:\n{}".format(json.dumps(dashboard)),
        level=hookenv.DEBUG,
    )
    grafana.register_dashboard(name=GRAFANA_DASHBOARD_NAME, dashboard=dashboard)
    log('Grafana dashboard "{}" registered.'.format(GRAFANA_DASHBOARD_NAME))
    set_flag("grafana.configured")


@when("snap.installed.etcd", "data.volume.attached")
@when_not("snap.connected.removable_media")
def snap_connect_external_storage():
    snap.connect("etcd:removable-media", ":removable-media")
    set_flag("snap.connected.removable_media")


def volume_is_mounted(volume):
    """Takes a hardware path and returns true/false if it is mounted"""
    cmd = ["df", "-t", "ext4"]
    out = check_output(cmd).decode("utf-8")
    return volume in out


def mount_volume(volume, location):
    """Takes a device path and mounts it to location"""
    cmd = ["mount", volume, location]
    hookenv.log("Mounting {0} to {1}".format(volume, location))
    check_call(cmd)


def unmount_path(location):
    """Unmounts a mounted volume at path"""
    cmd = ["umount", location]
    hookenv.log("Unmounting {0}".format(location))
    check_call(cmd)


def close_open_ports():
    """Close the previous port and open the port from configuration."""
    configuration = hookenv.config()
    previous_port = configuration.previous("port")
    port = configuration.get("port")
    if previous_port is not None and previous_port != port:
        log("The port changed; closing {0} opening {1}".format(previous_port, port))
        close_port(previous_port)
        open_port(port)


def install(src, tgt):
    """This method wraps the bash "install" command"""
    return check_call(split("install {} {}".format(src, tgt)))


def render_config(bag=None):
    """Render the etcd configuration template for the given version"""
    if not bag:
        bag = EtcdDatabag()

    move_etcd_data_to_standard_location()

    v2_conf_path = "{}/etcd.conf".format(bag.etcd_conf_dir)
    v3_conf_path = "{}/etcd.conf.yml".format(bag.etcd_conf_dir)

    # probe for 2.x compatibility
    if etcd_version().startswith("2."):
        render("etcd2.conf", v2_conf_path, bag.__dict__, owner="root", group="root")
    # default to 3.x template behavior
    else:
        render("etcd3.conf", v3_conf_path, bag.__dict__, owner="root", group="root")
        if os.path.exists(v2_conf_path):
            # v3 will fail if the v2 config is left in place
            os.remove(v2_conf_path)
    # Close the previous client port and open the new one.
    close_open_ports()
    remove_state("etcd.rerender-config")


def etcd_version():
    """This method surfaces the version from etcdctl"""
    raw_output = None
    try:
        # try v3
        raw_output = (
            check_output(
                ["/snap/bin/etcd.etcdctl", "version"], env={"ETCDCTL_API": "3"}
            )
            .decode("utf-8")
            .strip()
        )
        if "No help topic for 'version'" in raw_output:
            # handle v2
            raw_output = (
                check_output(["/snap/bin/etcd.etcdctl", "--version"])
                .decode("utf-8")
                .strip()
            )
        for line in raw_output.splitlines():
            if "etcdctl version" in line:
                # "etcdctl version: 3.0.17" or "etcdctl version 2.3.8"
                version = line.split()[-1]
                return version
        hookenv.log(
            "Unable to find etcd version: {}".format(raw_output), level=hookenv.ERROR
        )
        return "n/a"
    except (ValueError, CalledProcessError):
        hookenv.log(
            "Failed to get etcd version:\n" "{}".format(traceback.format_exc()),
            level=hookenv.ERROR,
        )
        return "n/a"


def move_etcd_data_to_standard_location():
    """Moves etcd data to the standard location if it's not already located
    there. This is necessary when generating new etcd config after etcd has
    been upgraded from version 2.3 to 3.x.
    """
    bag = EtcdDatabag()
    conf_path = bag.etcd_conf_dir + "/etcd.conf.yml"
    if not os.path.exists(conf_path):
        return
    with open(conf_path) as f:
        conf = yaml.safe_load(f)
    data_dir = conf["data-dir"]
    desired_data_dir = bag.etcd_data_dir
    if data_dir != desired_data_dir:
        log("Moving etcd data from %s to %s" % (data_dir, desired_data_dir))
        host.service_stop("snap.etcd.etcd")
        for filename in os.listdir(data_dir):
            os.rename(data_dir + "/" + filename, desired_data_dir + "/" + filename)
        os.rmdir(data_dir)
        conf["data-dir"] = desired_data_dir
        with open(conf_path, "w") as f:
            yaml.dump(conf, f)
        host.service_start("snap.etcd.etcd")