1080 lines
36 KiB
Python
1080 lines
36 KiB
Python
#!/usr/bin/python3
|
|
|
|
from charms import layer
|
|
|
|
from charms.layer import snap
|
|
|
|
from charms.reactive import endpoint_from_flag
|
|
from charms.reactive import when
|
|
from charms.reactive import when_any
|
|
from charms.reactive import when_not
|
|
from charms.reactive import is_state
|
|
from charms.reactive import set_state
|
|
from charms.reactive import is_flag_set
|
|
from charms.reactive import remove_state
|
|
from charms.reactive import set_flag
|
|
from charms.reactive import clear_flag
|
|
from charms.reactive import hook
|
|
from charms.reactive import register_trigger
|
|
from charms.reactive.helpers import data_changed
|
|
|
|
from charmhelpers.core.templating import render
|
|
|
|
from charmhelpers.core.hookenv import config
|
|
from charmhelpers.core.hookenv import log
|
|
from charmhelpers.core.hookenv import DEBUG
|
|
|
|
from charmhelpers.core.hookenv import leader_set
|
|
from charmhelpers.core.hookenv import leader_get
|
|
from charmhelpers.core.hookenv import storage_get
|
|
|
|
from charmhelpers.core.hookenv import application_version_set
|
|
from charmhelpers.core.hookenv import open_port
|
|
from charmhelpers.core.hookenv import close_port
|
|
from charmhelpers.core.host import write_file
|
|
from charmhelpers.core import hookenv
|
|
from charmhelpers.core import host
|
|
from charmhelpers.contrib.charmsupport import nrpe
|
|
|
|
from charms.layer import status
|
|
|
|
from etcdctl import EtcdCtl
|
|
from etcdctl import get_connection_string
|
|
from etcd_databag import EtcdDatabag
|
|
from etcd_lib import (
|
|
get_ingress_address,
|
|
get_ingress_addresses,
|
|
render_grafana_dashboard,
|
|
)
|
|
|
|
from shlex import split
|
|
from subprocess import check_call
|
|
from subprocess import check_output
|
|
from subprocess import CalledProcessError
|
|
from shutil import copyfile
|
|
|
|
import json
|
|
import os
|
|
import charms.leadership # noqa
|
|
import socket
|
|
import time
|
|
import traceback
|
|
import yaml
|
|
import shutil
|
|
import random
|
|
|
|
|
|
# Layer Note: the @when_not etcd.installed state checks are relating to
|
|
# a boundary that was superimposed by the etcd-24 release which added support
|
|
# for snaps. Snapped etcd is now the only supported mechanism by this charm.
|
|
# References to this state will be wiped sometime within the next 10 releases
|
|
# of the charm.
|
|
|
|
|
|
# Override the default nagios shortname regex to allow periods, which we
# need because our bin names contain them (e.g. 'snap.foo.daemon'). The
# default regex in charmhelpers doesn't allow periods, but nagios itself does.
nrpe.Check.shortname_re = r"[\.A-Za-z0-9-_]+$"

# Name under which the default dashboard is registered with Grafana.
GRAFANA_DASHBOARD_NAME = "etcd"

# Clear the '*.configured' flags whenever the backing relation departs so the
# registration handlers re-run if the relation returns. The grafana dashboard
# also depends on prometheus, hence the third trigger.
register_trigger(when_not="endpoint.grafana.joined", clear_flag="grafana.configured")
register_trigger(
    when_not="endpoint.prometheus.joined", clear_flag="prometheus.configured"
)
register_trigger(when_not="endpoint.prometheus.joined", clear_flag="grafana.configured")
|
|
|
|
|
|
def get_target_etcd_channel():
    """Determine which snap channel etcd should be installed from.

    With the "auto" setting, an existing install keeps its current channel
    (False is returned so callers skip re-installation) while a fresh
    install gets the default channel. Any explicit channel set by the user
    is returned verbatim.

    :return: String snap channel, or False when no install should occur
    """
    channel = hookenv.config("channel")
    if channel != "auto":
        return channel
    # "auto": never change the channel of an existing install.
    return False if snap.is_installed("etcd") else "3.4/stable"
|
|
|
|
|
|
@when("etcd.installed")
|
|
def snap_upgrade_notice():
|
|
status.blocked("Manual migration required. http://bit.ly/2oznAUZ")
|
|
|
|
|
|
@when_any("etcd.registered", "etcd.leader.configured")
|
|
@when_not("etcd.installed")
|
|
@when_not("upgrade.series.in-progress")
|
|
def check_cluster_health():
|
|
"""report on the cluster health every 5 minutes"""
|
|
etcdctl = EtcdCtl()
|
|
health = etcdctl.cluster_health()
|
|
|
|
# Determine if the unit is healthy or unhealthy
|
|
if "unhealthy" in health["status"]:
|
|
unit_health = "UnHealthy"
|
|
else:
|
|
unit_health = "Healthy"
|
|
|
|
# Determine units peer count, and surface 0 by default
|
|
try:
|
|
peers = len(etcdctl.member_list())
|
|
except Exception:
|
|
unit_health = "Errored"
|
|
peers = 0
|
|
|
|
bp = "{0} with {1} known peer{2}"
|
|
status_message = bp.format(unit_health, peers, "s" if peers != 1 else "")
|
|
|
|
if unit_health in ["UnHealthy", "Errored"]:
|
|
status.blocked(status_message)
|
|
else:
|
|
status.active(status_message)
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when_not("etcd.installed")
|
|
def set_app_version():
|
|
"""Surface the etcd application version on juju status"""
|
|
# note - the snap doesn't place an etcd alias on disk. This shall infer
|
|
# the version from etcdctl, as the snap distributes both in lockstep.
|
|
application_version_set(etcd_version())
|
|
|
|
|
|
@when_not("certificates.available")
|
|
def missing_relation_notice():
|
|
status.blocked("Missing relation to certificate authority.")
|
|
|
|
|
|
@when("certificates.available")
|
|
def prepare_tls_certificates(tls):
|
|
try:
|
|
common_name = hookenv.unit_public_ip()
|
|
except CalledProcessError as e:
|
|
msg = "Public address not available yet"
|
|
hookenv.log(msg, hookenv.WARNING)
|
|
hookenv.log(e, hookenv.WARNING)
|
|
return
|
|
|
|
sans = set()
|
|
sans.add(common_name)
|
|
sans.update(get_ingress_addresses("db"))
|
|
sans.update(get_ingress_addresses("cluster"))
|
|
sans.add(socket.gethostname())
|
|
|
|
# add cluster peers as alt names when present
|
|
cluster = endpoint_from_flag("cluster.joined")
|
|
if cluster:
|
|
for ip in cluster.get_db_ingress_addresses():
|
|
sans.add(ip)
|
|
|
|
sans = sorted(sans)
|
|
certificate_name = hookenv.local_unit().replace("/", "_")
|
|
tls.request_server_cert(common_name, sans, certificate_name)
|
|
|
|
|
|
@hook("upgrade-charm")
|
|
def remove_states():
|
|
# stale state cleanup (pre rev6)
|
|
remove_state("etcd.tls.secured")
|
|
remove_state("etcd.ssl.placed")
|
|
remove_state("etcd.ssl.exported")
|
|
remove_state("etcd.nrpe.configured")
|
|
# force a config re-render in case template changed
|
|
set_state("etcd.rerender-config")
|
|
|
|
|
|
@hook("pre-series-upgrade")
|
|
def pre_series_upgrade():
|
|
bag = EtcdDatabag()
|
|
host.service_pause(bag.etcd_daemon)
|
|
status.blocked("Series upgrade in progress")
|
|
|
|
|
|
@hook("post-series-upgrade")
|
|
def post_series_upgrade():
|
|
bag = EtcdDatabag()
|
|
host.service_resume(bag.etcd_daemon)
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when("leadership.is_leader")
|
|
@when_any("config.changed.port", "config.changed.management_port")
|
|
@when_not("etcd.installed")
|
|
@when_not("upgrade.series.in-progress")
|
|
def leader_config_changed():
|
|
"""The leader executes the runtime configuration update for the cluster,
|
|
as it is the controlling unit. Will render config, close and open ports and
|
|
restart the etcd service."""
|
|
configuration = hookenv.config()
|
|
previous_port = configuration.previous("port")
|
|
log("Previous port: {0}".format(previous_port))
|
|
previous_mgmt_port = configuration.previous("management_port")
|
|
log("Previous management port: {0}".format(previous_mgmt_port))
|
|
|
|
if previous_port and previous_mgmt_port:
|
|
bag = EtcdDatabag()
|
|
etcdctl = EtcdCtl()
|
|
members = etcdctl.member_list()
|
|
# Iterate over all the members in the list.
|
|
for unit_name in members:
|
|
# Grab the previous peer url and replace the management port.
|
|
peer_urls = members[unit_name]["peer_urls"]
|
|
log("Previous peer url: {0}".format(peer_urls))
|
|
old_port = ":{0}".format(previous_mgmt_port)
|
|
new_port = ":{0}".format(configuration.get("management_port"))
|
|
url = peer_urls.replace(old_port, new_port)
|
|
# Update the member's peer_urls with the new ports.
|
|
log(etcdctl.member_update(members[unit_name]["unit_id"], url))
|
|
# Render just the leaders configuration with the new values.
|
|
render_config()
|
|
address = get_ingress_address("cluster")
|
|
leader_set(
|
|
{"leader_address": get_connection_string([address], bag.management_port)}
|
|
)
|
|
host.service_restart(bag.etcd_daemon)
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when_not("leadership.is_leader")
|
|
@when_any("config.changed.port", "config.changed.management_port")
|
|
@when_not("etcd.installed")
|
|
def follower_config_changed():
|
|
"""Follower units need to render the configuration file, close and open
|
|
ports, and restart the etcd service."""
|
|
set_state("etcd.rerender-config")
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when("config.changed.bind_to_all_interfaces")
|
|
@when_not("upgrade.series.in-progress")
|
|
def bind_to_all_interfaces_changed():
|
|
set_state("etcd.rerender-config")
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when("config.changed.tls_cipher_suites")
|
|
@when_not("upgrade.series.in-progress")
|
|
def tls_cipher_suites_changed():
|
|
set_state("etcd.rerender-config")
|
|
|
|
|
|
@when("etcd.rerender-config")
|
|
@when_not("upgrade.series.in-progress")
|
|
def rerender_config():
|
|
"""Config must be updated and service restarted"""
|
|
bag = EtcdDatabag()
|
|
log("Rendering config file for {0}".format(bag.unit_name))
|
|
render_config()
|
|
if host.service_running(bag.etcd_daemon):
|
|
host.service_restart(bag.etcd_daemon)
|
|
set_app_version()
|
|
|
|
|
|
@when("cluster.joined")
|
|
def set_db_ingress_address(cluster):
|
|
"""Send db ingress address to peers on the cluster relation"""
|
|
address = get_ingress_address("db")
|
|
cluster.set_db_ingress_address(address)
|
|
|
|
|
|
@when("db.connected")
|
|
@when("etcd.ssl.placed")
|
|
@when("cluster.joined")
|
|
def send_cluster_connection_details(cluster, db):
|
|
"""Need to set the cluster connection string and
|
|
the client key and certificate on the relation object."""
|
|
cert = read_tls_cert("client.crt")
|
|
key = read_tls_cert("client.key")
|
|
ca = read_tls_cert("ca.crt")
|
|
etcdctl = EtcdCtl()
|
|
|
|
# Set the key, cert, and ca on the db relation
|
|
db.set_client_credentials(key, cert, ca)
|
|
|
|
port = hookenv.config().get("port")
|
|
# Get all the peers participating in the cluster relation.
|
|
members = cluster.get_db_ingress_addresses()
|
|
# Append our own address to the membership list, because peers dont self
|
|
# actualize
|
|
address = get_ingress_address("db")
|
|
members.append(address)
|
|
members.sort()
|
|
# Create a connection string with all the members on the configured port.
|
|
connection_string = get_connection_string(members, port)
|
|
# Set the connection string on the db relation.
|
|
db.set_connection_string(connection_string, version=etcdctl.version())
|
|
|
|
|
|
@when("db.connected")
|
|
@when("etcd.ssl.placed")
|
|
@when_not("cluster.joined")
|
|
def send_single_connection_details(db):
|
|
""" """
|
|
cert = read_tls_cert("client.crt")
|
|
key = read_tls_cert("client.key")
|
|
ca = read_tls_cert("ca.crt")
|
|
|
|
etcdctl = EtcdCtl()
|
|
|
|
# Set the key and cert on the db relation
|
|
db.set_client_credentials(key, cert, ca)
|
|
|
|
bag = EtcdDatabag()
|
|
# Get all the peers participating in the cluster relation.
|
|
address = get_ingress_address("db")
|
|
members = [address]
|
|
# Create a connection string with this member on the configured port.
|
|
connection_string = get_connection_string(members, bag.port)
|
|
# Set the connection string on the db relation.
|
|
db.set_connection_string(connection_string, version=etcdctl.version())
|
|
|
|
|
|
@when("proxy.connected")
|
|
@when("etcd.ssl.placed")
|
|
@when_any("etcd.leader.configured", "cluster.joined")
|
|
def send_cluster_details(proxy):
|
|
"""Sends the peer cluster string to proxy units so they can join and act
|
|
on behalf of the cluster."""
|
|
cert = read_tls_cert("client.crt")
|
|
key = read_tls_cert("client.key")
|
|
ca = read_tls_cert("ca.crt")
|
|
proxy.set_client_credentials(key, cert, ca)
|
|
|
|
# format a list of cluster participants
|
|
etcdctl = EtcdCtl()
|
|
peers = etcdctl.member_list()
|
|
cluster = []
|
|
for peer in peers:
|
|
thispeer = peers[peer]
|
|
# Potential member doing registration. Default to skip
|
|
if "peer_urls" not in thispeer.keys() or not thispeer["peer_urls"]:
|
|
continue
|
|
peer_string = "{}={}".format(thispeer["name"], thispeer["peer_urls"])
|
|
cluster.append(peer_string)
|
|
|
|
proxy.set_cluster_string(",".join(cluster))
|
|
|
|
|
|
@when("config.changed.channel")
|
|
def channel_changed():
|
|
"""Ensure that the config is updated if the channel changes."""
|
|
set_state("etcd.rerender-config")
|
|
|
|
|
|
@when("config.changed.channel")
|
|
@when_not("etcd.installed")
|
|
def snap_install():
|
|
channel = get_target_etcd_channel()
|
|
snap.install("core")
|
|
if channel:
|
|
snap.install("etcd", channel=channel, classic=False)
|
|
remove_state("etcd.ssl.exported")
|
|
|
|
|
|
@when("etcd.ssl.placed")
|
|
@when_not("snap.installed.etcd")
|
|
def install_etcd():
|
|
"""Attempt resource get on the "etcd" and "etcdctl" resources. If no
|
|
resources are provided attempt to install from the archive only on the
|
|
16.04 (xenial) series."""
|
|
|
|
if is_state("etcd.installed"):
|
|
msg = "Manual upgrade required. run-action snap-upgrade."
|
|
status.blocked(msg)
|
|
return
|
|
|
|
status.maintenance("Installing etcd.")
|
|
|
|
channel = get_target_etcd_channel()
|
|
if channel:
|
|
snap.install("etcd", channel=channel, classic=False)
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when_not("etcd.service-restart.configured")
|
|
@when_not("upgrade.series.in-progress")
|
|
def add_systemd_restart_always():
|
|
template = "templates/service-always-restart.systemd-latest.conf"
|
|
service = "snap.etcd.etcd"
|
|
|
|
try:
|
|
# Get the systemd version
|
|
cmd = ["systemd", "--version"]
|
|
output = check_output(cmd).decode("UTF-8")
|
|
line = output.splitlines()[0]
|
|
words = line.split()
|
|
assert words[0] == "systemd"
|
|
systemd_version = int(words[1])
|
|
|
|
# Check for old version (for xenial support)
|
|
if systemd_version < 230:
|
|
template = "templates/service-always-restart.systemd-229.conf"
|
|
except Exception:
|
|
traceback.print_exc()
|
|
hookenv.log(
|
|
"Failed to detect systemd version, using latest template", level="ERROR"
|
|
)
|
|
|
|
dest_dir = "/etc/systemd/system/{}.service.d".format(service)
|
|
os.makedirs(dest_dir, exist_ok=True)
|
|
copyfile(template, "{}/always-restart.conf".format(dest_dir))
|
|
check_call(["systemctl", "daemon-reload"])
|
|
host.service_restart("{}.service".format(service))
|
|
set_state("etcd.service-restart.configured")
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when("etcd.ssl.placed")
|
|
@when("cluster.joined")
|
|
@when_not("leadership.is_leader")
|
|
@when_not("etcd.registered")
|
|
@when_not("etcd.installed")
|
|
@when_not("upgrade.series.in-progress")
|
|
def register_node_with_leader(cluster):
|
|
"""
|
|
Control flow mechanism to perform self registration with the leader.
|
|
|
|
Before executing self registration, we must adhere to the nature of offline
|
|
static turnup rules. If we find a GUID in the member list without peering
|
|
information the unit will enter a race condition and must wait for a clean
|
|
status output before we can progress to self registration.
|
|
"""
|
|
etcdctl = EtcdCtl()
|
|
bag = EtcdDatabag()
|
|
leader_address = leader_get("leader_address")
|
|
bag.leader_address = leader_address
|
|
|
|
try:
|
|
# Check if we are already registered. Unregister ourselves if we are so
|
|
# we can register from scratch.
|
|
peer_url = "https://%s:%s" % (bag.cluster_address, bag.management_port)
|
|
members = etcdctl.member_list(leader_address)
|
|
for _, member in members.items():
|
|
if member["peer_urls"] == peer_url:
|
|
log("Found member that matches our peer URL. Unregistering...")
|
|
etcdctl.unregister(member["unit_id"], leader_address)
|
|
|
|
# Now register.
|
|
resp = etcdctl.register(bag.__dict__)
|
|
bag.set_cluster(resp["cluster"])
|
|
except EtcdCtl.CommandFailed:
|
|
log("etcdctl.register failed, will retry")
|
|
msg = "Waiting to retry etcd registration"
|
|
status.waiting(msg)
|
|
return
|
|
|
|
render_config(bag)
|
|
host.service_restart(bag.etcd_daemon)
|
|
open_port(bag.port)
|
|
set_state("etcd.registered")
|
|
|
|
|
|
@when("etcd.ssl.placed")
|
|
@when("leadership.is_leader")
|
|
@when_not("etcd.leader.configured")
|
|
@when_not("etcd.installed")
|
|
@when_not("upgrade.series.in-progress")
|
|
def initialize_new_leader():
|
|
"""Create an initial cluster string to bring up a single member cluster of
|
|
etcd, and set the leadership data so the followers can join this one."""
|
|
bag = EtcdDatabag()
|
|
bag.token = bag.token
|
|
bag.set_cluster_state("new")
|
|
address = get_ingress_address("cluster")
|
|
cluster_connection_string = get_connection_string([address], bag.management_port)
|
|
bag.set_cluster("{}={}".format(bag.unit_name, cluster_connection_string))
|
|
|
|
render_config(bag)
|
|
host.service_restart(bag.etcd_daemon)
|
|
|
|
# sorry, some hosts need this. The charm races with systemd and wins.
|
|
time.sleep(2)
|
|
|
|
# Check health status before we say we are good
|
|
etcdctl = EtcdCtl()
|
|
status = etcdctl.cluster_health()
|
|
if "unhealthy" in status:
|
|
status.blocked("Cluster not healthy.")
|
|
return
|
|
# We have a healthy leader, broadcast initial data-points for followers
|
|
open_port(bag.port)
|
|
leader_connection_string = get_connection_string([address], bag.port)
|
|
leader_set({"leader_address": leader_connection_string, "cluster": bag.cluster})
|
|
|
|
# set registered state since if we ever become a follower, we will not need
|
|
# to re-register
|
|
set_state("etcd.registered")
|
|
|
|
# finish bootstrap delta and set configured state
|
|
set_state("etcd.leader.configured")
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when("snap.refresh.set")
|
|
@when("leadership.is_leader")
|
|
def process_snapd_timer():
|
|
"""Set the snapd refresh timer on the leader so all cluster members
|
|
(present and future) will refresh near the same time."""
|
|
# Get the current snapd refresh timer; we know layer-snap has set this
|
|
# when the 'snap.refresh.set' flag is present.
|
|
timer = snap.get(snapname="core", key="refresh.timer").decode("utf-8").strip()
|
|
if not timer:
|
|
# The core snap timer is empty. This likely means a subordinate timer
|
|
# reset ours. Try to set it back to a previously leader-set value,
|
|
# falling back to config if needed. Luckily, this should only happen
|
|
# during subordinate install, so this should remain stable afterward.
|
|
timer = leader_get("snapd_refresh") or hookenv.config("snapd_refresh")
|
|
snap.set_refresh_timer(timer)
|
|
|
|
# Ensure we have the timer known by snapd (it may differ from config).
|
|
timer = snap.get(snapname="core", key="refresh.timer").decode("utf-8").strip()
|
|
|
|
# The first time through, data_changed will be true. Subsequent calls
|
|
# should only update leader data if something changed.
|
|
if data_changed("etcd_snapd_refresh", timer):
|
|
log("setting snapd_refresh timer to: {}".format(timer))
|
|
leader_set({"snapd_refresh": timer})
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when("snap.refresh.set")
|
|
@when("leadership.changed.snapd_refresh")
|
|
@when_not("leadership.is_leader")
|
|
def set_snapd_timer():
|
|
"""Set the snapd refresh.timer on non-leader cluster members."""
|
|
# NB: This method should only be run when 'snap.refresh.set' is present.
|
|
# Layer-snap will always set a core refresh.timer, which may not be the
|
|
# same as our leader. Gating with 'snap.refresh.set' ensures layer-snap
|
|
# has finished and we are free to set our config to the leader's timer.
|
|
timer = leader_get("snapd_refresh") or "" # None will cause error
|
|
log("setting snapd_refresh timer to: {}".format(timer))
|
|
snap.set_refresh_timer(timer)
|
|
|
|
|
|
@when(
|
|
"tls_client.ca.saved",
|
|
"tls_client.server.key.saved",
|
|
"tls_client.server.certificate.saved",
|
|
"tls_client.client.certificate.saved",
|
|
)
|
|
@when_not("etcd.ssl.placed")
|
|
def tls_state_control():
|
|
"""This state represents all the complexity of handling the TLS certs.
|
|
instead of stacking decorators, this state condenses it into a single
|
|
state we can gate on before progressing with secure setup. Also handles
|
|
ensuring users of the system can access the TLS certificates"""
|
|
|
|
bag = EtcdDatabag()
|
|
if not os.path.isdir(bag.etcd_conf_dir):
|
|
hookenv.log("Waiting for etcd conf creation.")
|
|
return
|
|
cmd = ["chown", "-R", "root:ubuntu", bag.etcd_conf_dir]
|
|
check_call(cmd)
|
|
set_state("etcd.ssl.placed")
|
|
|
|
|
|
@when("etcd.ssl.placed")
|
|
@when_any(
|
|
"tls_client.ca.written",
|
|
"tls_client.server.certificate.written",
|
|
"tls_client.client.certificate.written",
|
|
)
|
|
@when_not("upgrade.series.in-progress")
|
|
def tls_update():
|
|
"""Handle changes to the TLS data by ensuring that the service is
|
|
restarted.
|
|
"""
|
|
# ensure config is updated with new certs and service restarted
|
|
bag = EtcdDatabag()
|
|
render_config(bag)
|
|
host.service_restart(bag.etcd_daemon)
|
|
|
|
# ensure that certs are re-echoed to the db relations
|
|
remove_state("etcd.ssl.placed")
|
|
remove_state("tls_client.ca.written")
|
|
remove_state("tls_client.server.certificate.written")
|
|
remove_state("tls_client.client.certificate.written")
|
|
|
|
|
|
@when("snap.installed.etcd")
|
|
@when_not("etcd.ssl.exported")
|
|
def render_default_user_ssl_exports():
|
|
"""Add secure credentials to default user environment configs,
|
|
transparently adding TLS"""
|
|
opts = layer.options("tls-client")
|
|
|
|
ca_path = opts["ca_certificate_path"]
|
|
client_crt = opts["client_certificate_path"]
|
|
client_key = opts["client_key_path"]
|
|
|
|
etcd_ver = etcd_version()
|
|
if etcd_ver == "n/a":
|
|
hookenv.log(
|
|
"Unable to determine version format for etcd SSL config",
|
|
level=hookenv.ERROR,
|
|
)
|
|
return
|
|
major, minor, _ = etcd_ver.split(".")
|
|
|
|
if int(major) >= 3 and int(minor) >= 3:
|
|
evars = [
|
|
"export ETCDCTL_KEY={}\n".format(client_key),
|
|
"export ETCDCTL_CERT={}\n".format(client_crt),
|
|
"export ETCDCTL_CACERT={}\n".format(ca_path),
|
|
]
|
|
else:
|
|
evars = [
|
|
"export ETCDCTL_KEY_FILE={}\n".format(client_key),
|
|
"export ETCDCTL_CERT_FILE={}\n".format(client_crt),
|
|
"export ETCDCTL_CA_FILE={}\n".format(ca_path),
|
|
]
|
|
|
|
with open("/home/ubuntu/.bash_aliases", "w") as fp:
|
|
fp.writelines(evars)
|
|
with open("/root/.bash_aliases", "w") as fp:
|
|
fp.writelines(evars)
|
|
|
|
set_state("etcd.ssl.exported")
|
|
|
|
|
|
def force_rejoin():
    """Wipe local data and rejoin new cluster formed by leader unit

    This action is required if leader unit performed snapshot restore. All
    other members must remove their local data and previous cluster
    identities and join newly formed, restored, cluster.
    """
    log("Wiping local storage and rejoining cluster")
    conf = EtcdDatabag()
    host.service_stop(conf.etcd_daemon)
    # Clearing the flag lets register_node_with_leader() run from scratch.
    clear_flag("etcd.registered")
    etcd_data = os.path.join(conf.storage_path(), "member")
    if os.path.exists(etcd_data):
        shutil.rmtree(etcd_data)
    # Up to 11 registration attempts before giving up until the next hook.
    for _ in range(11):
        # We need randomized back-off timer because only one unit can be
        # joining at the same time
        time.sleep(random.randint(1, 10))
        register_node_with_leader(None)
        if is_flag_set("etcd.registered"):
            log("Successfully rejoined the cluster")
            break
|
|
|
|
|
|
@when("leadership.changed.force_rejoin")
|
|
@when_not("leadership.is_leader")
|
|
def force_rejoin_requested():
|
|
force_rejoin()
|
|
check_cluster_health()
|
|
|
|
|
|
@when("cluster-relation-broken")
|
|
def cluster_relation_broken(cluster=None):
|
|
perform_self_unregistration()
|
|
|
|
|
|
@hook("stop")
|
|
def stop_hook():
|
|
perform_self_unregistration(skip_exception=True)
|
|
|
|
|
|
def perform_self_unregistration(skip_exception=None):
    """Attempt self removal during unit teardown.

    :param skip_exception: when truthy, exhausting all retries only sets a
        blocked status instead of raising (the stop hook must not fail).
    """
    etcdctl = EtcdCtl()
    leader_address = leader_get("leader_address")
    unit_name = os.getenv("JUJU_UNIT_NAME").replace("/", "")
    members = etcdctl.member_list()

    # BUG FIX: a unit already absent from the member list previously raised
    # an uncaught KeyError, failing the teardown hook. Treat it as done.
    member = members.get(unit_name)
    if member is None:
        log("Unit {} not found in member list; nothing to unregister".format(unit_name))
        return

    # Self Unregistration, retried with a short delay so the cluster can
    # settle between attempts. (The old comment claimed a randomized
    # back-off, but the sleep has always been a fixed one second.)
    MAX_WAIT = 10
    for attempt in range(1, MAX_WAIT + 1):
        try:
            etcdctl.unregister(member["unit_id"], leader_address)
            break
        except EtcdCtl.CommandFailed as ex:
            log("Trying to unregister self from the cluster failed, retrying...")
            if attempt == MAX_WAIT:
                log(
                    "All tries for unregistration failed! Switching status to blocked..."
                )
                status.blocked("Unregistration failed for the departing unit/s.")
                if not skip_exception:
                    raise Exception("All tries for unregistration failed") from ex
            time.sleep(1)
|
|
|
|
|
|
@hook("data-storage-attached")
|
|
def format_and_mount_storage():
|
|
"""This allows users to request persistent volumes from the cloud provider
|
|
for the purposes of disaster recovery."""
|
|
set_state("data.volume.attached")
|
|
# Query juju for the information about the block storage
|
|
device_info = storage_get()
|
|
block = device_info["location"]
|
|
bag = EtcdDatabag()
|
|
bag.cluster = leader_get("cluster")
|
|
# the databag has behavior that keeps the path updated.
|
|
# Reference the default path from layer_options.
|
|
etcd_opts = layer.options("etcd")
|
|
# Split the tail of the path to mount the volume 1 level before
|
|
# the data directory.
|
|
tail = os.path.split(bag.etcd_data_dir)[0]
|
|
|
|
if volume_is_mounted(block):
|
|
hookenv.log("Device is already attached to the system.")
|
|
hookenv.log("Refusing to take action against {}".format(block))
|
|
return
|
|
|
|
# Format the device in non-interactive mode
|
|
cmd = ["mkfs.ext4", device_info["location"], "-F"]
|
|
hookenv.log("Creating filesystem on {}".format(device_info["location"]))
|
|
hookenv.log("With command: {}".format(" ".join(cmd)))
|
|
check_call(cmd)
|
|
|
|
# halt etcd to perform the data-store migration
|
|
host.service_stop(bag.etcd_daemon)
|
|
|
|
os.makedirs(tail, exist_ok=True)
|
|
mount_volume(block, tail)
|
|
# handle first run during early-attach storage, pre-config-changed hook.
|
|
os.makedirs(bag.etcd_data_dir, exist_ok=True)
|
|
|
|
# Only attempt migration if directory exists
|
|
if os.path.isdir(etcd_opts["etcd_data_dir"]):
|
|
migrate_path = "{}/".format(etcd_opts["etcd_data_dir"])
|
|
output_path = "{}/".format(bag.etcd_data_dir)
|
|
cmd = ["rsync", "-azp", migrate_path, output_path]
|
|
|
|
hookenv.log("Detected existing data, migrating to new location.")
|
|
hookenv.log("With command: {}".format(" ".join(cmd)))
|
|
|
|
check_call(cmd)
|
|
|
|
with open("/etc/fstab", "r") as fp:
|
|
contents = fp.readlines()
|
|
|
|
found = 0
|
|
# scan fstab for the device
|
|
for line in contents:
|
|
if block in line:
|
|
found = found + 1
|
|
|
|
# if device not in fstab, append so it persists through reboots
|
|
if not found > 0:
|
|
append = "{0} {1} ext4 defaults 0 0".format(block, tail) # noqa
|
|
with open("/etc/fstab", "a") as fp:
|
|
fp.writelines([append])
|
|
|
|
# Finally re-render the configuration and resume operation
|
|
render_config(bag)
|
|
host.service_restart(bag.etcd_daemon)
|
|
|
|
|
|
def read_tls_cert(cert):
    """Reads the contents of the layer-configured certificate path indicated
    by cert. Returns the utf-8 decoded contents of the file"""
    # Map the short certificate names onto their layer-configured paths.
    opts = layer.options("tls-client")
    cert_paths = {
        "ca.crt": opts["ca_certificate_path"],
        "server.crt": opts["server_certificate_path"],
        "server.key": opts["server_key_path"],
        "client.crt": opts["client_certificate_path"],
        "client.key": opts["client_key_path"],
    }

    # Unknown certificate names are a programming error.
    if cert not in cert_paths:
        raise ValueError("No known certificate {}".format(cert))

    with open(cert_paths[cert], "r") as fp:
        return fp.read()
|
|
|
|
|
|
# Relation name for the legacy nagios external interface.
# NOTE(review): "NPRE" looks like a transposition of "NRPE"; kept as-is
# since sibling handlers reference this identifier.
NPRE_EXTERNAL_RELATION = "nrpe-external-master"  # wokeignore:rule=master
|
|
|
|
|
|
@when(NPRE_EXTERNAL_RELATION + ".available")
|
|
@when_not(NPRE_EXTERNAL_RELATION + ".initial-config")
|
|
def initial_nrpe_config(nagios=None):
|
|
set_state(NPRE_EXTERNAL_RELATION + ".initial-config")
|
|
update_nrpe_config(nagios)
|
|
|
|
|
|
@when_any("config.changed.nagios_context", "config.changed.nagios_servicegroups")
|
|
def force_update_nrpe_config():
|
|
remove_state("etcd.nrpe.configured")
|
|
|
|
|
|
@when("etcd.installed")
|
|
@when(NPRE_EXTERNAL_RELATION + ".available")
|
|
@when_not("etcd.nrpe.configured")
|
|
def update_nrpe_config(unused=None):
|
|
# List of systemd services that will be checked
|
|
services = ("snap.etcd.etcd",)
|
|
|
|
# The current nrpe-external interface doesn't handle a lot of logic,
|
|
# use the charm-helpers code for now.
|
|
hostname = nrpe.get_nagios_hostname()
|
|
current_unit = nrpe.get_nagios_unit_name()
|
|
nrpe_setup = nrpe.NRPE(hostname=hostname, primary=False)
|
|
# add our first check, to alert on service failure
|
|
nrpe.add_init_service_checks(nrpe_setup, services, current_unit)
|
|
|
|
# add the cron job to populate the cache for our second check
|
|
# (we cache the output of 'etcdctl alarm list' to minimise overhead)
|
|
with open("templates/check_etcd-alarms.cron") as fp:
|
|
write_file(
|
|
path="/etc/cron.d/check_etcd-alarms",
|
|
content=fp.read().encode(),
|
|
owner="root",
|
|
perms=0o644,
|
|
)
|
|
|
|
# create an empty output file for the above
|
|
write_file(
|
|
path="/var/lib/nagios/etcd-alarm-list.txt",
|
|
content="",
|
|
owner="root",
|
|
perms=0o644,
|
|
)
|
|
|
|
# install the NRPE script for the above
|
|
with open("templates/check_etcd-alarms.py") as fp:
|
|
write_file(
|
|
path="/usr/lib/nagios/plugins/check_etcd-alarms.py",
|
|
content=fp.read().encode(),
|
|
owner="root",
|
|
perms=0o755,
|
|
)
|
|
|
|
# define our second check, to alert on etcd alarm status
|
|
nrpe_setup.add_check(
|
|
"etcd-alarms",
|
|
"Verify etcd has no raised alarms",
|
|
"/usr/lib/nagios/plugins/check_etcd-alarms.py",
|
|
)
|
|
|
|
nrpe_setup.write()
|
|
set_state("etcd.nrpe.configured")
|
|
|
|
|
|
@when_not(NPRE_EXTERNAL_RELATION + ".available")
|
|
@when(NPRE_EXTERNAL_RELATION + ".initial-config")
|
|
def remove_nrpe_config(nagios=None):
|
|
remove_state(NPRE_EXTERNAL_RELATION + ".initial-config")
|
|
|
|
# List of systemd services for which the checks will be removed
|
|
services = ("snap.etcd.etcd",)
|
|
|
|
# The current nrpe-external interface doesn't handle a lot of logic,
|
|
# use the charm-helpers code for now.
|
|
hostname = nrpe.get_nagios_hostname()
|
|
nrpe_setup = nrpe.NRPE(hostname=hostname, primary=False)
|
|
|
|
for service in services:
|
|
nrpe_setup.remove_check(shortname=service)
|
|
|
|
|
|
@when("endpoint.prometheus.joined", "leadership.is_leader", "certificates.ca.available")
|
|
def register_prometheus_jobs():
|
|
# This function is not guarded with `when_not("prometheus.configured")`
|
|
# to account for possible changes of etcd units IP adresses and for when
|
|
# etcd units are added/removed. Repeated calls to `prometheus.register_job()`
|
|
# have no effect unless job_data changes.
|
|
log("Registering Prometheus metrics collection.")
|
|
prometheus = endpoint_from_flag("endpoint.prometheus.joined")
|
|
cluster = endpoint_from_flag("cluster.joined")
|
|
|
|
peer_ips = cluster.get_db_ingress_addresses() if cluster else []
|
|
peer_ips.append(get_ingress_address("db"))
|
|
targets = ["{}:{}".format(ip, config("port")) for ip in peer_ips]
|
|
log("Configuring Prometheus scrape targets: {}".format(targets), DEBUG)
|
|
prometheus.register_job(
|
|
job_name="etcd",
|
|
job_data={
|
|
"scheme": "https",
|
|
"static_configs": [
|
|
{"targets": targets},
|
|
],
|
|
},
|
|
)
|
|
set_flag("prometheus.configured")
|
|
|
|
|
|
@when("prometheus.configured", "endpoint.grafana.joined", "leadership.is_leader")
@when_not("grafana.configured")
def register_grafana_dashboard():
    """Render the default etcd dashboard and push it to related Grafana.

    Requires an active Prometheus relation to use as the datasource; bails
    out with a warning when none is available.
    """
    log("Configuring grafana dashboard", level=hookenv.INFO)
    grafana = endpoint_from_flag("endpoint.grafana.joined")
    prometheus = endpoint_from_flag("endpoint.prometheus.joined")

    # Guard clause: without Prometheus there is no datasource to render.
    if not prometheus:
        log(
            "Prometheus relation not available. Skipping Grafana configuration.",
            hookenv.WARNING,
        )
        return

    if len(prometheus.relations) > 1:
        log(
            "Multiple prometheus relations detected. Default Grafana dashboard"
            " will configure only with one of them as datasource.",
            hookenv.WARNING,
        )

    # The first related prometheus application becomes the datasource.
    datasource = prometheus.relations[0].application_name
    dashboard = render_grafana_dashboard(datasource)
    log(
        "Rendered Grafana dashboard:\n{}".format(json.dumps(dashboard)),
        level=hookenv.DEBUG,
    )

    grafana.register_dashboard(name=GRAFANA_DASHBOARD_NAME, dashboard=dashboard)
    log('Grafana dashboard "{}" registered.'.format(GRAFANA_DASHBOARD_NAME))
    set_flag("grafana.configured")
|
|
|
|
|
|
@when("snap.installed.etcd", "data.volume.attached")
@when_not("snap.connected.removable_media")
def snap_connect_external_storage():
    """Connect the etcd snap's removable-media plug once storage attaches.

    Grants the confined snap access to the attached data volume, then
    records completion so this runs only once.
    """
    snap.connect("etcd:removable-media", ":removable-media")
    set_flag("snap.connected.removable_media")
|
|
|
|
|
|
def volume_is_mounted(volume):
    """Takes a hardware path and returns true/false if it is mounted"""
    # `df -t ext4` lists only mounted ext4 filesystems; a simple substring
    # match on its output tells us whether the device is in use.
    df_output = check_output(["df", "-t", "ext4"]).decode("utf-8")
    return volume in df_output
|
|
|
|
|
|
def mount_volume(volume, location):
    """Takes a device path and mounts it to location"""
    hookenv.log("Mounting {0} to {1}".format(volume, location))
    # check_call raises CalledProcessError if mount fails, surfacing the
    # problem to the hook instead of continuing with unmounted storage.
    check_call(["mount", volume, location])
|
|
|
|
|
|
def unmount_path(location):
    """Unmounts a mounted volume at path"""
    hookenv.log("Unmounting {0}".format(location))
    # Fail loudly (CalledProcessError) rather than proceed on a busy mount.
    check_call(["umount", location])
|
|
|
|
|
|
def close_open_ports():
    """Close the previous port and open the port from configuration."""
    cfg = hookenv.config()
    old_port = cfg.previous("port")
    new_port = cfg.get("port")
    # Guard clause: nothing to do on first run or when the port is unchanged.
    if old_port is None or old_port == new_port:
        return
    log("The port changed; closing {0} opening {1}".format(old_port, new_port))
    close_port(old_port)
    open_port(new_port)
|
|
|
|
|
|
def install(src, tgt):
    """This method wraps the bash "install" command"""
    # shlex.split keeps the historical shell-like behavior: callers may
    # embed extra install(1) flags in the src string.
    command = "install {} {}".format(src, tgt)
    return check_call(split(command))
|
|
|
|
|
|
def render_config(bag=None):
    """Render the etcd configuration template for the given version"""
    if not bag:
        bag = EtcdDatabag()

    # Data may need relocating after a 2.3 -> 3.x upgrade before the new
    # config is written.
    move_etcd_data_to_standard_location()

    v2_conf_path = "{}/etcd.conf".format(bag.etcd_conf_dir)
    v3_conf_path = "{}/etcd.conf.yml".format(bag.etcd_conf_dir)

    # probe for 2.x compatibility; default to 3.x template behavior
    if etcd_version().startswith("2."):
        render("etcd2.conf", v2_conf_path, bag.__dict__, owner="root", group="root")
    else:
        render("etcd3.conf", v3_conf_path, bag.__dict__, owner="root", group="root")
        # v3 will fail if the v2 config is left in place
        if os.path.exists(v2_conf_path):
            os.remove(v2_conf_path)

    # Close the previous client port and open the new one.
    close_open_ports()
    remove_state("etcd.rerender-config")
|
|
|
|
|
|
def etcd_version():
    """Return the etcd version reported by etcdctl, or "n/a" on failure.

    Probes the v3 ``etcdctl version`` subcommand first; if etcdctl is v2
    it answers "No help topic for 'version'", in which case the v2
    ``etcdctl --version`` flag is used instead. The version number is
    parsed out of the "etcdctl version[:] X.Y.Z" line.
    """
    # BUG FIX: traceback is used in the except handler below but was never
    # imported at module level, turning any failure into a NameError.
    import traceback

    raw_output = None
    try:
        # try v3
        raw_output = (
            check_output(
                ["/snap/bin/etcd.etcdctl", "version"], env={"ETCDCTL_API": "3"}
            )
            .decode("utf-8")
            .strip()
        )
        if "No help topic for 'version'" in raw_output:
            # handle v2
            raw_output = (
                check_output(["/snap/bin/etcd.etcdctl", "--version"])
                .decode("utf-8")
                .strip()
            )
        for line in raw_output.splitlines():
            if "etcdctl version" in line:
                # "etcdctl version: 3.0.17" or "etcdctl version 2.3.8"
                version = line.split()[-1]
                return version
        hookenv.log(
            "Unable to find etcd version: {}".format(raw_output), level=hookenv.ERROR
        )
        return "n/a"
    except (ValueError, CalledProcessError, OSError):
        # OSError also covers FileNotFoundError when the etcdctl binary is
        # missing, so a broken snap degrades to "n/a" instead of crashing.
        hookenv.log(
            "Failed to get etcd version:\n" "{}".format(traceback.format_exc()),
            level=hookenv.ERROR,
        )
        return "n/a"
|
|
|
|
|
|
def move_etcd_data_to_standard_location():
    """Moves etcd data to the standard location if it's not already located
    there. This is necessary when generating new etcd config after etcd has
    been upgraded from version 2.3 to 3.x.
    """
    # BUG FIX: yaml is used below but was never imported at module level,
    # making this function raise NameError once the v3 config file exists.
    # PyYAML is already a dependency of this code path.
    import yaml

    bag = EtcdDatabag()
    conf_path = bag.etcd_conf_dir + "/etcd.conf.yml"
    # Nothing to do until the v3 config file has been rendered.
    if not os.path.exists(conf_path):
        return
    with open(conf_path) as f:
        conf = yaml.safe_load(f)
    data_dir = conf["data-dir"]
    desired_data_dir = bag.etcd_data_dir
    if data_dir != desired_data_dir:
        log("Moving etcd data from %s to %s" % (data_dir, desired_data_dir))
        # Stop etcd while its data directory is relocated so the files are
        # not mutated mid-move.
        host.service_stop("snap.etcd.etcd")
        for filename in os.listdir(data_dir):
            # NOTE(review): os.rename requires both dirs to be on the same
            # filesystem -- TODO confirm, otherwise use shutil.move.
            os.rename(data_dir + "/" + filename, desired_data_dir + "/" + filename)
        os.rmdir(data_dir)
        # Persist the new location so future renders agree with reality.
        conf["data-dir"] = desired_data_dir
        with open(conf_path, "w") as f:
            yaml.dump(conf, f)
        host.service_start("snap.etcd.etcd")
|