942 lines
32 KiB
Python
942 lines
32 KiB
Python
#!/usr/bin/python3
|
|
|
|
from charms import layer
|
|
|
|
from charms.layer import snap
|
|
|
|
from charms.reactive import endpoint_from_flag
|
|
from charms.reactive import when
|
|
from charms.reactive import when_any
|
|
from charms.reactive import when_not
|
|
from charms.reactive import is_state
|
|
from charms.reactive import set_state
|
|
from charms.reactive import is_flag_set
|
|
from charms.reactive import clear_flag
|
|
from charms.reactive import remove_state
|
|
from charms.reactive import hook
|
|
from charms.reactive.helpers import data_changed
|
|
|
|
from charms.templating.jinja2 import render
|
|
|
|
from charmhelpers.core.hookenv import log
|
|
from charmhelpers.core.hookenv import leader_set
|
|
from charmhelpers.core.hookenv import leader_get
|
|
from charmhelpers.core.hookenv import storage_get
|
|
|
|
from charmhelpers.core.hookenv import application_version_set
|
|
from charmhelpers.core.hookenv import open_port
|
|
from charmhelpers.core.hookenv import close_port
|
|
from charmhelpers.core.host import write_file
|
|
from charmhelpers.core import hookenv
|
|
from charmhelpers.core import host
|
|
from charmhelpers.contrib.charmsupport import nrpe
|
|
|
|
from charms.layer import status
|
|
|
|
from etcdctl import EtcdCtl
|
|
from etcdctl import get_connection_string
|
|
from etcd_databag import EtcdDatabag
|
|
from etcd_lib import get_ingress_address, get_ingress_addresses
|
|
|
|
from shlex import split
|
|
from subprocess import check_call
|
|
from subprocess import check_output
|
|
from subprocess import CalledProcessError
|
|
from shutil import copyfile
|
|
|
|
import os
|
|
import charms.leadership # noqa
|
|
import socket
|
|
import time
|
|
import traceback
|
|
import yaml
|
|
import shutil
|
|
import random
|
|
|
|
|
|
# Layer Note: the @when_not etcd.installed state checks are relating to
# a boundary that was superimposed by the etcd-24 release which added support
# for snaps. Snapped etcd is now the only supported mechanism by this charm.
# References to this state will be wiped sometime within the next 10 releases
# of the charm.
|
|
|
|
|
|
# Override the default nagios shortname regex to allow periods, which we
# need because our bin names contain them (e.g. 'snap.foo.daemon'). The
# default regex in charmhelpers doesn't allow periods, but nagios itself does.
# NOTE: this is a module-level monkey-patch applied at import time, before
# any nrpe.Check is constructed.
nrpe.Check.shortname_re = r'[\.A-Za-z0-9-_]+$'
|
|
|
|
|
|
def get_target_etcd_channel():
    """
    Determine which snap channel etcd should be installed from.

    An explicitly configured channel always wins. With the default
    'auto' value, an existing install is left untouched (returns False,
    meaning "do not change anything"), while a fresh install gets the
    3.4/stable default.

    :return: String snap channel, or False to leave etcd alone
    """
    channel = hookenv.config('channel')
    if channel != 'auto':
        return channel
    # 'auto': never disturb an already-installed snap's channel.
    return False if snap.is_installed('etcd') else '3.4/stable'
|
|
|
|
|
|
@when('etcd.installed')
def snap_upgrade_notice():
    # 'etcd.installed' marks a legacy (pre-snap) deployment; the operator
    # must run the documented manual migration before the charm proceeds.
    status.blocked('Manual migration required. http://bit.ly/2oznAUZ')
|
|
|
|
|
|
@when_any('etcd.registered', 'etcd.leader.configured')
@when_not('etcd.installed')
@when_not('upgrade.series.in-progress')
def check_cluster_health():
    ''' report on the cluster health every 5 minutes'''
    etcdctl = EtcdCtl()
    health = etcdctl.cluster_health()

    # Reduce the raw health report to a one-word unit summary.
    unit_health = "UnHealthy" if 'unhealthy' in health['status'] else "Healthy"

    # Surface the peer count; any failure listing members reads as Errored
    # with zero peers rather than crashing the status hook.
    try:
        peers = len(etcdctl.member_list())
    except Exception:
        unit_health = "Errored"
        peers = 0

    plural = 's' if peers != 1 else ''
    status.active(
        "{0} with {1} known peer{2}".format(unit_health, peers, plural))
|
|
|
|
|
|
@when('snap.installed.etcd')
@when_not('etcd.installed')
def set_app_version():
    ''' Surface the etcd application version on juju status '''
    # note - the snap doesn't place an etcd alias on disk. This shall infer
    # the version from etcdctl, as the snap distributes both in lockstep.
    # etcd_version() returns 'n/a' on failure, which is still a valid value
    # to surface.
    application_version_set(etcd_version())
|
|
|
|
|
|
@when_not('certificates.available')
def missing_relation_notice():
    # TLS is mandatory for this charm; block until a CA relation is added.
    status.blocked('Missing relation to certificate authority.')
|
|
|
|
|
|
@when('certificates.available')
def prepare_tls_certificates(tls):
    ''' Request a server certificate from the CA covering every address
    this unit can be reached on. '''
    common_name = hookenv.unit_public_ip()

    # Gather all candidate subject-alt-names for this unit.
    alt_names = {common_name, socket.gethostname()}
    alt_names.update(get_ingress_addresses('db'))
    alt_names.update(get_ingress_addresses('cluster'))

    # add cluster peers as alt names when present
    cluster = endpoint_from_flag('cluster.joined')
    if cluster:
        alt_names.update(cluster.get_db_ingress_addresses())

    certificate_name = hookenv.local_unit().replace('/', '_')
    tls.request_server_cert(common_name, sorted(alt_names), certificate_name)
|
|
|
|
|
|
@hook('upgrade-charm')
def remove_states():
    # stale state cleanup (pre rev6)
    remove_state('etcd.tls.secured')
    remove_state('etcd.ssl.placed')
    remove_state('etcd.ssl.exported')
    # Clearing nrpe state re-triggers update_nrpe_config on upgrade.
    remove_state('etcd.nrpe.configured')
    # force a config re-render in case template changed
    set_state('etcd.rerender-config')
|
|
|
|
|
|
@hook('pre-series-upgrade')
def pre_series_upgrade():
    # Stop etcd (and disable restarts) for the duration of the OS series
    # upgrade; post_series_upgrade resumes it.
    bag = EtcdDatabag()
    host.service_pause(bag.etcd_daemon)
    status.blocked('Series upgrade in progress')
|
|
|
|
|
|
@hook('post-series-upgrade')
def post_series_upgrade():
    # Counterpart to pre_series_upgrade: re-enable and start the daemon.
    bag = EtcdDatabag()
    host.service_resume(bag.etcd_daemon)
|
|
|
|
|
|
@when('snap.installed.etcd')
@when('leadership.is_leader')
@when_any('config.changed.port', 'config.changed.management_port')
@when_not('etcd.installed')
@when_not('upgrade.series.in-progress')
def leader_config_changed():
    ''' The leader executes the runtime configuration update for the cluster,
    as it is the controlling unit. Will render config, close and open ports and
    restart the etcd service.'''
    configuration = hookenv.config()
    previous_port = configuration.previous('port')
    log('Previous port: {0}'.format(previous_port))
    previous_mgmt_port = configuration.previous('management_port')
    log('Previous management port: {0}'.format(previous_mgmt_port))

    # On the very first run there are no previous values; nothing to migrate.
    if previous_port and previous_mgmt_port:
        bag = EtcdDatabag()
        etcdctl = EtcdCtl()
        members = etcdctl.member_list()
        # Iterate over all the members in the list.
        for unit_name in members:
            # Grab the previous peer url and replace the management port.
            peer_urls = members[unit_name]['peer_urls']
            log('Previous peer url: {0}'.format(peer_urls))
            old_port = ':{0}'.format(previous_mgmt_port)
            new_port = ':{0}'.format(configuration.get('management_port'))
            url = peer_urls.replace(old_port, new_port)
            # Update the member's peer_urls with the new ports.
            log(etcdctl.member_update(members[unit_name]['unit_id'], url))
        # Render just the leaders configuration with the new values.
        render_config()
        address = get_ingress_address('cluster')
        # Publish our new peer address so followers can reconfigure.
        leader_set({'leader_address':
                    get_connection_string([address],
                                          bag.management_port)})
        host.service_restart(bag.etcd_daemon)
|
|
|
|
|
|
@when('snap.installed.etcd')
@when_not('leadership.is_leader')
@when_any('config.changed.port', 'config.changed.management_port')
@when_not('etcd.installed')
def follower_config_changed():
    ''' Follower units need to render the configuration file, close and open
    ports, and restart the etcd service. '''
    # The actual work happens in rerender_config, driven by this flag.
    set_state('etcd.rerender-config')
|
|
|
|
|
|
@when('snap.installed.etcd')
@when('config.changed.bind_to_all_interfaces')
@when_not('upgrade.series.in-progress')
def bind_to_all_interfaces_changed():
    # Listen-address config changed; flag a config re-render + restart.
    set_state('etcd.rerender-config')
|
|
|
|
|
|
@when('etcd.rerender-config')
@when_not('upgrade.series.in-progress')
def rerender_config():
    ''' Re-render the etcd configuration and bounce the daemon if it is
    currently running. '''
    databag = EtcdDatabag()
    log('Rendering config file for {0}'.format(databag.unit_name))
    render_config()
    daemon = databag.etcd_daemon
    if host.service_running(daemon):
        host.service_restart(daemon)
    set_app_version()
|
|
|
|
|
|
@when('cluster.joined')
def set_db_ingress_address(cluster):
    ''' Send db ingress address to peers on the cluster relation '''
    # Peers use this to build client connection strings and cert SANs.
    address = get_ingress_address('db')
    cluster.set_db_ingress_address(address)
|
|
|
|
|
|
@when('db.connected')
@when('etcd.ssl.placed')
@when('cluster.joined')
def send_cluster_connection_details(cluster, db):
    ''' Publish the client TLS credentials and the full-cluster connection
    string on the db relation. '''
    # Hand over the client key, certificate and CA first.
    db.set_client_credentials(read_tls_cert('client.key'),
                              read_tls_cert('client.crt'),
                              read_tls_cert('ca.crt'))

    # All peers on the cluster relation, plus our own address, because
    # peers dont self actualize.
    addresses = cluster.get_db_ingress_addresses()
    addresses.append(get_ingress_address('db'))
    addresses.sort()

    # Build and publish a connection string across every member on the
    # configured client port.
    port = hookenv.config().get('port')
    connection_string = get_connection_string(addresses, port)
    db.set_connection_string(connection_string, version=EtcdCtl().version())
|
|
|
|
|
|
@when('db.connected')
@when('etcd.ssl.placed')
@when_not('cluster.joined')
def send_single_connection_details(db):
    ''' Publish client TLS credentials and a single-member connection string
    when no peers have joined yet. '''
    # Hand over the client key, certificate and CA first.
    db.set_client_credentials(read_tls_cert('client.key'),
                              read_tls_cert('client.crt'),
                              read_tls_cert('ca.crt'))

    # With no cluster relation, this unit is the entire membership.
    bag = EtcdDatabag()
    connection_string = get_connection_string([get_ingress_address('db')],
                                              bag.port)
    db.set_connection_string(connection_string, version=EtcdCtl().version())
|
|
|
|
|
|
@when('proxy.connected')
@when('etcd.ssl.placed')
@when_any('etcd.leader.configured', 'cluster.joined')
def send_cluster_details(proxy):
    ''' Sends the peer cluster string to proxy units so they can join and act
    on behalf of the cluster. '''
    proxy.set_client_credentials(read_tls_cert('client.key'),
                                 read_tls_cert('client.crt'),
                                 read_tls_cert('ca.crt'))

    # Build "name=peer_url" entries for each fully-registered member.
    # Members mid-registration carry no peer_urls and are skipped.
    members = EtcdCtl().member_list()
    cluster = ["{}={}".format(m['name'], m['peer_urls'])
               for m in members.values()
               if m.get('peer_urls')]

    proxy.set_cluster_string(','.join(cluster))
|
|
|
|
|
|
@when('config.changed.channel')
def channel_changed():
    ''' Ensure that the config is updated if the channel changes. '''
    # snap_install (also gated on this flag) performs the actual refresh.
    set_state('etcd.rerender-config')
|
|
|
|
|
|
@when('config.changed.channel')
@when_not('etcd.installed')
def snap_install():
    # Install (or move) the etcd snap to the configured channel. A falsy
    # channel from get_target_etcd_channel() means "leave etcd as-is".
    channel = get_target_etcd_channel()
    snap.install('core')
    if channel:
        snap.install('etcd', channel=channel, classic=False)
    # A new snap revision may change the etcd version, so the user env
    # TLS exports must be re-rendered.
    remove_state('etcd.ssl.exported')
|
|
|
|
|
|
@when('etcd.ssl.placed')
@when_not('snap.installed.etcd')
def install_etcd():
    ''' Install the etcd snap once TLS material is in place. A legacy
    (pre-snap, 'etcd.installed') deployment blocks instead, pending the
    manual snap-upgrade action. '''

    if is_state('etcd.installed'):
        msg = 'Manual upgrade required. run-action snap-upgrade.'
        status.blocked(msg)
        return

    status.maintenance('Installing etcd.')

    # A falsy channel means "leave the existing install alone".
    channel = get_target_etcd_channel()
    if channel:
        snap.install('etcd', channel=channel, classic=False)
|
|
|
|
|
|
@when('snap.installed.etcd')
@when_not('etcd.service-restart.configured')
@when_not('upgrade.series.in-progress')
def add_systemd_restart_always():
    # Drop a systemd override so the snap's etcd unit restarts on any exit.
    template = 'templates/service-always-restart.systemd-latest.conf'
    service = 'snap.etcd.etcd'

    try:
        # Get the systemd version
        # NOTE(review): 'systemd' is usually not on PATH ('systemctl
        # --version' is the conventional probe) — if this fails, the
        # except below falls back to the latest template. Confirm intent.
        cmd = ['systemd', '--version']
        output = check_output(cmd).decode('UTF-8')
        line = output.splitlines()[0]
        words = line.split()
        assert words[0] == 'systemd'
        systemd_version = int(words[1])

        # Check for old version (for xenial support)
        if systemd_version < 230:
            template = 'templates/service-always-restart.systemd-229.conf'
    except Exception:
        traceback.print_exc()
        hookenv.log('Failed to detect systemd version, using latest template',
                    level='ERROR')

    # Install the drop-in, reload systemd, and bounce the service so the
    # new restart policy takes effect immediately.
    dest_dir = '/etc/systemd/system/{}.service.d'.format(service)
    os.makedirs(dest_dir, exist_ok=True)
    copyfile(template, '{}/always-restart.conf'.format(dest_dir))
    check_call(['systemctl', 'daemon-reload'])
    host.service_restart('{}.service'.format(service))
    set_state('etcd.service-restart.configured')
|
|
|
|
|
|
@when('snap.installed.etcd')
@when('etcd.ssl.placed')
@when('cluster.joined')
@when_not('leadership.is_leader')
@when_not('etcd.registered')
@when_not('etcd.installed')
@when_not('upgrade.series.in-progress')
def register_node_with_leader(cluster):
    '''
    Control flow mechanism to perform self registration with the leader.

    Before executing self registration, we must adhere to the nature of offline
    static turnup rules. If we find a GUID in the member list without peering
    information the unit will enter a race condition and must wait for a clean
    status output before we can progress to self registration.

    Note: the cluster argument is unused; force_rejoin() also calls this
    directly with None.
    '''
    etcdctl = EtcdCtl()
    bag = EtcdDatabag()
    leader_address = leader_get('leader_address')
    bag.leader_address = leader_address

    try:
        # Check if we are already registered. Unregister ourselves if we are so
        # we can register from scratch.
        peer_url = 'https://%s:%s' % (bag.cluster_address, bag.management_port)
        members = etcdctl.member_list(leader_address)
        for _, member in members.items():
            if member['peer_urls'] == peer_url:
                log('Found member that matches our peer URL. Unregistering...')
                etcdctl.unregister(member['unit_id'], leader_address)

        # Now register.
        resp = etcdctl.register(bag.__dict__)
        bag.set_cluster(resp['cluster'])
    except EtcdCtl.CommandFailed:
        # Registration can legitimately race other joining units; leave
        # 'etcd.registered' unset so the handler runs again next hook.
        log('etcdctl.register failed, will retry')
        msg = 'Waiting to retry etcd registration'
        status.waiting(msg)
        return

    # Registration succeeded: render config, restart, and expose the port.
    render_config(bag)
    host.service_restart(bag.etcd_daemon)
    open_port(bag.port)
    set_state('etcd.registered')
|
|
|
|
|
|
@when('etcd.ssl.placed')
@when('leadership.is_leader')
@when_not('etcd.leader.configured')
@when_not('etcd.installed')
@when_not('upgrade.series.in-progress')
def initialize_new_leader():
    ''' Create an initial cluster string to bring up a single member cluster of
    etcd, and set the leadership data so the followers can join this one. '''
    bag = EtcdDatabag()
    # Re-assigning the token exercises the databag setter so the token is
    # persisted for followers.
    bag.token = bag.token
    bag.set_cluster_state('new')
    address = get_ingress_address('cluster')
    cluster_connection_string = get_connection_string([address],
                                                      bag.management_port)
    bag.set_cluster("{}={}".format(bag.unit_name, cluster_connection_string))

    render_config(bag)
    host.service_restart(bag.etcd_daemon)

    # sorry, some hosts need this. The charm races with systemd and wins.
    time.sleep(2)

    # Check health status before we say we are good.
    # NB: do not name this local 'status' -- that would shadow the imported
    # charms.layer.status module used below (the original code did exactly
    # that, crashing on status.blocked(), and tested 'unhealthy' against the
    # dict's keys instead of its 'status' value).
    etcdctl = EtcdCtl()
    health = etcdctl.cluster_health()
    if 'unhealthy' in health.get('status', ''):
        status.blocked('Cluster not healthy.')
        return
    # We have a healthy leader, broadcast initial data-points for followers
    open_port(bag.port)
    leader_connection_string = get_connection_string([address],
                                                     bag.port)
    leader_set({'leader_address': leader_connection_string,
                'cluster': bag.cluster})

    # set registered state since if we ever become a follower, we will not need
    # to re-register
    set_state('etcd.registered')

    # finish bootstrap delta and set configured state
    set_state('etcd.leader.configured')
|
|
|
|
|
|
@when('snap.installed.etcd')
@when('snap.refresh.set')
@when('leadership.is_leader')
def process_snapd_timer():
    ''' Set the snapd refresh timer on the leader so all cluster members
    (present and future) will refresh near the same time. '''
    # Get the current snapd refresh timer; we know layer-snap has set this
    # when the 'snap.refresh.set' flag is present.
    timer = snap.get(snapname='core', key='refresh.timer').decode('utf-8').strip()
    if not timer:
        # The core snap timer is empty. This likely means a subordinate timer
        # reset ours. Try to set it back to a previously leader-set value,
        # falling back to config if needed. Luckily, this should only happen
        # during subordinate install, so this should remain stable afterward.
        timer = leader_get('snapd_refresh') or hookenv.config('snapd_refresh')
        snap.set_refresh_timer(timer)

    # Ensure we have the timer known by snapd (it may differ from config).
    timer = snap.get(snapname='core', key='refresh.timer').decode('utf-8').strip()

    # The first time through, data_changed will be true. Subsequent calls
    # should only update leader data if something changed.
    if data_changed('etcd_snapd_refresh', timer):
        log('setting snapd_refresh timer to: {}'.format(timer))
        leader_set({'snapd_refresh': timer})
|
|
|
|
|
|
@when('snap.installed.etcd')
@when('snap.refresh.set')
@when('leadership.changed.snapd_refresh')
@when_not('leadership.is_leader')
def set_snapd_timer():
    ''' Set the snapd refresh.timer on non-leader cluster members. '''
    # NB: This method should only be run when 'snap.refresh.set' is present.
    # Layer-snap will always set a core refresh.timer, which may not be the
    # same as our leader. Gating with 'snap.refresh.set' ensures layer-snap
    # has finished and we are free to set our config to the leader's timer.
    refresh_timer = leader_get('snapd_refresh') or ''  # None will cause error
    log('setting snapd_refresh timer to: {}'.format(refresh_timer))
    snap.set_refresh_timer(refresh_timer)
|
|
|
|
|
|
@when('tls_client.ca.saved', 'tls_client.server.key.saved',
      'tls_client.server.certificate.saved',
      'tls_client.client.certificate.saved')
@when_not('etcd.ssl.placed')
def tls_state_control():
    ''' Condense the individual TLS-cert flags into the single
    etcd.ssl.placed state we can gate on before progressing with secure
    setup. Also ensures users of the system can read the TLS certificates
    by handing the conf dir to the ubuntu group. '''

    conf_dir = EtcdDatabag().etcd_conf_dir
    # The conf dir appears when the snap first runs; wait for it.
    if not os.path.isdir(conf_dir):
        hookenv.log('Waiting for etcd conf creation.')
        return
    check_call(['chown', '-R', 'root:ubuntu', conf_dir])
    set_state('etcd.ssl.placed')
|
|
|
|
|
|
@when('etcd.ssl.placed')
@when_any('tls_client.ca.written',
          'tls_client.server.certificate.written',
          'tls_client.client.certificate.written')
@when_not('upgrade.series.in-progress')
def tls_update():
    ''' Handle changes to the TLS data by ensuring that the service is
    restarted.
    '''
    # ensure config is updated with new certs and service restarted
    bag = EtcdDatabag()
    render_config(bag)
    host.service_restart(bag.etcd_daemon)

    # ensure that certs are re-echoed to the db relations
    # (clearing etcd.ssl.placed re-runs tls_state_control and downstream
    # handlers; clearing the .written flags re-arms this handler)
    remove_state('etcd.ssl.placed')
    remove_state('tls_client.ca.written')
    remove_state('tls_client.server.certificate.written')
    remove_state('tls_client.client.certificate.written')
|
|
|
|
|
|
@when('snap.installed.etcd')
@when_not('etcd.ssl.exported')
def render_default_user_ssl_exports():
    ''' Add secure credentials to default user environment configs,
    transparently adding TLS '''
    opts = layer.options('tls-client')

    ca_path = opts['ca_certificate_path']
    client_crt = opts['client_certificate_path']
    client_key = opts['client_key_path']

    etcd_ver = etcd_version()
    if etcd_ver == 'n/a':
        hookenv.log('Unable to determine version format for etcd SSL config',
                    level=hookenv.ERROR)
        return
    # Tolerate two-part version strings ("3.4") as well as "3.4.22"; the
    # previous 3-way unpack crashed on the former.
    try:
        parts = etcd_ver.split('.')
        major, minor = int(parts[0]), int(parts[1])
    except (IndexError, ValueError):
        hookenv.log('Unable to determine version format for etcd SSL config',
                    level=hookenv.ERROR)
        return

    # etcdctl renamed its TLS env vars at 3.3. Compare as a version tuple:
    # the previous 'major >= 3 and minor >= 3' test wrongly picked the
    # legacy names for versions like 4.0.
    if (major, minor) >= (3, 3):
        evars = [
            'export ETCDCTL_KEY={}\n'.format(client_key),
            'export ETCDCTL_CERT={}\n'.format(client_crt),
            'export ETCDCTL_CACERT={}\n'.format(ca_path)
        ]
    else:
        evars = [
            'export ETCDCTL_KEY_FILE={}\n'.format(client_key),
            'export ETCDCTL_CERT_FILE={}\n'.format(client_crt),
            'export ETCDCTL_CA_FILE={}\n'.format(ca_path)
        ]

    # Both root and the default ubuntu user get transparent TLS etcdctl.
    with open('/home/ubuntu/.bash_aliases', 'w') as fp:
        fp.writelines(evars)
    with open('/root/.bash_aliases', 'w') as fp:
        fp.writelines(evars)

    set_state('etcd.ssl.exported')
|
|
|
|
|
|
def force_rejoin():
    """Wipe local data and rejoin new cluster formed by leader unit

    This action is required if leader unit performed snapshot restore. All
    other members must remove their local data and previous cluster
    identities and join newly formed, restored, cluster.
    """
    log('Wiping local storage and rejoining cluster')
    conf = EtcdDatabag()
    host.service_stop(conf.etcd_daemon)
    clear_flag('etcd.registered')
    # Remove the member data directory so etcd forgets its old identity.
    etcd_data = os.path.join(conf.storage_path(), 'member')
    if os.path.exists(etcd_data):
        shutil.rmtree(etcd_data)
    # Up to 11 registration attempts before giving up until the next hook.
    for _ in range(11):
        # We need randomized back-off timer because only one unit can be
        # joining at the same time
        time.sleep(random.randint(1, 10))
        register_node_with_leader(None)
        if is_flag_set('etcd.registered'):
            log('Successfully rejoined the cluster')
            break
|
|
|
|
|
|
@when('leadership.changed.force_rejoin')
@when_not('leadership.is_leader')
def force_rejoin_requested():
    # Leader signalled (post snapshot-restore) that followers must wipe
    # local state and re-register; refresh status afterwards.
    force_rejoin()
    check_cluster_health()
|
|
|
|
|
|
@hook('cluster-relation-broken')
def perform_self_unregistration(cluster=None):
    ''' Attempt self removal during unit teardown. '''
    etcdctl = EtcdCtl()
    leader_address = leader_get('leader_address')
    unit_name = os.getenv('JUJU_UNIT_NAME').replace('/', '')
    members = etcdctl.member_list()
    # The unit may never have completed registration (e.g. failed
    # bootstrap); indexing blindly raised KeyError and failed the
    # teardown hook. Nothing to unregister in that case.
    if unit_name not in members:
        log('Unit not present in member list; skipping unregistration')
        return
    # Self Unregistration
    etcdctl.unregister(members[unit_name]['unit_id'], leader_address)
|
|
|
|
|
|
@hook('data-storage-attached')
def format_and_mount_storage():
    ''' This allows users to request persistent volumes from the cloud provider
    for the purposes of disaster recovery.

    Formats the attached block device (ext4), mounts it one level above the
    etcd data dir, migrates any existing data onto it, persists the mount in
    fstab, and restarts etcd. '''
    set_state('data.volume.attached')
    # Query juju for the information about the block storage
    device_info = storage_get()
    block = device_info['location']
    bag = EtcdDatabag()
    bag.cluster = leader_get('cluster')
    # the databag has behavior that keeps the path updated.
    # Reference the default path from layer_options.
    etcd_opts = layer.options('etcd')
    # Split the tail of the path to mount the volume 1 level before
    # the data directory.
    tail = os.path.split(bag.etcd_data_dir)[0]

    # Guard: never reformat a device that is already mounted.
    if volume_is_mounted(block):
        hookenv.log('Device is already attached to the system.')
        hookenv.log('Refusing to take action against {}'.format(block))
        return

    # Format the device in non-interactive mode
    cmd = ['mkfs.ext4', device_info['location'], '-F']
    hookenv.log('Creating filesystem on {}'.format(device_info['location']))
    hookenv.log('With command: {}'.format(' '.join(cmd)))
    check_call(cmd)

    # halt etcd to perform the data-store migration
    host.service_stop(bag.etcd_daemon)

    os.makedirs(tail, exist_ok=True)
    mount_volume(block, tail)
    # handle first run during early-attach storage, pre-config-changed hook.
    os.makedirs(bag.etcd_data_dir, exist_ok=True)

    # Only attempt migration if directory exists
    if os.path.isdir(etcd_opts['etcd_data_dir']):
        migrate_path = "{}/".format(etcd_opts['etcd_data_dir'])
        output_path = "{}/".format(bag.etcd_data_dir)
        cmd = ['rsync', '-azp', migrate_path, output_path]

        hookenv.log('Detected existing data, migrating to new location.')
        hookenv.log('With command: {}'.format(' '.join(cmd)))

        check_call(cmd)

    with open('/etc/fstab', 'r') as fp:
        contents = fp.readlines()

    found = 0
    # scan fstab for the device
    for line in contents:
        if block in line:
            found = found + 1

    # if device not in fstab, append so it persists through reboots
    if not found > 0:
        append = "{0} {1} ext4 defaults 0 0".format(block, tail)  # noqa
        with open('/etc/fstab', 'a') as fp:
            fp.writelines([append])

    # Finally re-render the configuration and resume operation
    render_config(bag)
    host.service_restart(bag.etcd_daemon)
|
|
|
|
|
|
def read_tls_cert(cert):
    ''' Reads the contents of the layer-configured certificate path indicated
    by cert. Returns the utf-8 decoded contents of the file '''
    # Load the layer options for configured paths
    opts = layer.options('tls-client')

    # Map the short names callers use onto the layer-configured paths.
    cert_paths = {
        'ca.crt': opts['ca_certificate_path'],
        'server.crt': opts['server_certificate_path'],
        'server.key': opts['server_key_path'],
        'client.crt': opts['client_certificate_path'],
        'client.key': opts['client_key_path'],
    }

    # If requesting a cert we dont know about, raise a ValueError
    try:
        path = cert_paths[cert]
    except KeyError:
        raise ValueError('No known certificate {}'.format(cert))

    # Read the contents of the cert and return it in utf-8 encoded text
    with open(path, 'r') as fp:
        return fp.read()
|
|
|
|
|
|
@when('nrpe-external-master.available')
@when_not('nrpe-external-master.initial-config')
def initial_nrpe_config(nagios=None):
    # One-time bootstrap of the nagios checks when the relation first joins;
    # subsequent updates go through update_nrpe_config directly.
    set_state('nrpe-external-master.initial-config')
    update_nrpe_config(nagios)
|
|
|
|
|
|
@when_any('config.changed.nagios_context',
          'config.changed.nagios_servicegroups')
def force_update_nrpe_config():
    # Dropping this state re-triggers update_nrpe_config with new settings.
    remove_state('etcd.nrpe.configured')
|
|
|
|
|
|
@when('etcd.installed')
@when('nrpe-external-master.available')
@when_not('etcd.nrpe.configured')
def update_nrpe_config(unused=None):
    ''' Install and register the nagios checks: a systemd service check for
    the etcd daemon, and a custom check that alerts on raised etcd alarms
    (fed by a cron-populated cache file). '''
    # List of systemd services that will be checked
    services = ('snap.etcd.etcd',)

    # The current nrpe-external-master interface doesn't handle a lot of logic,
    # use the charm-helpers code for now.
    hostname = nrpe.get_nagios_hostname()
    current_unit = nrpe.get_nagios_unit_name()
    nrpe_setup = nrpe.NRPE(hostname=hostname, primary=False)
    # add our first check, to alert on service failure
    nrpe.add_init_service_checks(nrpe_setup, services, current_unit)

    # add the cron job to populate the cache for our second check
    # (we cache the output of 'etcdctl alarm list' to minimise overhead)
    with open("templates/check_etcd-alarms.cron") as fp:
        write_file(
            path="/etc/cron.d/check_etcd-alarms",
            content=fp.read().encode(),
            owner="root",
            perms=0o644,
        )

    # create an empty output file for the above
    write_file(
        path="/var/lib/nagios/etcd-alarm-list.txt",
        content="",
        owner="root",
        perms=0o644,
    )

    # install the NRPE script for the above
    with open("templates/check_etcd-alarms.py") as fp:
        write_file(
            path="/usr/lib/nagios/plugins/check_etcd-alarms.py",
            content=fp.read().encode(),
            owner="root",
            perms=0o755,
        )

    # define our second check, to alert on etcd alarm status
    nrpe_setup.add_check(
        "etcd-alarms",
        "Verify etcd has no raised alarms",
        "/usr/lib/nagios/plugins/check_etcd-alarms.py",
    )

    nrpe_setup.write()
    set_state('etcd.nrpe.configured')
|
|
|
|
|
|
@when_not('nrpe-external-master.available')
@when('nrpe-external-master.initial-config')
def remove_nrpe_config(nagios=None):
    ''' Drop the nagios service checks when the nrpe relation goes away. '''
    remove_state('nrpe-external-master.initial-config')

    # The current nrpe-external-master interface doesn't handle a lot of
    # logic, use the charm-helpers code for now.
    nrpe_setup = nrpe.NRPE(hostname=nrpe.get_nagios_hostname(), primary=False)

    # Remove the check for each systemd service we registered.
    for service in ('snap.etcd.etcd',):
        nrpe_setup.remove_check(shortname=service)
|
|
|
|
|
|
def volume_is_mounted(volume):
    ''' Takes a hardware path and returns true/false if it is mounted '''
    # df limited to ext4 lists every mounted ext4 filesystem; a simple
    # substring match on the device path is sufficient here.
    mounted = check_output(['df', '-t', 'ext4']).decode('utf-8')
    return volume in mounted
|
|
|
|
|
|
def mount_volume(volume, location):
    ''' Takes a device path and mounts it to location '''
    hookenv.log("Mounting {0} to {1}".format(volume, location))
    # Raises CalledProcessError on mount failure.
    check_call(['mount', volume, location])
|
|
|
|
|
|
def unmount_path(location):
    ''' Unmounts a mounted volume at path '''
    hookenv.log("Unmounting {0}".format(location))
    # Raises CalledProcessError if the path is not mounted.
    check_call(['umount', location])
|
|
|
|
|
|
def close_open_ports():
    ''' Close the previous port and open the port from configuration. '''
    configuration = hookenv.config()
    previous_port = configuration.previous('port')
    port = configuration.get('port')
    # Nothing to do on first run, or when the port is unchanged.
    if previous_port is None or previous_port == port:
        return
    log('The port changed; closing {0} opening {1}'.format(previous_port,
                                                           port))
    close_port(previous_port)
    open_port(port)
|
|
|
|
|
|
def install(src, tgt):
    ''' This method wraps the bash "install" command '''
    # shlex.split keeps quoting semantics consistent with a shell.
    command = 'install {} {}'.format(src, tgt)
    return check_call(split(command))
|
|
|
|
|
|
def render_config(bag=None):
    ''' Render the etcd configuration file appropriate for the installed
    etcd major version, then reconcile the open client port. '''
    bag = bag or EtcdDatabag()

    move_etcd_data_to_standard_location()

    conf_dir = bag.etcd_conf_dir
    v2_conf_path = "{}/etcd.conf".format(conf_dir)
    v3_conf_path = "{}/etcd.conf.yml".format(conf_dir)

    # probe for 2.x compatibility
    if etcd_version().startswith('2.'):
        render('etcd2.conf', v2_conf_path, bag.__dict__,
               owner='root', group='root')
    else:
        # default to 3.x template behavior
        render('etcd3.conf', v3_conf_path, bag.__dict__,
               owner='root', group='root')
        # v3 will fail if the v2 config is left in place
        if os.path.exists(v2_conf_path):
            os.remove(v2_conf_path)

    # Close the previous client port and open the new one.
    close_open_ports()
    remove_state('etcd.rerender-config')
|
|
|
|
|
|
def etcd_version():
    ''' This method surfaces the version from etcdctl.

    Returns the version string (e.g. '3.4.22') or 'n/a' when it cannot be
    determined. '''
    raw_output = None
    try:
        # try v3
        raw_output = check_output(
            ['/snap/bin/etcd.etcdctl', 'version'],
            env={'ETCDCTL_API': '3'}
        ).decode('utf-8').strip()
        if "No help topic for 'version'" in raw_output:
            # handle v2 (older etcdctl uses a --version flag instead)
            raw_output = check_output(
                ['/snap/bin/etcd.etcdctl', '--version']
            ).decode('utf-8').strip()
        for line in raw_output.splitlines():
            if 'etcdctl version' in line:
                # "etcdctl version: 3.0.17" or "etcdctl version 2.3.8"
                version = line.split()[-1]
                return version
        hookenv.log('Unable to find etcd version: {}'.format(raw_output),
                    level=hookenv.ERROR)
        return 'n/a'
    except (ValueError, CalledProcessError):
        hookenv.log('Failed to get etcd version:\n'
                    '{}'.format(traceback.format_exc()), level=hookenv.ERROR)
        return 'n/a'
|
|
|
|
|
|
def move_etcd_data_to_standard_location():
    ''' Moves etcd data to the standard location if it's not already located
    there. This is necessary when generating new etcd config after etcd has
    been upgraded from version 2.3 to 3.x.
    '''
    bag = EtcdDatabag()
    conf_path = bag.etcd_conf_dir + '/etcd.conf.yml'
    # No v3 config yet means there's no data-dir to reconcile.
    if not os.path.exists(conf_path):
        return
    with open(conf_path) as f:
        conf = yaml.safe_load(f)
    data_dir = conf['data-dir']
    desired_data_dir = bag.etcd_data_dir
    if data_dir != desired_data_dir:
        log('Moving etcd data from %s to %s' % (data_dir, desired_data_dir))
        # Stop etcd for the duration of the move so the store stays
        # consistent, then update the config to match and restart.
        host.service_stop('snap.etcd.etcd')
        for filename in os.listdir(data_dir):
            os.rename(
                data_dir + '/' + filename,
                desired_data_dir + '/' + filename
            )
        os.rmdir(data_dir)
        conf['data-dir'] = desired_data_dir
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
        host.service_start('snap.etcd.etcd')
|