Charmed-Kubernetes/etcd/reactive/etcd.py

942 lines
32 KiB
Python

#!/usr/bin/python3
from charms import layer
from charms.layer import snap
from charms.reactive import endpoint_from_flag
from charms.reactive import when
from charms.reactive import when_any
from charms.reactive import when_not
from charms.reactive import is_state
from charms.reactive import set_state
from charms.reactive import is_flag_set
from charms.reactive import clear_flag
from charms.reactive import remove_state
from charms.reactive import hook
from charms.reactive.helpers import data_changed
from charms.templating.jinja2 import render
from charmhelpers.core.hookenv import log
from charmhelpers.core.hookenv import leader_set
from charmhelpers.core.hookenv import leader_get
from charmhelpers.core.hookenv import storage_get
from charmhelpers.core.hookenv import application_version_set
from charmhelpers.core.hookenv import open_port
from charmhelpers.core.hookenv import close_port
from charmhelpers.core.host import write_file
from charmhelpers.core import hookenv
from charmhelpers.core import host
from charmhelpers.contrib.charmsupport import nrpe
from charms.layer import status
from etcdctl import EtcdCtl
from etcdctl import get_connection_string
from etcd_databag import EtcdDatabag
from etcd_lib import get_ingress_address, get_ingress_addresses
from shlex import split
from subprocess import check_call
from subprocess import check_output
from subprocess import CalledProcessError
from shutil import copyfile
import os
import charms.leadership # noqa
import socket
import time
import traceback
import yaml
import shutil
import random
# Layer Note: the @when_not etcd.installed state checks relate to
# a boundary that was superimposed by the etcd-24 release, which added support
# for snaps. Snapped etcd is now the only supported mechanism by this charm.
# References to this state will be wiped sometime within the next 10 releases
# of the charm.
# Override the default nagios shortname regex to allow periods, which we
# need because our bin names contain them (e.g. 'snap.foo.daemon'). The
# default regex in charmhelpers doesn't allow periods, but nagios itself does.
# This is a module-level side effect: it replaces the class attribute on
# nrpe.Check when this file is imported. Inside the character class the '-'
# that follows the 0-9 range is a literal hyphen, not the start of a range.
nrpe.Check.shortname_re = r'[\.A-Za-z0-9-_]+$'
def get_target_etcd_channel():
    '''
    Work out which snap channel etcd should be installed from.

    An explicit ``channel`` config value is returned unchanged. With
    ``auto``, an already-installed etcd snap is left alone (signalled by
    ``False``) and a fresh install gets the default channel.

    :return: String snap channel, or False when the installed snap should
             be left as it is
    '''
    requested = hookenv.config('channel')
    if requested != 'auto':
        return requested
    # 'auto': never move an existing install to a different channel.
    return False if snap.is_installed('etcd') else '3.4/stable'
@when('etcd.installed')
def snap_upgrade_notice():
    ''' Block units that still carry the legacy 'etcd.installed' state. '''
    notice = 'Manual migration required. http://bit.ly/2oznAUZ'
    status.blocked(notice)
@when_any('etcd.registered', 'etcd.leader.configured')
@when_not('etcd.installed')
@when_not('upgrade.series.in-progress')
def check_cluster_health():
    ''' Publish the cluster health and known peer count as unit status. '''
    ctl = EtcdCtl()
    report = ctl.cluster_health()

    # Determine if the unit is healthy or unhealthy
    unit_health = 'UnHealthy' if 'unhealthy' in report['status'] else 'Healthy'

    # Determine the peer count; any failure listing members surfaces as
    # an errored unit with 0 peers.
    try:
        peers = len(ctl.member_list())
    except Exception:
        unit_health = 'Errored'
        peers = 0

    plural = '' if peers == 1 else 's'
    status.active(
        '{0} with {1} known peer{2}'.format(unit_health, peers, plural))
@when('snap.installed.etcd')
@when_not('etcd.installed')
def set_app_version():
    ''' Report the etcd version as the juju application version. '''
    # The snap doesn't place an etcd alias on disk, so the version is
    # inferred from etcdctl; the snap distributes both in lockstep.
    version = etcd_version()
    application_version_set(version)
@when_not('certificates.available')
def missing_relation_notice():
    ''' Block the unit while no certificate authority is related. '''
    message = 'Missing relation to certificate authority.'
    status.blocked(message)
@when('certificates.available')
def prepare_tls_certificates(tls):
    ''' Request a server certificate covering the addresses of this unit.

    The common name is the unit's public IP. The alternative names are the
    public IP, the 'db' and 'cluster' ingress addresses, the hostname and,
    when the cluster relation is joined, the peers' db ingress addresses.
    '''
    common_name = hookenv.unit_public_ip()

    alt_names = {common_name}
    for endpoint in ('db', 'cluster'):
        alt_names.update(get_ingress_addresses(endpoint))
    alt_names.add(socket.gethostname())

    # add cluster peers as alt names when present
    peers = endpoint_from_flag('cluster.joined')
    if peers:
        alt_names.update(peers.get_db_ingress_addresses())

    certificate_name = hookenv.local_unit().replace('/', '_')
    # Sorted so the request is stable regardless of set ordering.
    tls.request_server_cert(common_name, sorted(alt_names), certificate_name)
@hook('upgrade-charm')
def remove_states():
    ''' Drop stale states on charm upgrade and schedule a config re-render. '''
    # stale state cleanup (pre rev6)
    stale_states = (
        'etcd.tls.secured',
        'etcd.ssl.placed',
        'etcd.ssl.exported',
        'etcd.nrpe.configured',
    )
    for stale in stale_states:
        remove_state(stale)
    # force a config re-render in case template changed
    set_state('etcd.rerender-config')
@hook('pre-series-upgrade')
def pre_series_upgrade():
    ''' Pause the etcd daemon and mark the unit blocked for the upgrade. '''
    daemon = EtcdDatabag().etcd_daemon
    host.service_pause(daemon)
    status.blocked('Series upgrade in progress')
@hook('post-series-upgrade')
def post_series_upgrade():
    ''' Resume the etcd daemon once the series upgrade has finished. '''
    daemon = EtcdDatabag().etcd_daemon
    host.service_resume(daemon)
@when('snap.installed.etcd')
@when('leadership.is_leader')
@when_any('config.changed.port', 'config.changed.management_port')
@when_not('etcd.installed')
@when_not('upgrade.series.in-progress')
def leader_config_changed():
    ''' Apply a port change across the cluster from the leader.

    Every member's peer URL is rewritten to use the new management port,
    then the leader's own config is rendered, the new leader address is
    published through leadership data and the etcd service is restarted.
    Nothing happens unless both previous port values are known.
    '''
    config = hookenv.config()
    previous_port = config.previous('port')
    log('Previous port: {0}'.format(previous_port))
    previous_mgmt_port = config.previous('management_port')
    log('Previous management port: {0}'.format(previous_mgmt_port))

    if not (previous_port and previous_mgmt_port):
        return

    bag = EtcdDatabag()
    etcdctl = EtcdCtl()
    members = etcdctl.member_list()

    for member in members.values():
        # Grab the previous peer url and replace the management port.
        peer_urls = member['peer_urls']
        log('Previous peer url: {0}'.format(peer_urls))
        old_suffix = ':{0}'.format(previous_mgmt_port)
        new_suffix = ':{0}'.format(config.get('management_port'))
        updated_url = peer_urls.replace(old_suffix, new_suffix)
        # Update the member's peer_urls with the new ports.
        log(etcdctl.member_update(member['unit_id'], updated_url))

    # Render just the leaders configuration with the new values.
    render_config()
    address = get_ingress_address('cluster')
    leader_address = get_connection_string([address], bag.management_port)
    leader_set({'leader_address': leader_address})
    host.service_restart(bag.etcd_daemon)
@when('snap.installed.etcd')
@when_not('leadership.is_leader')
@when_any('config.changed.port', 'config.changed.management_port')
@when_not('etcd.installed')
def follower_config_changed():
    ''' Schedule a config re-render on a follower after a port change. '''
    rerender_flag = 'etcd.rerender-config'
    set_state(rerender_flag)
@when('snap.installed.etcd')
@when('config.changed.bind_to_all_interfaces')
@when_not('upgrade.series.in-progress')
def bind_to_all_interfaces_changed():
    ''' Schedule a config re-render when bind_to_all_interfaces changes. '''
    rerender_flag = 'etcd.rerender-config'
    set_state(rerender_flag)
@when('etcd.rerender-config')
@when_not('upgrade.series.in-progress')
def rerender_config():
    ''' Re-render the config, restart a running daemon, refresh version. '''
    bag = EtcdDatabag()
    log('Rendering config file for {0}'.format(bag.unit_name))
    render_config()

    # Only a daemon that is already running gets restarted here.
    daemon = bag.etcd_daemon
    if host.service_running(daemon):
        host.service_restart(daemon)

    set_app_version()
@when('cluster.joined')
def set_db_ingress_address(cluster):
    ''' Publish this unit's db ingress address on the cluster relation. '''
    db_address = get_ingress_address('db')
    cluster.set_db_ingress_address(db_address)
@when('db.connected')
@when('etcd.ssl.placed')
@when('cluster.joined')
def send_cluster_connection_details(cluster, db):
    ''' Hand clients on the db relation the client TLS credentials and a
    connection string covering every known cluster member. '''
    client_cert = read_tls_cert('client.crt')
    client_key = read_tls_cert('client.key')
    ca_cert = read_tls_cert('ca.crt')
    etcdctl = EtcdCtl()

    # Set the key, cert, and ca on the db relation
    db.set_client_credentials(client_key, client_cert, ca_cert)

    client_port = hookenv.config().get('port')

    # Peers from the cluster relation do not include this unit, so our own
    # db address is appended before sorting.
    members = cluster.get_db_ingress_addresses()
    members.append(get_ingress_address('db'))
    members.sort()

    # One connection string with all the members on the configured port.
    db.set_connection_string(get_connection_string(members, client_port),
                             version=etcdctl.version())
@when('db.connected')
@when('etcd.ssl.placed')
@when_not('cluster.joined')
def send_single_connection_details(db):
    ''' Hand clients on the db relation the client TLS credentials and a
    connection string for this unit alone (no cluster peers joined). '''
    client_cert = read_tls_cert('client.crt')
    client_key = read_tls_cert('client.key')
    ca_cert = read_tls_cert('ca.crt')
    etcdctl = EtcdCtl()

    # Set the key, cert, and ca on the db relation
    db.set_client_credentials(client_key, client_cert, ca_cert)

    bag = EtcdDatabag()
    # This unit is the only member of the connection string.
    members = [get_ingress_address('db')]
    db.set_connection_string(get_connection_string(members, bag.port),
                             version=etcdctl.version())
@when('proxy.connected')
@when('etcd.ssl.placed')
@when_any('etcd.leader.configured', 'cluster.joined')
def send_cluster_details(proxy):
    ''' Send the client credentials and the "name=peer_urls" cluster string
    to proxy units so they can join and act on behalf of the cluster. '''
    client_cert = read_tls_cert('client.crt')
    client_key = read_tls_cert('client.key')
    ca_cert = read_tls_cert('ca.crt')
    proxy.set_client_credentials(client_key, client_cert, ca_cert)

    # format a list of cluster participants
    etcdctl = EtcdCtl()
    participants = []
    for member in etcdctl.member_list().values():
        # A member without peer URLs is possibly still registering: skip it.
        if not member.get('peer_urls'):
            continue
        participants.append(
            '{}={}'.format(member['name'], member['peer_urls']))

    proxy.set_cluster_string(','.join(participants))
@when('config.changed.channel')
def channel_changed():
    ''' Schedule a config re-render whenever the snap channel changes. '''
    rerender_flag = 'etcd.rerender-config'
    set_state(rerender_flag)
@when('config.changed.channel')
@when_not('etcd.installed')
def snap_install():
    ''' Install the core snap and, when a target channel applies, the etcd
    snap from that channel. '''
    target_channel = get_target_etcd_channel()
    snap.install('core')
    if not target_channel:
        # No target channel: the installed etcd snap is left alone.
        return
    snap.install('etcd', channel=target_channel, classic=False)
    # Clear the flag so the default-user SSL exports are written again.
    remove_state('etcd.ssl.exported')
@when('etcd.ssl.placed')
@when_not('snap.installed.etcd')
def install_etcd():
    ''' Install the etcd snap from the target channel.

    Units still carrying the legacy 'etcd.installed' state are blocked and
    must be upgraded manually instead.
    '''
    if is_state('etcd.installed'):
        status.blocked('Manual upgrade required. run-action snap-upgrade.')
        return

    status.maintenance('Installing etcd.')
    target_channel = get_target_etcd_channel()
    if not target_channel:
        return
    snap.install('etcd', channel=target_channel, classic=False)
@when('snap.installed.etcd')
@when_not('etcd.service-restart.configured')
@when_not('upgrade.series.in-progress')
def add_systemd_restart_always():
    ''' Install a systemd drop-in for the etcd service from the
    always-restart template matching the local systemd version. '''
    unit = 'snap.etcd.etcd'
    # Default to the template for current systemd; downgraded below if the
    # detected version is older.
    template = 'templates/service-always-restart.systemd-latest.conf'

    try:
        # The first output line is expected to be "systemd <version> ...".
        version_output = check_output(['systemd', '--version']).decode('UTF-8')
        fields = version_output.splitlines()[0].split()
        assert fields[0] == 'systemd'
        # Check for old version (for xenial support)
        if int(fields[1]) < 230:
            template = 'templates/service-always-restart.systemd-229.conf'
    except Exception:
        # Best effort: any detection failure keeps the latest template.
        traceback.print_exc()
        hookenv.log('Failed to detect systemd version, using latest template',
                    level='ERROR')

    dropin_dir = '/etc/systemd/system/{}.service.d'.format(unit)
    os.makedirs(dropin_dir, exist_ok=True)
    copyfile(template, '{}/always-restart.conf'.format(dropin_dir))
    check_call(['systemctl', 'daemon-reload'])
    host.service_restart('{}.service'.format(unit))
    set_state('etcd.service-restart.configured')
@when('snap.installed.etcd')
@when('etcd.ssl.placed')
@when('cluster.joined')
@when_not('leadership.is_leader')
@when_not('etcd.registered')
@when_not('etcd.installed')
@when_not('upgrade.series.in-progress')
def register_node_with_leader(cluster):
    '''
    Register this unit with the cluster through the leader.

    Any existing member whose peer URL matches ours is unregistered first so
    registration starts from scratch. When etcdctl fails, the unit goes into
    a waiting status and returns without setting 'etcd.registered', leaving
    the handler eligible to run again. On success the config is rendered,
    the service restarted, the client port opened and 'etcd.registered' set.

    The ``cluster`` argument is not used by the body.
    '''
    etcdctl = EtcdCtl()
    bag = EtcdDatabag()
    leader_address = leader_get('leader_address')
    bag.leader_address = leader_address

    try:
        # Check if we are already registered. Unregister ourselves if we are
        # so we can register from scratch.
        own_peer_url = 'https://{}:{}'.format(bag.cluster_address,
                                              bag.management_port)
        known_members = etcdctl.member_list(leader_address)
        for member in known_members.values():
            if member['peer_urls'] != own_peer_url:
                continue
            log('Found member that matches our peer URL. Unregistering...')
            etcdctl.unregister(member['unit_id'], leader_address)

        # Now register.
        response = etcdctl.register(bag.__dict__)
        bag.set_cluster(response['cluster'])
    except EtcdCtl.CommandFailed:
        log('etcdctl.register failed, will retry')
        status.waiting('Waiting to retry etcd registration')
        return

    render_config(bag)
    host.service_restart(bag.etcd_daemon)
    open_port(bag.port)
    set_state('etcd.registered')
@when('etcd.ssl.placed')
@when('leadership.is_leader')
@when_not('etcd.leader.configured')
@when_not('etcd.installed')
@when_not('upgrade.series.in-progress')
def initialize_new_leader():
    ''' Create an initial cluster string to bring up a single member cluster
    of etcd, and set the leadership data so the followers can join this one.

    If the freshly started cluster reports itself unhealthy the unit is
    blocked and neither leadership data nor the configured/registered states
    are set, so the handler remains eligible to run again.
    '''
    bag = EtcdDatabag()
    # NOTE(review): self-assignment kept from the original; presumably it
    # pins the token on the instance so it shows up in bag.__dict__ for
    # the template -- confirm against EtcdDatabag.
    bag.token = bag.token
    bag.set_cluster_state('new')
    address = get_ingress_address('cluster')
    cluster_connection_string = get_connection_string([address],
                                                      bag.management_port)
    bag.set_cluster("{}={}".format(bag.unit_name, cluster_connection_string))

    render_config(bag)
    host.service_restart(bag.etcd_daemon)

    # sorry, some hosts need this. The charm races with systemd and wins.
    time.sleep(2)

    # Check health status before we say we are good. The result is kept in
    # `health` (not `status`) so the imported `status` module stays usable,
    # and the 'status' entry is inspected just as check_cluster_health does.
    etcdctl = EtcdCtl()
    health = etcdctl.cluster_health()
    if 'unhealthy' in health['status']:
        status.blocked('Cluster not healthy.')
        return

    # We have a healthy leader, broadcast initial data-points for followers
    open_port(bag.port)
    leader_connection_string = get_connection_string([address],
                                                     bag.port)
    leader_set({'leader_address': leader_connection_string,
                'cluster': bag.cluster})

    # set registered state since if we ever become a follower, we will not
    # need to re-register
    set_state('etcd.registered')

    # finish bootstrap delta and set configured state
    set_state('etcd.leader.configured')
@when('snap.installed.etcd')
@when('snap.refresh.set')
@when('leadership.is_leader')
def process_snapd_timer():
    ''' Set the snapd refresh timer on the leader so all cluster members
    (present and future) will refresh near the same time. '''

    def core_refresh_timer():
        # Current refresh.timer of the core snap as stripped text.
        raw = snap.get(snapname='core', key='refresh.timer')
        return raw.decode('utf-8').strip()

    # layer-snap has set the timer when 'snap.refresh.set' is present.
    timer = core_refresh_timer()
    if not timer:
        # The core snap timer is empty. This likely means a subordinate
        # timer reset ours. Restore a previously leader-set value, falling
        # back to config if needed.
        fallback = (leader_get('snapd_refresh')
                    or hookenv.config('snapd_refresh'))
        snap.set_refresh_timer(fallback)
        # Re-read the timer known by snapd (it may differ from config).
        timer = core_refresh_timer()

    # data_changed is true the first time through; afterwards leader data
    # is only updated when the timer actually changed.
    if data_changed('etcd_snapd_refresh', timer):
        log('setting snapd_refresh timer to: {}'.format(timer))
        leader_set({'snapd_refresh': timer})
@when('snap.installed.etcd')
@when('snap.refresh.set')
@when('leadership.changed.snapd_refresh')
@when_not('leadership.is_leader')
def set_snapd_timer():
    ''' Apply the leader's snapd refresh.timer on a non-leader member. '''
    # NB: gating on 'snap.refresh.set' ensures layer-snap has finished
    # setting its own core refresh.timer before we overwrite it with the
    # leader's value.
    leader_timer = leader_get('snapd_refresh')
    if not leader_timer:
        leader_timer = ''  # None will cause error
    log('setting snapd_refresh timer to: {}'.format(leader_timer))
    snap.set_refresh_timer(leader_timer)
@when('tls_client.ca.saved', 'tls_client.server.key.saved',
      'tls_client.server.certificate.saved',
      'tls_client.client.certificate.saved')
@when_not('etcd.ssl.placed')
def tls_state_control():
    ''' Condense the individual TLS "saved" flags into the single
    'etcd.ssl.placed' state that the secure setup gates on.

    The etcd conf dir is handed to root:ubuntu first; if the directory does
    not exist yet the state is not set.
    '''
    conf_dir = EtcdDatabag().etcd_conf_dir
    if not os.path.isdir(conf_dir):
        hookenv.log('Waiting for etcd conf creation.')
        return

    check_call(['chown', '-R', 'root:ubuntu', conf_dir])
    set_state('etcd.ssl.placed')
@when('etcd.ssl.placed')
@when_any('tls_client.ca.written',
          'tls_client.server.certificate.written',
          'tls_client.client.certificate.written')
@when_not('upgrade.series.in-progress')
def tls_update():
    ''' React to rewritten TLS data: re-render the config, restart the
    service and reset the TLS states.
    '''
    # ensure config is updated with new certs and service restarted
    bag = EtcdDatabag()
    render_config(bag)
    host.service_restart(bag.etcd_daemon)

    # Dropping 'etcd.ssl.placed' ensures that certs are re-echoed to the db
    # relations; the "written" flags are consumed alongside it.
    for consumed in ('etcd.ssl.placed',
                     'tls_client.ca.written',
                     'tls_client.server.certificate.written',
                     'tls_client.client.certificate.written'):
        remove_state(consumed)
@when('snap.installed.etcd')
@when_not('etcd.ssl.exported')
def render_default_user_ssl_exports():
    ''' Add secure credentials to default user environment configs,
    transparently adding TLS.

    etcdctl 3.3 and newer get the ETCDCTL_KEY/CERT/CACERT variables; older
    versions get the *_FILE variants. Nothing is written, and the
    'etcd.ssl.exported' state is not set, when the etcd version cannot be
    determined or parsed.
    '''
    opts = layer.options('tls-client')
    ca_path = opts['ca_certificate_path']
    client_crt = opts['client_certificate_path']
    client_key = opts['client_key_path']

    etcd_ver = etcd_version()
    version_parts = etcd_ver.split('.')
    try:
        # Only major and minor matter, so versions with more or fewer than
        # three components are fine as long as the first two are numeric.
        major_minor = (int(version_parts[0]), int(version_parts[1]))
    except (IndexError, ValueError):
        # Covers the 'n/a' sentinel as well as any unparseable version.
        hookenv.log('Unable to determine version format for etcd SSL config',
                    level=hookenv.ERROR)
        return

    # Tuple comparison, so that e.g. 4.0 correctly counts as newer than 3.3.
    if major_minor >= (3, 3):
        evars = [
            'export ETCDCTL_KEY={}\n'.format(client_key),
            'export ETCDCTL_CERT={}\n'.format(client_crt),
            'export ETCDCTL_CACERT={}\n'.format(ca_path)
        ]
    else:
        evars = [
            'export ETCDCTL_KEY_FILE={}\n'.format(client_key),
            'export ETCDCTL_CERT_FILE={}\n'.format(client_crt),
            'export ETCDCTL_CA_FILE={}\n'.format(ca_path)
        ]

    # Both files are overwritten, not appended to.
    for aliases_path in ('/home/ubuntu/.bash_aliases', '/root/.bash_aliases'):
        with open(aliases_path, 'w') as fp:
            fp.writelines(evars)

    set_state('etcd.ssl.exported')
def force_rejoin():
    '''Wipe local data and rejoin the new cluster formed by the leader unit

    This action is required if the leader unit performed a snapshot restore.
    All other members must remove their local data and previous cluster
    identities and join the newly formed, restored, cluster.

    Registration is attempted up to 11 times; the function returns quietly
    if none of the attempts sets 'etcd.registered'.
    '''
    log('Wiping local storage and rejoining cluster')
    bag = EtcdDatabag()
    host.service_stop(bag.etcd_daemon)
    clear_flag('etcd.registered')

    member_dir = os.path.join(bag.storage_path(), 'member')
    if os.path.exists(member_dir):
        shutil.rmtree(member_dir)

    attempts_left = 11
    while attempts_left > 0:
        attempts_left -= 1
        # We need randomized back-off timer because only one unit can be
        # joining at the same time
        time.sleep(random.randint(1, 10))
        register_node_with_leader(None)
        if is_flag_set('etcd.registered'):
            log('Successfully rejoined the cluster')
            return
@when('leadership.changed.force_rejoin')
@when_not('leadership.is_leader')
def force_rejoin_requested():
    ''' Rejoin the cluster on a follower when the leader's force_rejoin
    leadership value changes, then refresh the reported health. '''
    force_rejoin()
    check_cluster_health()
@hook('cluster-relation-broken')
def perform_self_unregistration(cluster=None):
    ''' Attempt self removal during unit teardown.

    The unit is looked up in the etcd member list under its juju unit name
    with the '/' removed. If it is not listed there is nothing to
    unregister, and the hook logs that and returns instead of failing with
    a KeyError. The ``cluster`` argument is not used by the body.
    '''
    etcdctl = EtcdCtl()
    leader_address = leader_get('leader_address')
    # Default to '' so a missing environment variable cannot raise here.
    unit_name = os.getenv('JUJU_UNIT_NAME', '').replace('/', '')
    members = etcdctl.member_list()

    member = members.get(unit_name)
    if member is None:
        log('Unit {} not found in etcd member list, nothing to '
            'unregister'.format(unit_name))
        return

    # Self Unregistration
    etcdctl.unregister(member['unit_id'], leader_address)
@hook('data-storage-attached')
def format_and_mount_storage():
    ''' This allows users to request persistent volumes from the cloud
    provider for the purposes of disaster recovery.

    The attached block device is formatted as ext4, mounted one level above
    the etcd data directory, seeded with any data found at the layer's
    default data directory, and recorded in /etc/fstab. A device that is
    already mounted is left untouched.
    '''
    set_state('data.volume.attached')
    # Query juju for the information about the block storage
    device_info = storage_get()
    block = device_info['location']
    bag = EtcdDatabag()
    bag.cluster = leader_get('cluster')
    # the databag has behavior that keeps the path updated.
    # Reference the default path from layer_options.
    etcd_opts = layer.options('etcd')
    # Split the tail of the path to mount the volume 1 level before
    # the data directory.
    tail = os.path.split(bag.etcd_data_dir)[0]

    if volume_is_mounted(block):
        hookenv.log('Device is already attached to the system.')
        hookenv.log('Refusing to take action against {}'.format(block))
        return

    # Format the device in non-interactive mode
    cmd = ['mkfs.ext4', block, '-F']
    hookenv.log('Creating filesystem on {}'.format(block))
    hookenv.log('With command: {}'.format(' '.join(cmd)))
    check_call(cmd)

    # halt etcd to perform the data-store migration
    host.service_stop(bag.etcd_daemon)

    os.makedirs(tail, exist_ok=True)
    mount_volume(block, tail)
    # handle first run during early-attach storage, pre-config-changed hook.
    os.makedirs(bag.etcd_data_dir, exist_ok=True)

    # Only attempt migration if directory exists
    if os.path.isdir(etcd_opts['etcd_data_dir']):
        migrate_path = "{}/".format(etcd_opts['etcd_data_dir'])
        output_path = "{}/".format(bag.etcd_data_dir)
        cmd = ['rsync', '-azp', migrate_path, output_path]
        hookenv.log('Detected existing data, migrating to new location.')
        hookenv.log('With command: {}'.format(' '.join(cmd)))
        check_call(cmd)

    with open('/etc/fstab', 'r') as fp:
        contents = fp.readlines()

    # if device not in fstab, append so it persists through reboots
    if not any(block in line for line in contents):
        # The entry must be newline-terminated, and must start on a line of
        # its own even when the existing file lacks a trailing newline,
        # otherwise it would be merged into a neighbouring entry.
        entry = "{0} {1} ext4 defaults 0 0\n".format(block, tail)
        if contents and not contents[-1].endswith('\n'):
            entry = '\n' + entry
        with open('/etc/fstab', 'a') as fp:
            fp.write(entry)

    # Finally re-render the configuration and resume operation
    render_config(bag)
    host.service_restart(bag.etcd_daemon)
def read_tls_cert(cert):
    ''' Return the text of the layer-configured TLS file named by cert.

    cert is one of 'ca.crt', 'server.crt', 'server.key', 'client.crt' or
    'client.key'; any other name raises ValueError.
    '''
    # Load the layer options for configured paths
    opts = layer.options('tls-client')
    option_names = (
        ('ca.crt', 'ca_certificate_path'),
        ('server.crt', 'server_certificate_path'),
        ('server.key', 'server_key_path'),
        ('client.crt', 'client_certificate_path'),
        ('client.key', 'client_key_path'),
    )
    # Map each known certificate name to its configured path
    cert_paths = {name: opts[option] for name, option in option_names}

    if cert not in cert_paths:
        raise ValueError('No known certificate {}'.format(cert))

    with open(cert_paths[cert], 'r') as fp:
        return fp.read()
@when('nrpe-external-master.available')
@when_not('nrpe-external-master.initial-config')
def initial_nrpe_config(nagios=None):
    ''' Mark the initial NRPE config as done and write the NRPE checks. '''
    initial_flag = 'nrpe-external-master.initial-config'
    set_state(initial_flag)
    update_nrpe_config(nagios)
@when_any('config.changed.nagios_context',
          'config.changed.nagios_servicegroups')
def force_update_nrpe_config():
    ''' Clear 'etcd.nrpe.configured' when the nagios config changes. '''
    configured_flag = 'etcd.nrpe.configured'
    remove_state(configured_flag)
@when('etcd.installed')
@when('nrpe-external-master.available')
@when_not('etcd.nrpe.configured')
def update_nrpe_config(unused=None):
    ''' Write the NRPE checks for etcd: a service check for the etcd
    systemd unit and an alarm check fed by a cron-populated cache file. '''
    # List of systemd services that will be checked
    services = ('snap.etcd.etcd',)

    # The current nrpe-external-master interface doesn't handle a lot of
    # logic, use the charm-helpers code for now.
    hostname = nrpe.get_nagios_hostname()
    current_unit = nrpe.get_nagios_unit_name()
    nrpe_setup = nrpe.NRPE(hostname=hostname, primary=False)

    # add our first check, to alert on service failure
    nrpe.add_init_service_checks(nrpe_setup, services, current_unit)

    # add the cron job to populate the cache for our second check
    # (we cache the output of 'etcdctl alarm list' to minimise overhead)
    with open("templates/check_etcd-alarms.cron") as cron_template:
        write_file(
            path="/etc/cron.d/check_etcd-alarms",
            content=cron_template.read().encode(),
            owner="root",
            perms=0o644,
        )

    # create an empty output file for the above
    write_file(
        path="/var/lib/nagios/etcd-alarm-list.txt",
        content="",
        owner="root",
        perms=0o644,
    )

    # install the NRPE script for the above
    with open("templates/check_etcd-alarms.py") as plugin_template:
        write_file(
            path="/usr/lib/nagios/plugins/check_etcd-alarms.py",
            content=plugin_template.read().encode(),
            owner="root",
            perms=0o755,
        )

    # define our second check, to alert on etcd alarm status
    nrpe_setup.add_check(
        "etcd-alarms",
        "Verify etcd has no raised alarms",
        "/usr/lib/nagios/plugins/check_etcd-alarms.py",
    )

    nrpe_setup.write()
    set_state('etcd.nrpe.configured')
@when_not('nrpe-external-master.available')
@when('nrpe-external-master.initial-config')
def remove_nrpe_config(nagios=None):
    ''' Remove the etcd service checks once the NRPE relation is gone. '''
    remove_state('nrpe-external-master.initial-config')

    # The current nrpe-external-master interface doesn't handle a lot of
    # logic, use the charm-helpers code for now.
    nrpe_setup = nrpe.NRPE(hostname=nrpe.get_nagios_hostname(),
                           primary=False)

    # systemd services for which the checks will be removed
    for shortname in ('snap.etcd.etcd',):
        nrpe_setup.remove_check(shortname=shortname)
def volume_is_mounted(volume):
    ''' Return True when volume appears in the `df -t ext4` output. '''
    df_output = check_output(['df', '-t', 'ext4']).decode('utf-8')
    return volume in df_output
def mount_volume(volume, location):
    ''' Mount the device path volume at location. '''
    hookenv.log("Mounting {0} to {1}".format(volume, location))
    check_call(['mount', volume, location])
def unmount_path(location):
    ''' Unmount whatever is mounted at location. '''
    hookenv.log("Unmounting {0}".format(location))
    check_call(['umount', location])
def close_open_ports():
    ''' Swap the opened client port when the configured port has changed.

    Nothing happens when there is no previous port value or when it equals
    the current one.
    '''
    config = hookenv.config()
    old_port = config.previous('port')
    new_port = config.get('port')

    if old_port is None or old_port == new_port:
        return

    log('The port changed; closing {0} opening {1}'.format(old_port,
                                                           new_port))
    close_port(old_port)
    open_port(new_port)
def install(src, tgt):
    ''' Run the "install" command to copy src to tgt and return the result
    of check_call. '''
    command_line = 'install {} {}'.format(src, tgt)
    argv = split(command_line)
    return check_call(argv)
def render_config(bag=None):
    ''' Render the etcd configuration template matching the etcd version.

    A fresh EtcdDatabag is used when no bag is given. After rendering, the
    client ports are re-synced and the 'etcd.rerender-config' state is
    cleared.
    '''
    bag = bag or EtcdDatabag()

    move_etcd_data_to_standard_location()

    v2_conf_path = "{}/etcd.conf".format(bag.etcd_conf_dir)
    v3_conf_path = "{}/etcd.conf.yml".format(bag.etcd_conf_dir)

    is_v2 = etcd_version().startswith('2.')
    if is_v2:
        # 2.x compatibility
        render('etcd2.conf', v2_conf_path, bag.__dict__, owner='root',
               group='root')
    else:
        # default to 3.x template behavior
        render('etcd3.conf', v3_conf_path, bag.__dict__, owner='root',
               group='root')
        # v3 will fail if the v2 config is left in place
        if os.path.exists(v2_conf_path):
            os.remove(v2_conf_path)

    # Close the previous client port and open the new one.
    close_open_ports()
    remove_state('etcd.rerender-config')
def etcd_version():
    ''' Return the version reported by etcdctl, or 'n/a' when it cannot be
    determined. '''
    etcdctl_bin = '/snap/bin/etcd.etcdctl'
    raw_output = None
    try:
        # try v3
        raw_output = check_output(
            [etcdctl_bin, 'version'],
            env={'ETCDCTL_API': '3'},
        ).decode('utf-8').strip()

        if "No help topic for 'version'" in raw_output:
            # handle v2
            raw_output = check_output(
                [etcdctl_bin, '--version'],
            ).decode('utf-8').strip()

        # "etcdctl version: 3.0.17" or "etcdctl version 2.3.8"
        version_line = next(
            (line for line in raw_output.splitlines()
             if 'etcdctl version' in line),
            None,
        )
        if version_line is not None:
            return version_line.split()[-1]

        hookenv.log('Unable to find etcd version: {}'.format(raw_output),
                    level=hookenv.ERROR)
        return 'n/a'
    except (ValueError, CalledProcessError):
        hookenv.log('Failed to get etcd version:\n'
                    '{}'.format(traceback.format_exc()), level=hookenv.ERROR)
        return 'n/a'
def move_etcd_data_to_standard_location():
    ''' Move etcd data to the standard location if it is not there yet.

    This is necessary when generating new etcd config after etcd has been
    upgraded from version 2.3 to 3.x. Nothing happens when no v3 config
    file exists or when it already points at the desired data directory.
    '''
    bag = EtcdDatabag()
    conf_path = bag.etcd_conf_dir + '/etcd.conf.yml'
    if not os.path.exists(conf_path):
        return

    with open(conf_path) as conf_file:
        conf = yaml.safe_load(conf_file)

    current_dir = conf['data-dir']
    target_dir = bag.etcd_data_dir
    if current_dir == target_dir:
        return

    log('Moving etcd data from %s to %s' % (current_dir, target_dir))
    # The service is stopped while its data files are being moved.
    host.service_stop('snap.etcd.etcd')
    for entry in os.listdir(current_dir):
        os.rename('{}/{}'.format(current_dir, entry),
                  '{}/{}'.format(target_dir, entry))
    os.rmdir(current_dir)

    # Point the existing config at the new location before starting again.
    conf['data-dir'] = target_dir
    with open(conf_path, 'w') as conf_file:
        yaml.dump(conf, conf_file)
    host.service_start('snap.etcd.etcd')