"""Nrpe helpers module."""
import glob
import ipaddress
import os
import socket
import subprocess

from charmhelpers.core import hookenv
from charmhelpers.core.host import is_container
from charmhelpers.core.services import helpers

import yaml


# Module-level flag raised by set_netlinks_error() when the "netlinks" config
# contains a CIDR that could not be parsed; read via has_netlinks_error().
NETLINKS_ERROR = False


class InvalidCustomCheckException(Exception):
    """Custom exception for Invalid nrpe check."""

    pass


class Monitors(dict):
    """List of checks that a remote Nagios can query.

    The structure is ``{"monitors": {"remote": {"nrpe": {...}}}, "version": ...}``.
    """

    def __init__(self, version="0.3"):
        """Build monitors structure."""
        self["monitors"] = {"remote": {"nrpe": {}}}
        self["version"] = version

    def add_monitors(self, mdict, monitor_label="default"):
        """Add monitors passed in mdict.

        mdict -- dict with a "monitors" key holding optional "remote" and
                 "local" sections; falsy or monitor-less input is ignored
        monitor_label -- source label handed to convert_local_checks for
                         the "local" section
        """
        if not mdict or not mdict.get("monitors"):
            return

        incoming = mdict["monitors"]
        remote = self["monitors"]["remote"]

        for checktype, check_details in (incoming.get("remote") or {}).items():
            check_details = check_details or {}
            existing = remote.get(checktype)
            if existing:
                existing.update(check_details)
            else:
                # Store a copy: later update() calls on this entry must not
                # mutate the dict owned by the caller.
                remote[checktype] = dict(check_details)

        # Local checks are converted in a single pass; convert_local_checks
        # already walks every check type of the "local" section.
        local_checks = incoming.get("local")
        if local_checks:
            remote["nrpe"].update(
                self.convert_local_checks(local_checks, monitor_label)
            )

    def add_nrpe_check(self, check_name, command):
        """Add nrpe check to remote monitors."""
        self["monitors"]["remote"]["nrpe"][check_name] = command

    def convert_local_checks(self, monitors, monitor_src):
        """Convert check from local checks to remote nrpe checks.

        monitors -- monitor dict
        monitor_src -- Monitor source principal, subordinate or user

        Returns a dict mapping each generated command name to
        ``{"command": <command name>}``. Checks that raise
        InvalidCustomCheckException are logged and skipped.
        """
        mons = {}
        for checktype, checks in monitors.items():
            for checkname in checks:
                try:
                    check_def = NRPECheckCtxt(
                        checktype,
                        checks[checkname],
                        monitor_src,
                    )
                    mons[check_def["cmd_name"]] = {"command": check_def["cmd_name"]}
                except InvalidCustomCheckException as e:
                    hookenv.log(
                        "Error encountered configuring local check "
                        '"{check}": {err}'.format(check=checkname, err=str(e)),
                        hookenv.ERROR,
                    )
        return mons


def get_ingress_address(binding, external=False):
    """Get ingress IP address for a binding.

    Returns a local IP address for incoming requests to NRPE, or None when
    no address could be determined.

    :param binding: name of the binding, e.g. 'monitors'
    :param external: bool, if True return the public address if charm config
                     requests otherwise return the local address which would
                     be used for incoming nrpe requests.
    """
    # using network-get to retrieve the address details if available.
    hookenv.log("Getting ingress IP address for binding %s" % binding)
    if hookenv.config("nagios_address_type").lower() == "public" and external:
        return hookenv.unit_get("public-address")

    ip_address = None
    try:
        network_info = hookenv.network_get(binding)
        # NOTE(review): the guard tests "ingress-addresses" while the value is
        # read from "bind-addresses"; kept as-is, confirm this is intended.
        if network_info is not None and "ingress-addresses" in network_info:
            try:
                ip_address = network_info["bind-addresses"][0]["addresses"][0][
                    "address"
                ]
                hookenv.log("Using ingress-addresses, found %s" % ip_address)
            except (KeyError, IndexError):
                # Missing keys or empty address lists both mean the bind
                # address is unavailable; fall back to the primary address.
                hookenv.log("Using primary-addresses")
                ip_address = hookenv.network_get_primary_address(binding)

    except (NotImplementedError, FileNotFoundError) as e:
        hookenv.log(
            "Unable to determine inbound IP address for binding {} with {}".format(
                binding, e
            ),
            level=hookenv.ERROR,
        )

    return ip_address


class MonitorsRelation(helpers.RelationContext):
    """Define a monitors relation."""

    name = "monitors"
    interface = "monitors"

    def __init__(self, *args, **kwargs):
        """Build superclass and principal relation."""
        # The principal relation is created first so that it is available to
        # anything the superclass constructor calls on this instance.
        self.principal_relation = PrincipalRelation()
        super().__init__(*args, **kwargs)

    def is_ready(self):
        """Return true if the principal relation is ready."""
        return self.principal_relation.is_ready()

    def get_subordinate_monitors(self):
        """Return default monitors defined by this charm.

        Checks whose "cmd_params" is empty are treated as disabled and are
        not added.
        """
        monitors = Monitors()
        for check in SubordinateCheckDefinitions()["checks"]:
            if check["cmd_params"]:
                monitors.add_nrpe_check(check["cmd_name"], check["cmd_name"])
        return monitors

    def get_user_defined_monitors(self):
        """Return monitors defined by monitors config option."""
        monitors = Monitors()
        # An unset option is parsed as an empty document (None), which
        # add_monitors ignores.
        user_config = hookenv.config("monitors") or ""
        monitors.add_monitors(yaml.safe_load(user_config), "user")
        return monitors

    def get_principal_monitors(self):
        """Return monitors passed by relation with principal."""
        return self.principal_relation.get_monitors()

    def get_monitor_dicts(self):
        """Return all monitor dicts.

        The result is keyed by source: "principal", "subordinate" and "user".
        """
        monitor_dicts = {
            "principal": self.get_principal_monitors(),
            "subordinate": self.get_subordinate_monitors(),
            "user": self.get_user_defined_monitors(),
        }
        return monitor_dicts

    def get_monitors(self):
        """Return monitor dict.

        All monitors merged together and local monitors converted to remote
        nrpe checks.
        """
        all_monitors = Monitors()
        monitors = [
            self.get_principal_monitors(),
            self.get_subordinate_monitors(),
            self.get_user_defined_monitors(),
        ]
        for mon in monitors:
            # Sources that returned None are skipped by add_monitors.
            all_monitors.add_monitors(mon)
        return all_monitors

    def egress_subnets(self, relation_data):
        """Return egress subnets.

        This behaves the same as charmhelpers.core.hookenv.egress_subnets().
        If it can't determine the egress subnets it will fall back to
        ingress-address or finally private-address.
        """
        if "egress-subnets" in relation_data:
            return relation_data["egress-subnets"]
        if "ingress-address" in relation_data:
            return relation_data["ingress-address"]
        return relation_data["private-address"]

    def get_data(self):
        """Get relation data.

        Also sets "monitor_allowed_hosts" to the comma-joined egress subnets
        of every unit on the 'monitors' relation.
        """
        super().get_data()
        if not hookenv.relation_ids(self.name):
            return
        # self['monitors'] comes from the superclass helpers.RelationContext
        # and contains relation data for each 'monitors' relation (to/from
        # Nagios).
        subnets = [self.egress_subnets(info) for info in self["monitors"]]
        self["monitor_allowed_hosts"] = ",".join(subnets)

    def provide_data(self):
        """Return relation info."""
        # get the address to send to Nagios for host definition
        address = get_ingress_address("monitors", external=True)

        relation_info = {
            "target-id": self.principal_relation.nagios_hostname(),
            "monitors": self.get_monitors(),
            "private-address": address,
            "ingress-address": address,
            "target-address": address,
            "machine_id": os.environ["JUJU_MACHINE_ID"],
            "model_id": hookenv.model_uuid(),
        }

        return relation_info


class PrincipalRelation(helpers.RelationContext):
    """Define a principal relation."""

    def __init__(self, *args, **kwargs):
        """Set name and interface.

        The first relation type that has related units wins, in the order
        nrpe-external-master, general-info, local-monitors.
        """
        if hookenv.relations_of_type("nrpe-external-master"):
            self.name = "nrpe-external-master"
            self.interface = "nrpe-external-master"
        elif hookenv.relations_of_type("general-info"):
            self.name = "general-info"
            self.interface = "juju-info"
        elif hookenv.relations_of_type("local-monitors"):
            self.name = "local-monitors"
            self.interface = "local-monitors"
        super().__init__(*args, **kwargs)

    def is_ready(self):
        """Return true if the relation is connected."""
        # Guard against a missing or empty unit list so that indexing the
        # first unit can never raise IndexError.
        units = self.get(self.name)
        if not units:
            return False
        return "__unit__" in units[0]

    def nagios_hostname(self):
        """Return the string that nagios will use to identify this host."""
        host_context = hookenv.config("nagios_host_context")
        if host_context:
            host_context += "-"
        hostname_type = hookenv.config("nagios_hostname_type")

        # Detect bare metal hosts: output containing "none" from
        # systemd-detect-virt is treated as bare metal.
        if hostname_type == "auto":
            is_metal = "none" in subprocess.getoutput("/usr/bin/systemd-detect-virt")
            hostname_type = "host" if is_metal else "unit"

        if hostname_type == "host" or not self.is_ready():
            return "{}{}".format(host_context, socket.gethostname())

        principal_unitname = hookenv.principal_unit()
        # Fallback to using "primary" if it exists.
        if not principal_unitname:
            for relunit in self[self.name]:
                if relunit.get("primary", "False").lower() == "true":
                    principal_unitname = relunit["__unit__"]
                    break
        nagios_hostname = "{}{}".format(host_context, principal_unitname)
        # Unit names contain "/", which is replaced by "-" in the hostname.
        return nagios_hostname.replace("/", "-")

    def get_monitors(self):
        """Return monitors passed by services on the self.interface relation.

        Returns None when the relation is not ready.
        """
        if not self.is_ready():
            return None
        monitors = Monitors()
        for rel in self[self.name]:
            if rel.get("monitors"):
                # Relation data is supplied by other units: parse it with the
                # safe loader only (plain yaml.load can construct arbitrary
                # objects and requires an explicit Loader on recent PyYAML).
                monitors.add_monitors(yaml.safe_load(rel["monitors"]), "principal")
        return monitors

    def provide_data(self):
        """Return nagios hostname and nagios host context."""
        # Provide this data to principals because get_nagios_hostname expects
        # them in charmhelpers/contrib/charmsupport/nrpe when writing principal
        # service__* files
        return {
            "nagios_hostname": self.nagios_hostname(),
            "nagios_host_context": hookenv.config("nagios_host_context"),
        }


class NagiosInfo(dict):
    """Define a NagiosInfo dict."""

    def __init__(self):
        """Set principal relation and dict values."""
        self.principal_relation = PrincipalRelation()
        # 127.0.0.1 is always listed; the configured master is appended
        # unless the option holds the literal string "None".
        self["external_nagios_master"] = "127.0.0.1"
        if hookenv.config("nagios_master") != "None":
            self["external_nagios_master"] = "{},{}".format(
                self["external_nagios_master"], hookenv.config("nagios_master")
            )
        self["nagios_hostname"] = self.principal_relation.nagios_hostname()

        # export_host.cfg.tmpl host definition for Nagios
        self["nagios_ipaddress"] = get_ingress_address("monitors", external=True)
        # Address configured for NRPE to listen on
        self["nrpe_ipaddress"] = get_ingress_address("monitors")

        self["dont_blame_nrpe"] = "1" if hookenv.config("dont_blame_nrpe") else "0"
        self["debug"] = "1" if hookenv.config("debug") else "0"


class RsyncEnabled(helpers.RelationContext):
    """Define a relation context for rsync enabled relation."""

    def __init__(self):
        """Set export_nagios_definitions.

        The value comes from the export_nagios_definitions option and is
        forced to True when a nagios_master other than "None" is configured.
        """
        self["export_nagios_definitions"] = hookenv.config("export_nagios_definitions")
        if (
            hookenv.config("nagios_master")
            and hookenv.config("nagios_master") != "None"
        ):
            self["export_nagios_definitions"] = True

    def is_ready(self):
        """Return true if relation is ready."""
        return self["export_nagios_definitions"]


class NRPECheckCtxt(dict):
    """Convert a local monitor definition.

    Create a dict needed for writing the nrpe check definition.
    """

    def __init__(self, checktype, check_opts, monitor_src):
        """Set dict values.

        checktype -- one of "procrunning", "processcount", "disk", "custom"
        check_opts -- options dict of the check; required keys depend on
                      checktype
        monitor_src -- source label appended to description and cmd_name

        Raises InvalidCustomCheckException when a custom check names a
        plugin_path that is not an existing directory.
        """
        plugin_path = "/usr/lib/nagios/plugins"
        if checktype == "procrunning":
            self["cmd_exec"] = plugin_path + "/check_procs"
            self["description"] = "Check process {executable} is running".format(
                **check_opts
            )
            self["cmd_name"] = "check_proc_" + check_opts["executable"]
            self["cmd_params"] = "-w {min} -c {max} -C {executable}".format(
                **check_opts
            )
        elif checktype == "processcount":
            self["cmd_exec"] = plugin_path + "/check_procs"
            self["description"] = "Check process count"
            self["cmd_name"] = "check_proc_principal"
            if "min" in check_opts:
                self["cmd_params"] = "-w {min} -c {max}".format(**check_opts)
            else:
                self["cmd_params"] = "-c {max}".format(**check_opts)
        elif checktype == "disk":
            self["cmd_exec"] = plugin_path + "/check_disk"
            self["description"] = "Check disk usage " + check_opts["path"].replace(
                "/", "_"
            )
            self["cmd_name"] = "check_disk_principal"
            self["cmd_params"] = "-w 20 -c 10 -p " + check_opts["path"]
        elif checktype == "custom":
            custom_path = check_opts.get("plugin_path", plugin_path)
            # Relative plugin paths are anchored at the filesystem root.
            if not custom_path.startswith(os.path.sep):
                custom_path = os.path.join(os.path.sep, custom_path)
            if not os.path.isdir(custom_path):
                raise InvalidCustomCheckException(
                    'Specified plugin_path "{}" does not exist or is not a '
                    "directory.".format(custom_path)
                )
            check = check_opts["check"]
            self["cmd_exec"] = os.path.join(custom_path, check)
            self["description"] = check_opts.get("desc", "Check %s" % check)
            self["cmd_name"] = check
            self["cmd_params"] = check_opts.get("params", "") or ""
        self["description"] += " ({})".format(monitor_src)
        self["cmd_name"] += "_" + monitor_src


class SubordinateCheckDefinitions(dict):
    """Return dict of checks the charm configures.

    self["checks"] is a list of dicts with the keys "description",
    "cmd_name", "cmd_exec", "cmd_params" and "matching_files".
    """

    def __init__(self):
        """Set dict values."""
        self.procs = self.proc_count()
        load_thresholds = self._get_load_thresholds()
        proc_thresholds = self._get_proc_thresholds()
        disk_root_thresholds = self._get_disk_root_thresholds()

        pkg_plugin_dir = "/usr/lib/nagios/plugins/"
        local_plugin_dir = "/usr/local/lib/nagios/plugins/"
        checks = [
            {
                "description": "Number of Zombie processes",
                "cmd_name": "check_zombie_procs",
                "cmd_exec": pkg_plugin_dir + "check_procs",
                "cmd_params": hookenv.config("zombies"),
            },
            {
                "description": "Number of processes",
                "cmd_name": "check_total_procs",
                "cmd_exec": pkg_plugin_dir + "check_procs",
                "cmd_params": proc_thresholds,
            },
            {
                "description": "Number of Users",
                "cmd_name": "check_users",
                "cmd_exec": pkg_plugin_dir + "check_users",
                "cmd_params": hookenv.config("users"),
            },
            {
                "description": "Connnection tracking table",
                "cmd_name": "check_conntrack",
                "cmd_exec": local_plugin_dir + "check_conntrack.sh",
                "cmd_params": hookenv.config("conntrack"),
            },
        ]

        # The checks below are only configured when not running in a container.
        if not is_container():
            checks.extend(
                [
                    {
                        "description": "Root disk",
                        "cmd_name": "check_disk_root",
                        "cmd_exec": pkg_plugin_dir + "check_disk",
                        "cmd_params": disk_root_thresholds,
                    },
                    {
                        "description": "System Load",
                        "cmd_name": "check_load",
                        "cmd_exec": pkg_plugin_dir + "check_load",
                        "cmd_params": load_thresholds,
                    },
                    {
                        "description": "Swap",
                        "cmd_name": "check_swap",
                        "cmd_exec": pkg_plugin_dir + "check_swap",
                        "cmd_params": hookenv.config("swap").strip(),
                    },
                    # Note: check_swap_activity *must* be listed after check_swap,
                    # else check_swap_activity will be removed during installation
                    # of check_swap.
                    {
                        "description": "Swap Activity",
                        "cmd_name": "check_swap_activity",
                        "cmd_exec": local_plugin_dir + "check_swap_activity",
                        "cmd_params": hookenv.config("swap_activity"),
                    },
                    {
                        "description": "Memory",
                        "cmd_name": "check_mem",
                        "cmd_exec": local_plugin_dir + "check_mem.pl",
                        "cmd_params": hookenv.config("mem"),
                    },
                    {
                        "description": "XFS Errors",
                        "cmd_name": "check_xfs_errors",
                        "cmd_exec": local_plugin_dir + "check_xfs_errors.py",
                        "cmd_params": hookenv.config("xfs_errors"),
                    },
                    {
                        "description": "ARP cache entries",
                        "cmd_name": "check_arp_cache",
                        "cmd_exec": os.path.join(
                            local_plugin_dir, "check_arp_cache.py"
                        ),
                        "cmd_params": "-w 60 -c 80",
                    },
                ]
            )

            ro_filesystem_excludes = hookenv.config("ro_filesystem_excludes")
            if ro_filesystem_excludes == "":
                # specify cmd_params = '' to disable/remove the check from nrpe
                ro_filesystem_params = ""
            else:
                ro_filesystem_params = "-e {}".format(ro_filesystem_excludes)
            checks.append(
                {
                    "description": "Readonly filesystems",
                    "cmd_name": "check_ro_filesystem",
                    "cmd_exec": os.path.join(
                        local_plugin_dir, "check_ro_filesystem.py"
                    ),
                    "cmd_params": ro_filesystem_params,
                }
            )

        # One LACP check per configured bond that exists under /sys/class/net.
        lacp_bonds = hookenv.config("lacp_bonds").strip()
        if lacp_bonds:
            for bond_iface in lacp_bonds.split():
                if os.path.exists("/sys/class/net/{}".format(bond_iface)):
                    checks.append(
                        {
                            "description": "LACP Check {}".format(bond_iface),
                            "cmd_name": "check_lacp_{}".format(bond_iface),
                            "cmd_exec": local_plugin_dir + "check_lacp_bond.py",
                            "cmd_params": "-i {}".format(bond_iface),
                        }
                    )

        # One netlinks check per interface resolved by parse_netlinks.
        netlinks = hookenv.config("netlinks")
        if netlinks:
            ifaces = yaml.safe_load(netlinks)
            cmd_exec = local_plugin_dir + "check_netlinks.py"
            if hookenv.config("netlinks_skip_unfound_ifaces"):
                cmd_exec += " --skip-unfound-ifaces"
            d_ifaces = self.parse_netlinks(ifaces)
            for iface, iface_params in d_ifaces.items():
                checks.append(
                    {
                        "description": "Netlinks status ({})".format(iface),
                        "cmd_name": "check_netlinks_{}".format(iface),
                        "cmd_exec": cmd_exec,
                        "cmd_params": iface_params,
                    }
                )

        # Checking if CPU governor is supported by the system and add nrpe check
        cpu_governor_paths = "/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor"
        cpu_governor_supported = glob.glob(cpu_governor_paths)
        requested_cpu_governor = hookenv.relation_get("requested_cpu_governor")
        cpu_governor_config = hookenv.config("cpu_governor")
        # The charm config value takes precedence over the relation value.
        wanted_cpu_governor = cpu_governor_config or requested_cpu_governor
        if wanted_cpu_governor and cpu_governor_supported:
            checks.append(
                {
                    "description": "Check CPU governor scaler",
                    "cmd_name": "check_cpu_governor",
                    "cmd_exec": local_plugin_dir + "check_cpu_governor.py",
                    "cmd_params": "--governor {}".format(wanted_cpu_governor),
                }
            )

        self["checks"] = []
        sub_postfix = str(hookenv.config("sub_postfix"))
        # Automatically use _sub for checks shipped on a unit with the nagios
        # charm. Mostly for backwards compatibility.
        principal_unit = hookenv.principal_unit()
        if sub_postfix == "" and principal_unit:
            md = hookenv._metadata_unit(principal_unit)
            if md and md.pop("name", None) == "nagios":
                sub_postfix = "_sub"
        nrpe_config_sub_tmpl = "/etc/nagios/nrpe.d/{}_*.cfg"
        nrpe_config_tmpl = "/etc/nagios/nrpe.d/{}.cfg"
        for check in checks:
            # This can be used to clean up old files before rendering the new
            # ones
            nrpe_configfiles_sub = nrpe_config_sub_tmpl.format(check["cmd_name"])
            nrpe_configfiles = nrpe_config_tmpl.format(check["cmd_name"])
            check["matching_files"] = glob.glob(nrpe_configfiles_sub)
            check["matching_files"].extend(glob.glob(nrpe_configfiles))
            check["description"] += " (sub)"
            check["cmd_name"] += sub_postfix
            self["checks"].append(check)

    def _get_proc_thresholds(self):
        """Return suitable processor thresholds.

        With procs set to "auto" the thresholds scale with self.procs;
        otherwise the configured value is returned unchanged.
        """
        if hookenv.config("procs") == "auto":
            proc_thresholds = "-k -w {} -c {}".format(
                25 * self.procs + 100, 50 * self.procs + 100
            )
        else:
            proc_thresholds = hookenv.config("procs")
        return proc_thresholds

    def _get_load_thresholds(self):
        """Return suitable load thresholds.

        With load set to "auto" the thresholds scale with self.procs;
        otherwise the configured value is returned unchanged.
        """
        if hookenv.config("load") == "auto":
            # Give 1min load alerts higher thresholds than 15 min load alerts
            warn_multipliers = (4, 2, 1)
            crit_multipliers = (8, 4, 2)
            load_thresholds = ("-w %s -c %s") % (
                ",".join([str(m * self.procs) for m in warn_multipliers]),
                ",".join([str(m * self.procs) for m in crit_multipliers]),
            )
        else:
            load_thresholds = hookenv.config("load")
        return load_thresholds

    def _get_disk_root_thresholds(self):
        """Return suitable disk thresholds.

        An empty string is returned when disk_root is unset.
        """
        if hookenv.config("disk_root"):
            disk_root_thresholds = hookenv.config("disk_root") + " -p / "
        else:
            disk_root_thresholds = ""
        return disk_root_thresholds

    def proc_count(self):
        """Return number of processing units as reported by ``nproc --all``."""
        return int(subprocess.check_output(["nproc", "--all"]))

    def parse_netlinks(self, ifaces):
        """Parse a list of strings, or a single string.

        Looks if the interfaces exist and configures extra parameters (or
        properties) -> ie. ['mtu:9000', 'speed:1000', 'op:up']

        Each entry is "<target> [key:value ...]" where <target> is either a
        CIDR (matched against local adapters) or an interface name.

        Returns a dict mapping each existing interface name to its
        check_netlinks parameter string.
        """
        iface_path = "/sys/class/net/{}"
        props_dict = {"mtu": "-m {}", "speed": "-s {}", "op": "-o {}"}
        if not ifaces:
            # Nothing configured (e.g. YAML parsed to None or an empty list).
            return {}
        if isinstance(ifaces, str):
            ifaces = [ifaces]

        d_ifaces = {}
        for iface in ifaces:
            iface_props = iface.strip().split()
            # no ifaces defined; SKIP
            if not iface_props:
                continue

            target = iface_props[0]
            try:
                matches = match_cidr_to_ifaces(target)
            except Exception as e:
                # Deliberately broad: any failure (invalid CIDR, dependency not
                # yet installed, ...) means the target is handled as a name.
                # Log likely unintentional errors and set flag for blocked
                # status, if appropriate.
                if isinstance(e, ValueError) and "has host bits set" in str(e):
                    hookenv.log(
                        "Error parsing netlinks: {}".format(e),
                        level=hookenv.ERROR,
                    )
                    set_netlinks_error()
                # Treat target as explicit interface name
                matches = [target]

            iface_devs = [
                dev for dev in matches if os.path.exists(iface_path.format(dev))
            ]
            # no ifaces found; SKIP
            if not iface_devs:
                continue

            # parse extra parameters (properties)
            extra_params = ""
            for prop in iface_props[1:]:
                # wrong format (key:value); SKIP
                if ":" not in prop:
                    continue

                # only one ':' expected
                kv = prop.split(":")
                if len(kv) == 2 and kv[0].lower() in props_dict:
                    extra_params += " "
                    extra_params += props_dict[kv[0].lower()].format(kv[1])

            for iface_dev in iface_devs:
                d_ifaces[iface_dev] = "-i {}{}".format(iface_dev, extra_params)

        return d_ifaces


def match_cidr_to_ifaces(cidr):
    """Use CIDR expression to search for matching network adapters.

    Returns a list of adapter names.

    Raises ValueError when cidr is not a valid IPv4 network.
    """
    import netifaces  # Avoid import error before this dependency gets installed

    network = ipaddress.IPv4Network(cidr)
    matches = []
    for adapter in netifaces.interfaces():
        ipv4_addr_structs = netifaces.ifaddresses(adapter).get(netifaces.AF_INET, [])
        addrs = [
            ipaddress.IPv4Address(addr_struct["addr"])
            for addr_struct in ipv4_addr_structs
        ]
        if any(addr in network for addr in addrs):
            matches.append(adapter)
    return matches


def has_netlinks_error():
    """Return True in case of netlinks related errors."""
    return NETLINKS_ERROR


def set_netlinks_error():
    """Set the flag indicating a netlinks related error."""
    global NETLINKS_ERROR
    NETLINKS_ERROR = True