Charmed-Kubernetes/kubernetes-worker/actions/cis-benchmark

#!/usr/local/sbin/charm-env python3
import os
import json
import shlex
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

import charms.layer
import charms.reactive
from charmhelpers.core import hookenv, unitdata
from charmhelpers.fetch.archiveurl import ArchiveUrlFetchHandler
from charms.layer import snap
from charms.reactive import clear_flag, is_flag_set, set_flag


# On-disk locations used by this action. The kube-bench binary and its
# configuration directory both live underneath BENCH_HOME.
BENCH_HOME = "/home/ubuntu/kube-bench"
BENCH_BIN = BENCH_HOME + "/kube-bench"
BENCH_CFG = BENCH_HOME + "/cfg-ck"
# Go package path used when building kube-bench from source.
GO_PKG = "github.com/aquasecurity/kube-bench"
# Directory that receives the full benchmark result logs.
RESULTS_DIR = "/home/ubuntu/kube-bench-results"

# Remediation tables map a failing test number to a 3-tuple describing its fix.
# Conservative fixes will probably leave the cluster in a good state.
# Dangerous fixes will likely break the cluster.
# The first tuple element selects how the fix is applied:
#  {'1.2.3': ('manual', None, None)}             -- no known automatic fix
#  {'1.2.3': ('cli', 'command to run', None)}    -- run a command
#  {'1.2.3': ('kv', 'snap', {cfg_key: value})}   -- store config for a service
CONSERVATIVE = {
    # Placeholder entry.
    "0.0.0": ("cli", 'echo "this is fine"', None),
    # etcd (no known failures with a default install)
    # k8s-control-plane (no known failures with a default install)
    # k8s-worker (no known failures with a default install)
}
ADMISSION_PLUGINS = {
    # NOTE(review): the middle element is one string holding two plugin names
    # separated by a comma, so this tuple has three elements rather than four.
    # That may be unintentional -- confirm against whatever consumes the
    # 'cis-kube-apiserver' configuration before changing it.
    "enable-admission-plugins": (
        "PersistentVolumeLabel",
        "PodSecurityPolicy,AlwaysPullImages",
        "NodeRestriction",
    )
}
DANGEROUS = {
    # Placeholder entry.
    "0.0.0": ("cli", 'echo "this is fine"', None),
    # etcd (no known warnings with a default install)
    # k8s-control-plane
    "1.1.21": ("cli", "chmod -R 600 /root/cdk/*.key", None),
    "1.2.9": ("manual", None, None),
    "1.2.11": ("kv", "kube-apiserver", ADMISSION_PLUGINS),
    "1.2.25": ("manual", None, None),
    "1.2.33": ("manual", None, None),
    "1.2.34": ("manual", None, None),
    # k8s-worker
    "4.2.9": ("kv", "kubelet", {"event-qps": 0}),
    "4.2.13": (
        "kv",
        "kubelet",
        {
            # Stored as a single comma-separated string.
            "tls-cipher-suites": ",".join(
                (
                    "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256",
                    "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256",
                    "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305",
                    "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384",
                    "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305",
                    "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384",
                    "TLS_RSA_WITH_AES_256_GCM_SHA384",
                    "TLS_RSA_WITH_AES_128_GCM_SHA256",
                )
            )
        },
    ),
}


def _fail(msg):
    """Record an action failure and stop the script.

    The message is reported through ``hookenv.action_fail`` and the process
    then exits via ``sys.exit()``, so this function does not return to its
    caller.

    :param: msg: String describing why the action failed
    """
    hookenv.action_fail(msg)
    sys.exit()


def _move_matching_parent(dirpath, filename, dest):
    """Relocate the directory that holds a given file.

    Walks ``dirpath`` and picks the first directory (in ``os.walk`` order)
    that directly contains a file named ``filename``. That directory is moved
    to ``dest``. If no directory contains the file, the action is failed.

    :param: dirpath: String path to search
    :param: filename: String file to find
    :param: dest: String destination of the found parent directory
    """
    # The generator is lazy, so the walk stops at the first match.
    parent = next(
        (folder for folder, _, names in os.walk(dirpath) if filename in names),
        None,
    )
    if parent is None:
        _fail("Could not find {} in {}".format(filename, dirpath))
    else:
        hookenv.log("Moving {} to {}".format(parent, dest))
        shutil.move(parent, dest)


def _restart_charm():
    """Adjust the restart-related flags for this charm and run reactive.

    The charm is identified by substring matches on its name. Depending on the
    match, the relevant "components started" flags are cleared or the worker
    "restart needed" flag is set, and ``charms.reactive.main()`` is then
    invoked so the charm reacts to the change. For etcd nothing is done; an
    unrecognised charm name fails the action.
    """
    charm = hookenv.charm_name() or "unknown"

    if "master" in charm:
        hookenv.log("Restarting master")
        # The app may have been upgraded to the newer flag names, so clear
        # both the old and the new flag.
        for flag in (
            "kubernetes-master.components.started",
            "kubernetes-control-plane.components.started",
        ):
            clear_flag(flag)
    elif "control-plane" in charm:
        hookenv.log("Restarting control-plane")
        clear_flag("kubernetes-control-plane.components.started")
    elif "worker" in charm:
        hookenv.log("Restarting worker")
        set_flag("kubernetes-worker.restart-needed")
    elif "etcd" in charm:
        hookenv.log("No-op: etcd does not need to be restarted")
        return
    else:
        _fail("Unable to determine the charm to restart: {}".format(charm))

    # Run the reactive framework so handlers see the flags changed above.
    charms.layer.import_layer_libs()
    charms.reactive.main()


def install(release, config):
    """Install the kube-bench binary and its configuration files.

    Any existing BENCH_HOME is removed first. The binary is then either built
    from upstream sources with 'go' (installing the 'go' snap when no 'go'
    executable is found) or taken from a release archive. Finally the
    configuration archive is fetched and the directory containing
    'config.yaml' is moved to BENCH_CFG.

    :param: release: Archive URI or 'upstream'
    :param: config: Archive URI of configuration files
    """
    bench_home = Path(BENCH_HOME)
    if bench_home.exists():
        shutil.rmtree(BENCH_HOME)
    fetcher = ArchiveUrlFetchHandler()

    if release != "upstream":
        # Fetch the release URI and put it in the right place.
        release_dir = fetcher.install(source=release)
        # NB: The archive layout is unknown, but the directory holding the
        # 'kube-bench' file is what belongs at BENCH_HOME.
        _move_matching_parent(
            dirpath=release_dir, filename="kube-bench", dest=BENCH_HOME
        )
    else:
        bench_home.mkdir(parents=True, exist_ok=True)

        # Prepare the 'go' environment; also look for 'go' under /snap/bin.
        build_env = os.environ.copy()
        search_path = "{}:/snap/bin".format(build_env["PATH"])
        go_bin = shutil.which("go", path=search_path)
        if not go_bin:
            snap.install("go", channel="stable", classic=True)
            go_bin = "/snap/bin/go"
        go_path = os.getenv("GOPATH", "/var/snap/go/common")
        build_env["GOCACHE"] = os.getenv("GOCACHE", "/var/snap/go/common/cache")
        build_env["GOPATH"] = go_path
        Path(go_path).mkdir(parents=True, exist_ok=True)

        def run_go(command):
            """Run a go command line in GOPATH; fail the action on error."""
            try:
                subprocess.check_call(
                    shlex.split(command), cwd=go_path, env=build_env
                )
            except subprocess.CalledProcessError:
                _fail("Failed to run: {}".format(command))

        # From https://github.com/aquasecurity/kube-bench#installing-from-sources
        run_go(
            "{bin} get {pkg} github.com/golang/dep/cmd/dep".format(
                bin=go_bin, pkg=GO_PKG
            )
        )
        run_go(
            "{bin} build -o {out} {base}/src/{pkg}".format(
                bin=go_bin, out=BENCH_BIN, base=go_path, pkg=GO_PKG
            )
        )

    # Fetch the config URI and put it in the right place.
    config_dir = fetcher.install(source=config)
    # NB: The archive layout is unknown, but the directory holding
    # 'config.yaml' is what belongs at BENCH_CFG.
    _move_matching_parent(dirpath=config_dir, filename="config.yaml", dest=BENCH_CFG)


def _expand_cli_globs(argv):
    """Return a copy of ``argv`` with wildcard arguments expanded.

    Remediation commands are executed without a shell, so a pattern such as
    '/root/cdk/*.key' would otherwise be handed to the program verbatim. Only
    arguments after the program name that contain a wildcard character are
    expanded; a pattern with no matches is kept unchanged.

    :param: argv: List of command tokens, program name first
    :returns: New list with each wildcard argument replaced by its matches
    """
    import glob

    expanded = list(argv[:1])
    for arg in argv[1:]:
        has_wildcard = any(char in arg for char in "*?[")
        matches = sorted(glob.glob(arg)) if has_wildcard else []
        # Keep the original token when it is not a pattern or nothing matched.
        expanded.extend(matches or [arg])
    return expanded


def apply(remediations=None):
    """Apply remediations to address benchmark failures.

    Runs a JSON report, then walks every result whose status is 'fail' or
    'warn'. A remedy is looked up in CONSERVATIVE first and, only when
    ``remediations`` is 'dangerous', in DANGEROUS. 'manual' remedies are
    logged, 'cli' remedies are executed, and 'kv' remedies are merged into the
    'cis-<service>' entry of unitdata.kv(). If at least one 'cli' or 'kv'
    remedy was applied the charm is restarted. A summary is set as action
    output. A failing 'cli' command or an unreadable report fails the action.

    :param: remediations: either 'conservative' or 'dangerous'
    """
    applied_fixes = 0
    danger = remediations == "dangerous"
    db = unitdata.kv()

    json_log = report(log_format="json")
    hookenv.log("Loading JSON from: {}".format(json_log))
    try:
        with open(json_log, "r") as f:
            full_json = json.load(f)
    except Exception:
        # Any problem reading or parsing the report fails the action.
        _fail("Failed to load: {}".format(json_log))

    # Some report layouts wrap the results in a "Controls" list; use its first
    # entry and tolerate an empty or null list.
    if "Controls" in full_json:
        controls = full_json["Controls"] or [{}]
        full_json = controls[0]

    for test in full_json.get("tests") or []:
        for result in test.get("results") or []:
            test_num = result.get("test_number")
            test_remediation = result.get("remediation")
            test_status = result.get("status") or ""

            if test_status.lower() not in ("fail", "warn"):
                continue

            test_remedy = CONSERVATIVE.get(test_num)
            if not test_remedy and danger:
                # no conservative remedy, check dangerous if user wants
                test_remedy = DANGEROUS.get(test_num)
            if not isinstance(test_remedy, tuple):
                hookenv.log("Test {}: remediation is missing".format(test_num))
                continue

            kind = test_remedy[0]
            if kind == "manual":
                # we don't know how to autofix; log remediation text
                hookenv.log(
                    "Test {}: unable to auto-apply remedy.\n"
                    "Manual steps:\n{}".format(test_num, test_remediation)
                )
            elif kind == "cli":
                # No shell is involved, so wildcards must be expanded here.
                cmd = _expand_cli_globs(shlex.split(test_remedy[1]))
                try:
                    out = subprocess.check_output(cmd)
                except subprocess.CalledProcessError:
                    _fail("Test {}: failed to run: {}".format(test_num, cmd))
                else:
                    hookenv.log(
                        "Test {}: applied remedy: {}\n"
                        "Output: {}".format(test_num, cmd, out)
                    )
                    applied_fixes += 1
            elif kind == "kv":
                # Merge the remedy's settings into the stored service config.
                cfg_key = "cis-" + test_remedy[1]
                cfg = db.get(cfg_key) or {}
                cfg.update(test_remedy[2])
                db.set(cfg_key, cfg)

                hookenv.log(
                    "Test {}: updated configuration: {}\n".format(test_num, cfg)
                )
                applied_fixes += 1

    # CLI and KV changes will require a charm restart; do it.
    if applied_fixes > 0:
        _restart_charm()

    msg = (
        'Applied {} remediations. Re-run with "apply=none" to generate a '
        "new report."
    ).format(applied_fixes)
    hookenv.action_set({"summary": msg})


def reset():
    """Remove every remediation stored in unitdata.kv() and restart the charm.

    Individual remediations are not tracked, so all 'cis-' prefixed entries
    this action may have written are unset unconditionally. The charm is then
    restarted and a summary is set as action output.
    """
    db = unitdata.kv()

    services = (
        "kube-apiserver",
        "kube-scheduler",
        "kube-controller-manager",
        "kubelet",
    )
    for service in services:
        db.unset("cis-" + service)
    _restart_charm()

    summary = 'Reset is complete. Re-run with "apply=none" to generate a new report.'
    hookenv.action_set({"summary": summary})


def report(log_format="text"):
    """Run kube-bench and report results.

    The full results are written to a new file in RESULTS_DIR, as plain text
    by default or as JSON when requested. A second kube-bench run produces the
    summary, which is set as action output together with the summary command
    and a 'juju scp' command for fetching the full results. The action is
    failed if the charm name is not recognised or if either kube-bench run
    cannot be started or exits with an error.

    :param: log_format: String determines if output is text or json
    :returns: Path to results log
    """
    Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

    # Node type is different depending on the charm
    app = hookenv.charm_name() or "unknown"
    version = "cis-1.23"
    if "master" in app:
        target = "master"
    elif "control-plane" in app:
        # must refer to this as upstream kube-bench tests do
        # wokeignore:rule=master
        target = "master"
    elif "worker" in app:
        target = "node"
    elif "etcd" in app:
        target = "etcd"
    else:
        _fail("Unable to determine the target to benchmark: {}".format(app))

    # Commands and log names are different depending on the format
    fields = dict(bin=BENCH_BIN, cfg=BENCH_CFG, ver=version, target=target)
    if log_format == "json":
        log_prefix = "results-json-"
        verbose_cmd = (
            "{bin} -D {cfg} --benchmark {ver} --json run --targets {target}"
        ).format(**fields)
    else:
        log_prefix = "results-text-"
        verbose_cmd = (
            "{bin} -D {cfg} --benchmark {ver} run --targets {target}"
        ).format(**fields)

    summary_cmd = (
        "{bin} -D {cfg} --benchmark {ver} "
        "--noremediations --noresults run --targets {target}"
    ).format(**fields)

    # Store full results for future consumption. delete=False keeps the file
    # after the context manager closes it.
    with tempfile.NamedTemporaryFile(
        mode="w+b", prefix=log_prefix, dir=RESULTS_DIR, delete=False
    ) as res_file:
        try:
            # check_call (unlike call) raises when the command exits non-zero.
            subprocess.check_call(
                shlex.split(verbose_cmd), stdout=res_file, stderr=subprocess.DEVNULL
            )
        except (subprocess.CalledProcessError, OSError):
            _fail("Failed to run: {}".format(verbose_cmd))
        else:
            # remember the filename for later (and make it readable, why not?)
            Path(res_file.name).chmod(0o644)
            log = res_file.name

    # When making a summary, we also have a verbose report. Set action output
    # so operators can see everything related to this run.
    try:
        out = subprocess.check_output(
            shlex.split(summary_cmd), universal_newlines=True, stderr=subprocess.DEVNULL
        )
    except (subprocess.CalledProcessError, OSError):
        _fail("Failed to run: {}".format(summary_cmd))
    else:
        fetch_cmd = "juju scp {unit}:{file} .".format(
            unit=hookenv.local_unit(), file=log
        )
        hookenv.action_set({"cmd": summary_cmd, "report": fetch_cmd, "summary": out})

    return log


if __name__ == "__main__":
    # Refuse to run unless at least one of the "snaps installed" flags is set.
    snap_flags = (
        "snap.installed.etcd",
        "kubernetes-master.snaps.installed",
        "kubernetes-control-plane.snaps.installed",
        "kubernetes-worker.snaps.installed",
        "kubernetes-node.snaps.installed",
    )
    if not any(is_flag_set(flag) for flag in snap_flags):
        _fail("Snaps are not yet installed on this unit.")

    # Validate action params
    release = hookenv.action_get("release") or "upstream"
    config = hookenv.action_get("config")
    if not config:
        _fail('Missing "config" parameter')
    remediations = hookenv.action_get("apply")
    if remediations not in ("none", "conservative", "dangerous", "reset"):
        _fail('Invalid "apply" parameter: {}'.format(remediations))

    # TODO: may want an option to overwrite an existing install
    already_installed = Path(BENCH_BIN).exists() and Path(BENCH_CFG).exists()
    if already_installed:
        hookenv.log("{} exists; skipping install".format(BENCH_HOME))
    else:
        hookenv.log("Installing benchmark from: {}".format(release))
        install(release, config)

    # Reset, remediate, or report
    if remediations == "reset":
        hookenv.log("Attempting to remove all remediations")
        reset()
    elif remediations == "none":
        hookenv.log("Report only; no remediations were requested")
        report(log_format="text")
    else:
        hookenv.log('Applying "{}" remediations'.format(remediations))
        apply(remediations)