diff --git a/salt/manager/tools/sbin/so-boot-mine-update b/salt/manager/tools/sbin/so-boot-mine-update new file mode 100755 index 000000000..79cd67844 --- /dev/null +++ b/salt/manager/tools/sbin/so-boot-mine-update @@ -0,0 +1,117 @@ +#!/bin/bash +# +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +# Runs once per boot on managers (via so-boot-mine-update.service), before +# so-boot-highstate.service. Waits for the responsive minion set to settle, pushes +# mine.update, waits until every up minion has actually reported to the mine, then +# warms the master's per-minion pillar cache so the mine-backed node pillars (node +# IPs, ES/Redis/Logstash/hypervisor discovery -- some glob- and some pillar/grain- +# targeted) are complete before the boot highstate renders them. Otherwise a node +# that is up but not yet fully reported gets dropped from those pillars and torn +# out of the configs they build (e.g. so-elasticsearch ExtraHosts -> container recreate). + +MAX_WAIT=${MINE_UPDATE_MAX_WAIT:-180} # hard backstop only +INTERVAL=10 +STABLE_CHECKS=3 # up-count must hold steady this many polls +elapsed=0 +prev=-1 +stable=0 +up=0 + +# Wait for the *reachable* minion set to settle rather than for every accepted +# key to report up: an operator may accept a minion's key and then intentionally +# power off that host, so requiring up >= accepted would never be satisfied and +# we'd always burn the full MAX_WAIT. Once the responsive count stops growing we +# stop waiting and run mine.update against whoever is up. +while [ "$elapsed" -lt "$MAX_WAIT" ]; do + up=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null \ + | python3 -c 'import sys,json; print(len(json.load(sys.stdin)))' 2>/dev/null) + up=${up:-0} + if [ "$up" -gt 0 ] && [ "$up" -eq "$prev" ]; then + stable=$((stable + 1)) + [ "$stable" -ge "$STABLE_CHECKS" ] && break + else + stable=0 + fi + prev=$up + sleep "$INTERVAL" + elapsed=$((elapsed + INTERVAL)) +done + +echo "so-boot-mine-update: ${up} minions up (settled after ${elapsed}s); running mine.update" +/usr/bin/salt '*' mine.update --out=txt + +# A node that is up but has not yet re-reported network.ip_addrs to the mine is +# silently dropped from mine-backed pillars (elasticsearch:nodes, node_data, ...) +# when highstate recompiles them -- which e.g. removes it from so-elasticsearch +# ExtraHosts and forces a container recreate. After the broad mine.update above, +# wait until every up minion actually has network.ip_addrs in the mine, re-pushing +# mine.update to stragglers, before releasing the boot highstate. Bounded by the +# same MAX_WAIT backstop so a slow/down node never blocks boot indefinitely. +missing="" +while [ "$elapsed" -lt "$MAX_WAIT" ]; do + up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null) + mine_json=$(/usr/bin/salt-run mine.get '*' network.ip_addrs tgt_type=glob --out=json 2>/dev/null) + missing=$(printf '%s' "$up_json" | python3 -c ' +import sys, json +up = set(json.load(sys.stdin) or []) +mine = {k for k, v in (json.loads(sys.argv[1]) or {}).items() if v} +print("\n".join(sorted(up - mine))) +' "$mine_json" 2>/dev/null) + if [ -z "$missing" ]; then + echo "so-boot-mine-update: mine complete for all up minions after ${elapsed}s" + break + fi + echo "so-boot-mine-update: mine missing up minion(s): $(echo $missing); re-running mine.update" + for m in $missing; do /usr/bin/salt "$m" mine.update --out=txt; done + sleep "$INTERVAL" + elapsed=$((elapsed + INTERVAL)) +done +[ -n "$missing" ] && echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; up minion(s) still absent from mine: $(echo $missing); highstate may drop them from configs" + +# The pillar/compound-targeted node pillars (elasticsearch:nodes, redis:nodes, +# logstash:nodes, hypervisor:nodes) resolve their target against the master's +# per-minion data cache (grains+pillar in .../minions//data.p), populated only +# when a minion's pillar is (re)compiled -- separately from the mine. A freshly +# booted node can be in the mine (glob/node_data sees it) yet absent from that +# cache, so it is dropped from those pillars and from the configs they build (e.g. +# so-elasticsearch ExtraHosts). Force a synchronous pillar refresh so the master +# caches every up node's pillar; refresh_pillar wait=True returns only once the +# pillar is recompiled (and thus cached for matching). Retry stragglers <= MAX_WAIT. +echo "so-boot-mine-update: warming master pillar cache for pillar/grain-targeted node pillars" +/usr/bin/salt '*' saltutil.refresh_pillar wait=True --out=txt +missing="" +while [ "$elapsed" -lt "$MAX_WAIT" ]; do + up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null) + cached_json=$(/usr/bin/salt-run cache.pillar tgt='*' --out=json 2>/dev/null) + missing=$(printf '%s' "$up_json" | python3 -c ' +import sys, json +up = set(json.load(sys.stdin) or []) +cached = {k for k, v in (json.loads(sys.argv[1]) or {}).items() if v} +print("\n".join(sorted(up - cached))) +' "$cached_json" 2>/dev/null) + if [ -z "$missing" ]; then + echo "so-boot-mine-update: pillar cache warm for all up minions after ${elapsed}s" + break + fi + echo "so-boot-mine-update: pillar not yet cached for: $(echo $missing); refreshing" + for m in $missing; do /usr/bin/salt "$m" saltutil.refresh_pillar wait=True --out=txt; done + sleep "$INTERVAL" + elapsed=$((elapsed + INTERVAL)) +done +[ -n "$missing" ] && echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; pillar not cached for: $(echo $missing); pillar-targeted pillars may drop them" + +# Log what the mine-backed pillars render so the boot-time state is inspectable. +/usr/bin/salt-call saltutil.refresh_pillar >/dev/null 2>&1 +sleep 2 +for key in node_data elasticsearch:nodes; do + rendered=$(/usr/bin/salt-call --out=json pillar.get "$key" 2>/dev/null \ + | python3 -c 'import sys,json; print(json.dumps(json.load(sys.stdin).get("local"), indent=2, sort_keys=True))' 2>/dev/null) + echo "so-boot-mine-update: ${key} rendered as:" + echo "${rendered:-null}" +done +exit 0 diff --git a/salt/salt/master.sls b/salt/salt/master.sls index 895150cd7..c62bd20f3 100644 --- a/salt/salt/master.sls +++ b/salt/salt/master.sls @@ -14,6 +14,7 @@ include: - salt.minion + - salt.master.boot_mine_update {% if 'vrt' in salt['pillar.get']('features', []) %} - salt.cloud - salt.cloud.reactor_config_hypervisor diff --git a/salt/salt/master/boot_mine_update.sls b/salt/salt/master/boot_mine_update.sls new file mode 100644 index 000000000..9f96c0ddf --- /dev/null +++ b/salt/salt/master/boot_mine_update.sls @@ -0,0 +1,29 @@ +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +# Manages /etc/systemd/system/so-boot-mine-update.service, a manager-only +# Type=oneshot unit that pushes `salt '*' mine.update` once per boot, ordered +# before so-boot-highstate.service so mine-backed pillars (node IPs, ES/Redis/ +# Logstash discovery) are fresh before the boot highstate renders them. + +include: + - systemd.reload + +so_boot_mine_update_unit_file: + file.managed: + - name: /etc/systemd/system/so-boot-mine-update.service + - source: salt://salt/service/so-boot-mine-update.service + - onchanges_in: + - module: systemd_reload + +# Only enable once setup is complete. Until then the gate file is missing and +# the unit's own ConditionPathExists would no-op it anyway. +so_boot_mine_update_service: + service.enabled: + - name: so-boot-mine-update.service + - onlyif: test -e /opt/so/state/setup-complete + - require: + - file: so_boot_mine_update_unit_file + - module: systemd_reload diff --git a/salt/salt/service/so-boot-mine-update.service b/salt/salt/service/so-boot-mine-update.service new file mode 100644 index 000000000..c5c6cdf7b --- /dev/null +++ b/salt/salt/service/so-boot-mine-update.service @@ -0,0 +1,15 @@ +[Unit] +Description=Security Onion boot-time grid mine.update (managers, runs once per boot before highstate) +After=salt-master.service salt-minion.service network-online.target +Wants=network-online.target +Requires=salt-master.service salt-minion.service +Before=so-boot-highstate.service +ConditionPathExists=/opt/so/state/setup-complete + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/usr/sbin/so-boot-mine-update + +[Install] +WantedBy=multi-user.target