From 9580976ba2a1317d1b99ef4091a964248170d8bb Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Mon, 8 Jun 2026 11:05:13 -0400 Subject: [PATCH 1/5] Add manager boot-time grid mine.update oneshot before highstate so-boot-mine-update.service is a manager-only Type=oneshot unit that runs once per boot after salt-master/salt-minion start and before so-boot-highstate.service. It pushes mine.update to all reachable minions so mine-backed pillars (node IPs, ES/Redis/Logstash discovery) are fresh before the boot highstate renders them. The helper waits for the responsive minion set to settle (plateau) rather than for every accepted key to report up, so an intentionally powered-off minion doesn't block the update; MAX_WAIT remains as a backstop. --- salt/manager/tools/sbin/so-boot-mine-update | 42 +++++++++++++++++++ salt/salt/master.sls | 1 + salt/salt/master/boot_mine_update.sls | 29 +++++++++++++ salt/salt/service/so-boot-mine-update.service | 15 +++++++ 4 files changed, 87 insertions(+) create mode 100755 salt/manager/tools/sbin/so-boot-mine-update create mode 100644 salt/salt/master/boot_mine_update.sls create mode 100644 salt/salt/service/so-boot-mine-update.service diff --git a/salt/manager/tools/sbin/so-boot-mine-update b/salt/manager/tools/sbin/so-boot-mine-update new file mode 100755 index 000000000..f497d891f --- /dev/null +++ b/salt/manager/tools/sbin/so-boot-mine-update @@ -0,0 +1,42 @@ +#!/bin/bash +# +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +# Runs once per boot on managers (via so-boot-mine-update.service), before +# so-boot-highstate.service. 
Waits for the responsive minion set to settle, then +# pushes mine.update to all minions so mine-backed pillars (node IPs, ES/Redis/ +# Logstash discovery) are fresh before the boot highstate renders them. + +MAX_WAIT=${MINE_UPDATE_MAX_WAIT:-180} # hard backstop only +INTERVAL=10 +STABLE_CHECKS=3 # up-count must hold steady this many polls +elapsed=0 +prev=-1 +stable=0 +up=0 + +# Wait for the *reachable* minion set to settle rather than for every accepted +# key to report up: an operator may accept a minion's key and then intentionally +# power off that host, so requiring up >= accepted would never be satisfied and +# we'd always burn the full MAX_WAIT. Once the responsive count stops growing we +# stop waiting and run mine.update against whoever is up. +while [ "$elapsed" -lt "$MAX_WAIT" ]; do + up=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null \ + | python3 -c 'import sys,json; print(len(json.load(sys.stdin)))' 2>/dev/null) + up=${up:-0} + if [ "$up" -gt 0 ] && [ "$up" -eq "$prev" ]; then + stable=$((stable + 1)) + [ "$stable" -ge "$STABLE_CHECKS" ] && break + else + stable=0 + fi + prev=$up + sleep "$INTERVAL" + elapsed=$((elapsed + INTERVAL)) +done + +echo "so-boot-mine-update: ${up} minions up (settled after ${elapsed}s); running mine.update" +/usr/bin/salt '*' mine.update --out=txt diff --git a/salt/salt/master.sls b/salt/salt/master.sls index 895150cd7..c62bd20f3 100644 --- a/salt/salt/master.sls +++ b/salt/salt/master.sls @@ -14,6 +14,7 @@ include: - salt.minion + - salt.master.boot_mine_update {% if 'vrt' in salt['pillar.get']('features', []) %} - salt.cloud - salt.cloud.reactor_config_hypervisor diff --git a/salt/salt/master/boot_mine_update.sls b/salt/salt/master/boot_mine_update.sls new file mode 100644 index 000000000..9f96c0ddf --- /dev/null +++ b/salt/salt/master/boot_mine_update.sls @@ -0,0 +1,29 @@ +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. 
Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +# Manages /etc/systemd/system/so-boot-mine-update.service, a manager-only +# Type=oneshot unit that pushes `salt '*' mine.update` once per boot, ordered +# before so-boot-highstate.service so mine-backed pillars (node IPs, ES/Redis/ +# Logstash discovery) are fresh before the boot highstate renders them. + +include: + - systemd.reload + +so_boot_mine_update_unit_file: + file.managed: + - name: /etc/systemd/system/so-boot-mine-update.service + - source: salt://salt/service/so-boot-mine-update.service + - onchanges_in: + - module: systemd_reload + +# Only enable once setup is complete. Until then the gate file is missing and +# the unit's own ConditionPathExists would no-op it anyway. +so_boot_mine_update_service: + service.enabled: + - name: so-boot-mine-update.service + - onlyif: test -e /opt/so/state/setup-complete + - require: + - file: so_boot_mine_update_unit_file + - module: systemd_reload diff --git a/salt/salt/service/so-boot-mine-update.service b/salt/salt/service/so-boot-mine-update.service new file mode 100644 index 000000000..c5c6cdf7b --- /dev/null +++ b/salt/salt/service/so-boot-mine-update.service @@ -0,0 +1,15 @@ +[Unit] +Description=Security Onion boot-time grid mine.update (managers, runs once per boot before highstate) +After=salt-master.service salt-minion.service network-online.target +Wants=network-online.target +Requires=salt-master.service salt-minion.service +Before=so-boot-highstate.service +ConditionPathExists=/opt/so/state/setup-complete + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=/usr/sbin/so-boot-mine-update + +[Install] +WantedBy=multi-user.target From e536ffa36387c63a97e820cf3606e44ae94da228 Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Tue, 9 Jun 2026 09:35:24 -0400 Subject: [PATCH 2/5] so-boot-mine-update: render node_data after mine.update 
before highstate After the boot-time mine.update, have the manager actually render the node_data pillar and log whether it came back populated. node_data: False makes salt/top.sls apply the bootstrap recovery branch instead of the manager's real config, so surfacing this in the journal makes the condition visible before so-boot-highstate runs. Best-effort and non-blocking: always exits 0 so highstate proceeds regardless. --- salt/manager/tools/sbin/so-boot-mine-update | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/salt/manager/tools/sbin/so-boot-mine-update b/salt/manager/tools/sbin/so-boot-mine-update index f497d891f..292b24ecc 100755 --- a/salt/manager/tools/sbin/so-boot-mine-update +++ b/salt/manager/tools/sbin/so-boot-mine-update @@ -40,3 +40,20 @@ done echo "so-boot-mine-update: ${up} minions up (settled after ${elapsed}s); running mine.update" /usr/bin/salt '*' mine.update --out=txt + +# Best-effort: confirm the manager can render node_data (non-False) now that the +# mine is updated. node_data: False makes salt/top.sls fall back to the bootstrap +# recovery branch instead of the manager's real config, so we surface that in the +# journal here. We never block highstate -- if still empty, the recovery branch +# and later highstates self-heal. 
+/usr/bin/salt-call saltutil.refresh_pillar >/dev/null 2>&1 +sleep 2 +status=$(/usr/bin/salt-call --out=json pillar.get node_data 2>/dev/null \ + | python3 -c 'import sys,json; d=json.load(sys.stdin).get("local"); print("rendered" if d else "empty")' 2>/dev/null) +status=${status:-empty} +if [ "$status" = "rendered" ]; then + echo "so-boot-mine-update: node_data renders; highstate will apply manager config" +else + echo "so-boot-mine-update: WARNING node_data still empty after mine.update; highstate may hit the bootstrap recovery branch" +fi +exit 0 From 8c306eb37dc04209e896347a3140092bcf2cb340 Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Tue, 9 Jun 2026 09:49:19 -0400 Subject: [PATCH 3/5] so-boot-mine-update: log the rendered node_data content Dump the actual rendered node_data pillar (pretty-printed JSON) to the journal instead of just a rendered/empty verdict, so the boot-time render attempt is fully inspectable. Empty renders print false/null and still emit the WARNING. --- salt/manager/tools/sbin/so-boot-mine-update | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/salt/manager/tools/sbin/so-boot-mine-update b/salt/manager/tools/sbin/so-boot-mine-update index 292b24ecc..38dd63191 100755 --- a/salt/manager/tools/sbin/so-boot-mine-update +++ b/salt/manager/tools/sbin/so-boot-mine-update @@ -48,10 +48,11 @@ echo "so-boot-mine-update: ${up} minions up (settled after ${elapsed}s); running # and later highstates self-heal. 
/usr/bin/salt-call saltutil.refresh_pillar >/dev/null 2>&1 sleep 2 -status=$(/usr/bin/salt-call --out=json pillar.get node_data 2>/dev/null \ - | python3 -c 'import sys,json; d=json.load(sys.stdin).get("local"); print("rendered" if d else "empty")' 2>/dev/null) -status=${status:-empty} -if [ "$status" = "rendered" ]; then +rendered=$(/usr/bin/salt-call --out=json pillar.get node_data 2>/dev/null \ + | python3 -c 'import sys,json; d=json.load(sys.stdin).get("local"); print(json.dumps(d, indent=2, sort_keys=True))' 2>/dev/null) +echo "so-boot-mine-update: node_data rendered as:" +echo "${rendered:-null}" +if [ -n "$rendered" ] && [ "$rendered" != "null" ] && [ "$rendered" != "false" ]; then echo "so-boot-mine-update: node_data renders; highstate will apply manager config" else echo "so-boot-mine-update: WARNING node_data still empty after mine.update; highstate may hit the bootstrap recovery branch" From 27c77023255caf5e2eede75f0ad03cd1350d9eeb Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Tue, 9 Jun 2026 10:10:32 -0400 Subject: [PATCH 4/5] so-boot-mine-update: wait for a complete mine before highstate Mine-backed pillars (node_data, elasticsearch:nodes, redis:nodes, logstash:nodes, hypervisor:nodes) include a node only if it returned an IP from the mine, and the configs they build are rebuilt fresh every highstate. After a manager reboot with a flushed mine, the first boot highstate could run before an up node re-reported network.ip_addrs, dropping it from e.g. so-elasticsearch ExtraHosts and forcing a container recreate. After the initial broad mine.update, poll until every currently-up minion actually has network.ip_addrs in the mine, re-pushing mine.update to stragglers, before releasing the boot highstate. Shares the existing MINE_UPDATE_MAX_WAIT backstop so a slow/down node never blocks boot, and still logs the rendered node_data for inspection. 
--- salt/manager/tools/sbin/so-boot-mine-update | 49 +++++++++++++++------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/salt/manager/tools/sbin/so-boot-mine-update b/salt/manager/tools/sbin/so-boot-mine-update index 38dd63191..85da4866a 100755 --- a/salt/manager/tools/sbin/so-boot-mine-update +++ b/salt/manager/tools/sbin/so-boot-mine-update @@ -6,9 +6,11 @@ # Elastic License 2.0. # Runs once per boot on managers (via so-boot-mine-update.service), before -# so-boot-highstate.service. Waits for the responsive minion set to settle, then -# pushes mine.update to all minions so mine-backed pillars (node IPs, ES/Redis/ -# Logstash discovery) are fresh before the boot highstate renders them. +# so-boot-highstate.service. Waits for the responsive minion set to settle, pushes +# mine.update, then waits until every up minion has actually reported to the mine +# so mine-backed pillars (node IPs, ES/Redis/Logstash discovery) are complete +# before the boot highstate renders them -- otherwise a not-yet-reported node gets +# dropped from those pillars and torn out of the configs they build. MAX_WAIT=${MINE_UPDATE_MAX_WAIT:-180} # hard backstop only INTERVAL=10 @@ -41,20 +43,39 @@ done echo "so-boot-mine-update: ${up} minions up (settled after ${elapsed}s); running mine.update" /usr/bin/salt '*' mine.update --out=txt -# Best-effort: confirm the manager can render node_data (non-False) now that the -# mine is updated. node_data: False makes salt/top.sls fall back to the bootstrap -# recovery branch instead of the manager's real config, so we surface that in the -# journal here. We never block highstate -- if still empty, the recovery branch -# and later highstates self-heal. +# A node that is up but has not yet re-reported network.ip_addrs to the mine is +# silently dropped from mine-backed pillars (elasticsearch:nodes, node_data, ...) +# when highstate recompiles them -- which e.g. removes it from so-elasticsearch +# ExtraHosts and forces a container recreate. 
After the broad mine.update above, +# wait until every up minion actually has network.ip_addrs in the mine, re-pushing +# mine.update to stragglers, before releasing the boot highstate. Bounded by the +# same MAX_WAIT backstop so a slow/down node never blocks boot indefinitely. +missing="" +while [ "$elapsed" -lt "$MAX_WAIT" ]; do + up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null) + mine_json=$(/usr/bin/salt-run mine.get '*' network.ip_addrs tgt_type=glob --out=json 2>/dev/null) + missing=$(printf '%s' "$up_json" | python3 -c ' +import sys, json +up = set(json.load(sys.stdin) or []) +mine = {k for k, v in (json.loads(sys.argv[1]) or {}).items() if v} +print("\n".join(sorted(up - mine))) +' "$mine_json" 2>/dev/null) + if [ -z "$missing" ]; then + echo "so-boot-mine-update: mine complete for all up minions after ${elapsed}s" + break + fi + echo "so-boot-mine-update: mine missing up minion(s): $(echo $missing); re-running mine.update" + for m in $missing; do /usr/bin/salt "$m" mine.update --out=txt; done + sleep "$INTERVAL" + elapsed=$((elapsed + INTERVAL)) +done +[ -n "$missing" ] && echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; up minion(s) still absent from mine: $(echo $missing); highstate may drop them from configs" + +# Log what node_data renders so the boot-time pillar state is inspectable. 
/usr/bin/salt-call saltutil.refresh_pillar >/dev/null 2>&1 sleep 2 rendered=$(/usr/bin/salt-call --out=json pillar.get node_data 2>/dev/null \ - | python3 -c 'import sys,json; d=json.load(sys.stdin).get("local"); print(json.dumps(d, indent=2, sort_keys=True))' 2>/dev/null) + | python3 -c 'import sys,json; print(json.dumps(json.load(sys.stdin).get("local"), indent=2, sort_keys=True))' 2>/dev/null) echo "so-boot-mine-update: node_data rendered as:" echo "${rendered:-null}" -if [ -n "$rendered" ] && [ "$rendered" != "null" ] && [ "$rendered" != "false" ]; then - echo "so-boot-mine-update: node_data renders; highstate will apply manager config" -else - echo "so-boot-mine-update: WARNING node_data still empty after mine.update; highstate may hit the bootstrap recovery branch" -fi exit 0 From f088a27159afea926531729e2a45b399172d160b Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Tue, 9 Jun 2026 13:52:19 -0400 Subject: [PATCH 5/5] so-boot-mine-update: warm master pillar cache before highstate A complete mine is not enough: elasticsearch:nodes, redis:nodes, logstash:nodes (tgt_type=pillar) and hypervisor:nodes (tgt_type=compound) resolve their target against the master's per-minion data cache (grains+pillar in data.p), which is populated only when a minion's pillar is recompiled -- separately from the mine. After a reboot a node can be in the mine (so node_data/glob sees it) yet absent from that cache, so it fails the elasticsearch:enabled:true pillar match and is dropped from elasticsearch:nodes -> so-elasticsearch ExtraHosts -> container recreate. After the mine-completeness wait, run salt '*' saltutil.refresh_pillar wait=True to synchronously cache every up node's pillar (the same lever deploy_newnode.sls uses), then verify with salt-run cache.pillar and retry stragglers, bounded by MINE_UPDATE_MAX_WAIT. Also log elasticsearch:nodes alongside node_data for inspection. 
--- salt/manager/tools/sbin/so-boot-mine-update | 54 +++++++++++++++++---- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/salt/manager/tools/sbin/so-boot-mine-update b/salt/manager/tools/sbin/so-boot-mine-update index 85da4866a..79cd67844 100755 --- a/salt/manager/tools/sbin/so-boot-mine-update +++ b/salt/manager/tools/sbin/so-boot-mine-update @@ -7,10 +7,12 @@ # Runs once per boot on managers (via so-boot-mine-update.service), before # so-boot-highstate.service. Waits for the responsive minion set to settle, pushes -# mine.update, then waits until every up minion has actually reported to the mine -# so mine-backed pillars (node IPs, ES/Redis/Logstash discovery) are complete -# before the boot highstate renders them -- otherwise a not-yet-reported node gets -# dropped from those pillars and torn out of the configs they build. +# mine.update, waits until every up minion has actually reported to the mine, then +# warms the master's per-minion pillar cache so the mine-backed node pillars (node +# IPs, ES/Redis/Logstash/hypervisor discovery -- some glob- and some pillar/grain- +# targeted) are complete before the boot highstate renders them. Otherwise a node +# that is up but not yet fully reported gets dropped from those pillars and torn +# out of the configs they build (e.g. so-elasticsearch ExtraHosts -> container recreate). MAX_WAIT=${MINE_UPDATE_MAX_WAIT:-180} # hard backstop only INTERVAL=10 @@ -71,11 +73,45 @@ print("\n".join(sorted(up - mine))) done [ -n "$missing" ] && echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; up minion(s) still absent from mine: $(echo $missing); highstate may drop them from configs" -# Log what node_data renders so the boot-time pillar state is inspectable. 
+# The pillar/compound-targeted node pillars (elasticsearch:nodes, redis:nodes, +# logstash:nodes, hypervisor:nodes) resolve their target against the master's +# per-minion data cache (grains+pillar in .../minions/<id>/data.p), populated only +# when a minion's pillar is (re)compiled -- separately from the mine. A freshly +# booted node can be in the mine (glob/node_data sees it) yet absent from that +# cache, so it is dropped from those pillars and from the configs they build (e.g. +# so-elasticsearch ExtraHosts). Force a synchronous pillar refresh so the master +# caches every up node's pillar; refresh_pillar wait=True returns only once the +# pillar is recompiled (and thus cached for matching). Retry stragglers <= MAX_WAIT. +echo "so-boot-mine-update: warming master pillar cache for pillar/grain-targeted node pillars" +/usr/bin/salt '*' saltutil.refresh_pillar wait=True --out=txt +missing="" +while [ "$elapsed" -lt "$MAX_WAIT" ]; do + up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null) + cached_json=$(/usr/bin/salt-run cache.pillar tgt='*' --out=json 2>/dev/null) + missing=$(printf '%s' "$up_json" | python3 -c ' +import sys, json +up = set(json.load(sys.stdin) or []) +cached = {k for k, v in (json.loads(sys.argv[1]) or {}).items() if v} +print("\n".join(sorted(up - cached))) +' "$cached_json" 2>/dev/null) + if [ -z "$missing" ]; then + echo "so-boot-mine-update: pillar cache warm for all up minions after ${elapsed}s" + break + fi + echo "so-boot-mine-update: pillar not yet cached for: $(echo $missing); refreshing" + for m in $missing; do /usr/bin/salt "$m" saltutil.refresh_pillar wait=True --out=txt; done + sleep "$INTERVAL" + elapsed=$((elapsed + INTERVAL)) +done +[ -n "$missing" ] && echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; pillar not cached for: $(echo $missing); pillar-targeted pillars may drop them" + +# Log what the mine-backed pillars render so the boot-time state is inspectable. 
/usr/bin/salt-call saltutil.refresh_pillar >/dev/null 2>&1 sleep 2 -rendered=$(/usr/bin/salt-call --out=json pillar.get node_data 2>/dev/null \ - | python3 -c 'import sys,json; print(json.dumps(json.load(sys.stdin).get("local"), indent=2, sort_keys=True))' 2>/dev/null) -echo "so-boot-mine-update: node_data rendered as:" -echo "${rendered:-null}" +for key in node_data elasticsearch:nodes; do + rendered=$(/usr/bin/salt-call --out=json pillar.get "$key" 2>/dev/null \ + | python3 -c 'import sys,json; print(json.dumps(json.load(sys.stdin).get("local"), indent=2, sort_keys=True))' 2>/dev/null) + echo "so-boot-mine-update: ${key} rendered as:" + echo "${rendered:-null}" +done exit 0