Merge pull request #15965 from Security-Onion-Solutions/nostartupstates

remove startup states from salt config
2026-07-28 11:43:27 +02:00 · 2026-06-09 16:26:47 -04:00
parent f0712bd780 448668a72e
commit 3ba96da3b7
4 changed files with 162 additions and 0 deletions
@@ -0,0 +1,117 @@
+#!/bin/bash
+#
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Runs once per boot on managers (via so-boot-mine-update.service), before
+# so-boot-highstate.service. Waits for the responsive minion set to settle, pushes
+# mine.update, waits until every up minion has actually reported to the mine, then
+# warms the master's per-minion pillar cache so the mine-backed node pillars (node
+# IPs, ES/Redis/Logstash/hypervisor discovery -- some glob- and some pillar/grain-
+# targeted) are complete before the boot highstate renders them. Otherwise a node
+# that is up but not yet fully reported gets dropped from those pillars and torn
+# out of the configs they build (e.g. so-elasticsearch ExtraHosts -> container recreate).
+
+MAX_WAIT=${MINE_UPDATE_MAX_WAIT:-180}   # hard backstop only
+INTERVAL=10
+STABLE_CHECKS=3                          # up-count must hold steady this many polls
+elapsed=0
+prev=-1
+stable=0
+up=0
+
+# Wait for the *reachable* minion set to settle rather than for every accepted
+# key to report up: an operator may accept a minion's key and then intentionally
+# power off that host, so requiring up >= accepted would never be satisfied and
+# we'd always burn the full MAX_WAIT. Once the responsive count stops growing we
+# stop waiting and run mine.update against whoever is up.
+while [ "$elapsed" -lt "$MAX_WAIT" ]; do
+  up=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null \
+    | python3 -c 'import sys,json; print(len(json.load(sys.stdin)))' 2>/dev/null)
+  up=${up:-0}
+  if [ "$up" -gt 0 ] && [ "$up" -eq "$prev" ]; then
+    stable=$((stable + 1))
+    [ "$stable" -ge "$STABLE_CHECKS" ] && break
+  else
+    stable=0
+  fi
+  prev=$up
+  sleep "$INTERVAL"
+  elapsed=$((elapsed + INTERVAL))
+done
+
+echo "so-boot-mine-update: ${up} minions up (settled after ${elapsed}s); running mine.update"
+/usr/bin/salt '*' mine.update --out=txt
+
+# A node that is up but has not yet re-reported network.ip_addrs to the mine is
+# silently dropped from mine-backed pillars (elasticsearch:nodes, node_data, ...)
+# when highstate recompiles them -- which e.g. removes it from so-elasticsearch
+# ExtraHosts and forces a container recreate. After the broad mine.update above,
+# wait until every up minion actually has network.ip_addrs in the mine, re-pushing
+# mine.update to stragglers, before releasing the boot highstate. Bounded by the
+# same MAX_WAIT backstop so a slow/down node never blocks boot indefinitely.
+#
+# A poll whose manage.up output cannot be parsed is treated as "unknown" and
+# retried instead of being reported as complete. Unparsable or empty mine.get
+# output is treated as an empty mine, so every up minion gets mine.update
+# re-pushed.
+missing=""
+mine_verified=0
+while [ "$elapsed" -lt "$MAX_WAIT" ]; do
+  up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null)
+  mine_json=$(/usr/bin/salt-run mine.get '*' network.ip_addrs tgt_type=glob --out=json 2>/dev/null)
+  # python3 exits 1 when the up list is unreadable and 0 otherwise. The
+  # pipeline status is python3's, so the `if` tells "nothing missing" apart
+  # from "could not tell".
+  if missing=$(printf '%s' "$up_json" | python3 -c '
+import sys, json
+try:
+    up = set(json.load(sys.stdin) or [])
+except (ValueError, TypeError):
+    sys.exit(1)
+try:
+    reported = json.loads(sys.argv[1])
+except ValueError:
+    reported = {}
+if not isinstance(reported, dict):
+    reported = {}
+mine = {k for k, v in reported.items() if v}
+print("\n".join(sorted(up - mine)))
+' "$mine_json" 2>/dev/null); then
+    if [ -z "$missing" ]; then
+      echo "so-boot-mine-update: mine complete for all up minions after ${elapsed}s"
+      mine_verified=1
+      break
+    fi
+    # $missing is newline-separated; the unquoted expansions split it on purpose.
+    echo "so-boot-mine-update: mine missing up minion(s): $(echo $missing); re-running mine.update"
+    for m in $missing; do /usr/bin/salt "$m" mine.update --out=txt; done
+  else
+    missing=""
+    echo "so-boot-mine-update: could not read the up-minion list from manage.up; retrying"
+  fi
+  sleep "$INTERVAL"
+  elapsed=$((elapsed + INTERVAL))
+done
+if [ "$mine_verified" -eq 0 ]; then
+  if [ -n "$missing" ]; then
+    echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; up minion(s) still absent from mine: $(echo $missing); highstate may drop them from configs"
+  else
+    # Either the budget was already spent before this loop could run, or the
+    # last poll could not be parsed.
+    echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit before mine contents could be verified; highstate may drop up minions from configs"
+  fi
+fi
+
+# The pillar/compound-targeted node pillars (elasticsearch:nodes, redis:nodes,
+# logstash:nodes, hypervisor:nodes) resolve their target against the master's
+# per-minion data cache (grains+pillar in .../minions/<id>/data.p), populated only
+# when a minion's pillar is (re)compiled -- separately from the mine. A freshly
+# booted node can be in the mine (glob/node_data sees it) yet absent from that
+# cache, so it is dropped from those pillars and from the configs they build (e.g.
+# so-elasticsearch ExtraHosts). Force a synchronous pillar refresh so the master
+# caches every up node's pillar; refresh_pillar wait=True returns only once the
+# pillar is recompiled (and thus cached for matching). Retry stragglers <= MAX_WAIT.
+#
+# A poll whose manage.up output cannot be parsed is treated as "unknown" and
+# retried instead of being reported as warm. Unparsable or empty cache.pillar
+# output is treated as an empty cache, so every up minion gets refreshed again.
+echo "so-boot-mine-update: warming master pillar cache for pillar/grain-targeted node pillars"
+/usr/bin/salt '*' saltutil.refresh_pillar wait=True --out=txt
+missing=""
+pillar_verified=0
+while [ "$elapsed" -lt "$MAX_WAIT" ]; do
+  up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null)
+  cached_json=$(/usr/bin/salt-run cache.pillar tgt='*' --out=json 2>/dev/null)
+  # python3 exits 1 when the up list is unreadable and 0 otherwise. The
+  # pipeline status is python3's, so the `if` tells "nothing missing" apart
+  # from "could not tell".
+  if missing=$(printf '%s' "$up_json" | python3 -c '
+import sys, json
+try:
+    up = set(json.load(sys.stdin) or [])
+except (ValueError, TypeError):
+    sys.exit(1)
+try:
+    pillars = json.loads(sys.argv[1])
+except ValueError:
+    pillars = {}
+if not isinstance(pillars, dict):
+    pillars = {}
+cached = {k for k, v in pillars.items() if v}
+print("\n".join(sorted(up - cached)))
+' "$cached_json" 2>/dev/null); then
+    if [ -z "$missing" ]; then
+      echo "so-boot-mine-update: pillar cache warm for all up minions after ${elapsed}s"
+      pillar_verified=1
+      break
+    fi
+    # $missing is newline-separated; the unquoted expansions split it on purpose.
+    echo "so-boot-mine-update: pillar not yet cached for: $(echo $missing); refreshing"
+    for m in $missing; do /usr/bin/salt "$m" saltutil.refresh_pillar wait=True --out=txt; done
+  else
+    missing=""
+    echo "so-boot-mine-update: could not read the up-minion list from manage.up; retrying"
+  fi
+  sleep "$INTERVAL"
+  elapsed=$((elapsed + INTERVAL))
+done
+if [ "$pillar_verified" -eq 0 ]; then
+  if [ -n "$missing" ]; then
+    echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; pillar not cached for: $(echo $missing); pillar-targeted pillars may drop them"
+  else
+    # Either the budget was already spent before this loop could run, or the
+    # last poll could not be parsed.
+    echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit before the pillar cache could be verified; pillar-targeted pillars may drop up minions"
+  fi
+fi
+
+# Log what the mine-backed pillars render so the boot-time state is inspectable.
+/usr/bin/salt-call saltutil.refresh_pillar >/dev/null 2>&1
+sleep 2
+for key in node_data elasticsearch:nodes; do
+  rendered=$(/usr/bin/salt-call --out=json pillar.get "$key" 2>/dev/null \
+    | python3 -c 'import sys,json; print(json.dumps(json.load(sys.stdin).get("local"), indent=2, sort_keys=True))' 2>/dev/null)
+  echo "so-boot-mine-update: ${key} rendered as:"
+  echo "${rendered:-null}"
+done
+exit 0
@@ -14,6 +14,7 @@

 include:
  - salt.minion
+  - salt.master.boot_mine_update
 {%   if 'vrt' in salt['pillar.get']('features', []) %}
  - salt.cloud
  - salt.cloud.reactor_config_hypervisor
@@ -0,0 +1,29 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Manages /etc/systemd/system/so-boot-mine-update.service, a manager-only
+# Type=oneshot unit that pushes `salt '*' mine.update` once per boot, ordered
+# before so-boot-highstate.service so mine-backed pillars (node IPs, ES/Redis/
+# Logstash discovery) are fresh before the boot highstate renders them.
+# The unit's ExecStart is /usr/sbin/so-boot-mine-update; this state file does
+# not install that script itself.
+
+# systemd_reload (referenced below) is presumably defined by systemd.reload;
+# that file is not visible here.
+include:
+  - systemd.reload
+
+# Install the unit file from the salt fileserver. onchanges_in registers this
+# file as an onchanges requisite of systemd_reload, so a change to the unit
+# file triggers that state.
+so_boot_mine_update_unit_file:
+  file.managed:
+    - name: /etc/systemd/system/so-boot-mine-update.service
+    - source: salt://salt/service/so-boot-mine-update.service
+    - onchanges_in:
+      - module: systemd_reload
+
+# Only enable once setup is complete. Until then the gate file is missing and
+# the unit's own ConditionPathExists would no-op it anyway.
+# service.enabled only enables the unit; nothing here starts it.
+so_boot_mine_update_service:
+  service.enabled:
+    - name: so-boot-mine-update.service
+    - onlyif: test -e /opt/so/state/setup-complete
+    - require:
+      - file: so_boot_mine_update_unit_file
+      - module: systemd_reload
@@ -0,0 +1,15 @@
+# Oneshot unit that runs /usr/sbin/so-boot-mine-update, ordered ahead of
+# so-boot-highstate.service.
+[Unit]
+Description=Security Onion boot-time grid mine.update (managers, runs once per boot before highstate)
+# Ordering: start only after the salt services and network-online.target.
+After=salt-master.service salt-minion.service network-online.target
+Wants=network-online.target
+# Hard dependency on both salt services.
+Requires=salt-master.service salt-minion.service
+# Ordering only: so-boot-highstate.service is held back until this unit's
+# ExecStart has finished (Type=oneshot).
+Before=so-boot-highstate.service
+# The unit is skipped, not failed, while the setup-complete marker is absent.
+ConditionPathExists=/opt/so/state/setup-complete
+
+[Service]
+# oneshot: the unit counts as started only once ExecStart has exited.
+Type=oneshot
+# Stay "active" after the script exits.
+RemainAfterExit=yes
+ExecStart=/usr/sbin/so-boot-mine-update
+
+[Install]
+WantedBy=multi-user.target