mirror of
https://github.com/Security-Onion-Solutions/securityonion.git
synced 2026-06-12 13:19:22 +02:00
27c7702325
Mine-backed pillars (node_data, elasticsearch:nodes, redis:nodes, logstash:nodes, hypervisor:nodes) include a node only if it returned an IP from the mine, and the configs they build are rebuilt fresh every highstate. After a manager reboot with a flushed mine, the first boot highstate could run before an up node re-reported network.ip_addrs, dropping it from e.g. so-elasticsearch ExtraHosts and forcing a container recreate. After the initial broad mine.update, poll until every currently-up minion actually has network.ip_addrs in the mine, re-pushing mine.update to stragglers, before releasing the boot highstate. Shares the existing MINE_UPDATE_MAX_WAIT backstop so a slow/down node never blocks boot, and still logs the rendered node_data for inspection.
82 lines
3.8 KiB
Bash
Executable File
82 lines
3.8 KiB
Bash
Executable File
#!/bin/bash
|
|
#
|
|
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
|
|
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
|
|
# https://securityonion.net/license; you may not use this file except in compliance with the
|
|
# Elastic License 2.0.
|
|
|
|
# Runs once per boot on managers (via so-boot-mine-update.service), before
|
|
# so-boot-highstate.service. Waits for the responsive minion set to settle, pushes
|
|
# mine.update, then waits until every up minion has actually reported to the mine
|
|
# so mine-backed pillars (node IPs, ES/Redis/Logstash discovery) are complete
|
|
# before the boot highstate renders them -- otherwise a not-yet-reported node gets
|
|
# dropped from those pillars and torn out of the configs they build.
|
|
|
|
# Tunables and loop state shared by the wait loops below.
#
#   MAX_WAIT       upper bound that `elapsed` is compared against in both wait
#                  loops (hard backstop only); taken from the
#                  MINE_UPDATE_MAX_WAIT environment variable, default 180.
#   INTERVAL       seconds slept between polls.
#   STABLE_CHECKS  up-count must hold steady this many polls.

# Print $1 when it is a plain non-negative integer, otherwise print the
# fallback $2. The wait loops compare MAX_WAIT with `[ ... -lt ... ]`; a
# non-numeric override would make that test error out and silently skip
# every wait, so it is rejected here instead.
uint_or_default() {
  case "$1" in
    '' | *[!0-9]*) printf '%s\n' "$2" ;;
    *) printf '%s\n' "$1" ;;
  esac
}

MAX_WAIT=$(uint_or_default "${MINE_UPDATE_MAX_WAIT:-}" 180)
INTERVAL=10
STABLE_CHECKS=3
elapsed=0   # time budget consumed so far, compared against MAX_WAIT
prev=-1     # up-count seen on the previous poll (-1 = no poll yet)
stable=0    # consecutive polls with an unchanged, non-zero up-count
up=0        # up-count from the most recent poll
|
|
|
|
# Wait for the *reachable* minion set to settle rather than for every accepted
# key to report up: an operator may accept a minion's key and then intentionally
# power off that host, so requiring up >= accepted would never be satisfied and
# we'd always burn the full MAX_WAIT. Once the responsive count stops growing we
# stop waiting and run mine.update against whoever is up.
#
# `elapsed` is taken from bash's SECONDS (seconds since this shell started)
# rather than by summing the sleeps, so the time spent inside salt-run itself
# also counts against the MAX_WAIT backstop.
while [ "$elapsed" -lt "$MAX_WAIT" ]; do
  up=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null \
    | python3 -c 'import sys,json; print(len(json.load(sys.stdin)))' 2>/dev/null)
  # Empty output means salt-run or the JSON parse failed; count it as nobody up.
  up=${up:-0}
  if [ "$up" -gt 0 ] && [ "$up" -eq "$prev" ]; then
    stable=$((stable + 1))
    [ "$stable" -ge "$STABLE_CHECKS" ] && break
  else
    stable=0
  fi
  prev=$up
  sleep "$INTERVAL"
  elapsed=$SECONDS
done

# The loop can also end because the backstop expired; say so, since the
# message below reads "settled" either way.
if [ "$stable" -lt "$STABLE_CHECKS" ]; then
  echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit before the up-minion count settled"
fi
echo "so-boot-mine-update: ${up} minions up (settled after ${elapsed}s); running mine.update"
/usr/bin/salt '*' mine.update --out=txt
|
|
|
|
# A node that is up but has not yet re-reported network.ip_addrs to the mine is
# silently dropped from mine-backed pillars (elasticsearch:nodes, node_data, ...)
# when highstate recompiles them -- which e.g. removes it from so-elasticsearch
# ExtraHosts and forces a container recreate. After the broad mine.update above,
# wait until every up minion actually has network.ip_addrs in the mine, re-pushing
# mine.update to stragglers, before releasing the boot highstate. Bounded by the
# same MAX_WAIT backstop so a slow/down node never blocks boot indefinitely.

# Print, sorted and one per line, the minions in the JSON list $1 (manage.up
# output) that have no non-empty entry in the JSON object $2 (mine.get output).
# Empty or null mine output is treated as an empty mine. Returns non-zero when
# the up list is empty/unparsable or the mine output is not a JSON object, so a
# failed salt-run is never mistaken for "nothing is missing".
so_mine_missing() {
  python3 -c '
import json
import sys


def parse(text, empty):
    text = text.strip()
    if not text:
        return empty
    value = json.loads(text)
    return empty if value is None else value


try:
    up = parse(sys.argv[1], None)
    mine = parse(sys.argv[2], {})
except ValueError:
    sys.exit(1)
if not isinstance(up, list) or not isinstance(mine, dict):
    sys.exit(1)
reported = {name for name, addrs in mine.items() if addrs}
print("\n".join(sorted(set(up) - reported)))
' "$1" "$2"
}

missing=""
# unchecked: loop never ran; complete: nothing missing; missing: stragglers
# remain; unknown: the last poll could not be evaluated.
mine_state=unchecked
while [ "$elapsed" -lt "$MAX_WAIT" ]; do
  up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null)
  mine_json=$(/usr/bin/salt-run mine.get '*' network.ip_addrs tgt_type=glob --out=json 2>/dev/null)
  # SECONDS is bash's wall clock since shell start, so salt-run time counts too.
  elapsed=$SECONDS
  if missing=$(so_mine_missing "$up_json" "$mine_json"); then
    if [ -z "$missing" ]; then
      mine_state=complete
      echo "so-boot-mine-update: mine complete for all up minions after ${elapsed}s"
      break
    fi
    mine_state=missing
    mapfile -t stragglers <<<"$missing"
    echo "so-boot-mine-update: mine missing up minion(s): ${stragglers[*]}; re-running mine.update"
    # One list-targeted call instead of one blocking salt call per straggler.
    /usr/bin/salt -L "$(IFS=,; printf '%s' "${stragglers[*]}")" mine.update --out=txt
  else
    mine_state=unknown
    missing=""
    echo "so-boot-mine-update: could not read up-minion list / mine contents from salt-run; retrying"
  fi
  sleep "$INTERVAL"
  elapsed=$SECONDS
done

case "$mine_state" in
  missing)
    echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; up minion(s) still absent from mine: ${stragglers[*]}; highstate may drop them from configs"
    ;;
  unknown)
    echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; mine contents could not be verified; highstate may drop nodes from configs"
    ;;
esac
|
|
|
|
# Refresh the pillar, give it a moment, then print the node_data pillar as
# sorted, indented JSON so the boot-time pillar state can be inspected in the
# log. Prints "null" when nothing could be rendered. Always exits 0.
/usr/bin/salt-call saltutil.refresh_pillar >/dev/null 2>&1
sleep 2

pillar_json=$(/usr/bin/salt-call --out=json pillar.get node_data 2>/dev/null)
# salt-call wraps its result under the "local" key; unwrap and pretty-print it.
rendered=$(printf '%s' "$pillar_json" \
  | python3 -c 'import sys,json; print(json.dumps(json.load(sys.stdin).get("local"), indent=2, sort_keys=True))' 2>/dev/null)

printf '%s\n' "so-boot-mine-update: node_data rendered as:" "${rendered:-null}"
exit 0
|