reduce highstate frequency with active push for rules and pillars

- schedule highstate every 2 hours (was 15 minutes); interval lives in
  global:push:highstate_interval_hours so the SOC admin UI can tune it and
  so-salt-minion-check derives its threshold as (interval + 1) * 3600
- add inotify beacon on the manager + master reactor + orch.push_batch that
  writes per-app intent files, with a so-push-drainer schedule on the manager
  that debounces, dedupes, and dispatches a single orchestration
- pillar_push_map.yaml allowlists the apps whose pillar changes trigger an
  immediate targeted state.apply (targets verified against salt/top.sls);
  edits under pillar/minions/ trigger a state.highstate on that one minion
- host-batch every push orchestration (batch: 25%, batch_wait: 15) so rule
  changes don't thundering-herd large fleets
- new global:push:enabled kill-switch tears down the beacon, reactor config,
  and drainer schedule on the next highstate for operators who want to keep
  highstate-only behavior
- set restart_policy: unless-stopped on 23 container states so docker
  recovers crashes without waiting for the next highstate; leave registry
  (always), strelka/backend (on-failure), kratos, and hydra alone with
  inline comments explaining why
This commit is contained in:
Mike Reeves
2026-04-10 15:43:16 -04:00
parent 81afbd32d4
commit a0cf0489d6
42 changed files with 927 additions and 10 deletions
+128
View File
@@ -0,0 +1,128 @@
# One pillar directory can map to multiple (state, tgt) actions.
# tgt is a raw salt compound expression. tgt_type is always "compound".
# Per-action `batch` / `batch_wait` override the orch defaults (25% / 15s).
#
# Notes:
# - `bpf` is a pillar-only dir (no state of its own) consumed by both
# zeek and suricata via macros, so a bpf pillar change re-applies both.
# - suricata/strelka/zeek/elasticsearch/redis/kafka/logstash etc. have
# their own pillar dirs AND their own state, so they map 1:1 (or 1:2
# in strelka's case, because of the split init.sls / manager.sls).
# - `data` and `node_data` pillar dirs are intentionally omitted --
# they're pillar-only data consumed by many states; trying to handle
# them generically would amount to a highstate.
#
# The role sets here were verified line-by-line against salt/top.sls. If
# salt/top.sls changes how an app is targeted, update the corresponding
# compound here.
# firewall: the one pillar everyone touches. Applied everywhere intentionally
# because every host's iptables needs to know about every other host in the
# grid. Salt's firewall state is idempotent (file.managed + iptables-restore
# onchanges in salt/firewall/init.sls), so hosts whose rendered firewall is
# unchanged do a file comparison and no-op without touching iptables -- actual
# reload happens only on the hosts whose rules actually changed. Fleetwide
# blast radius is intentional and matches the pre-plan behavior via highstate.
# Adding N sensors in a burst coalesces into one dispatch via the drainer.
firewall:
- state: firewall
tgt: '*'
# bpf is pillar-only (no state); consumed by both zeek and suricata as macros.
# Both states run on sensor_roles + so-import per salt/top.sls.
bpf:
- state: zeek
tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
- state: suricata
tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
# ca is applied universally.
ca:
- state: ca
tgt: '*'
# elastalert: eval, standalone, manager, managerhype, managersearch (NOT import).
elastalert:
- state: elastalert
tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
# elasticsearch: 8 roles.
elasticsearch:
- state: elasticsearch
tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-standalone'
# elasticagent: so-heavynode only.
elasticagent:
- state: elasticagent
tgt: 'G@role:so-heavynode'
# elasticfleet: base state only on pillar change. elasticfleet.install_agent_grid
# is a deploy/enrollment step, not a config reload; leave it to the next highstate.
elasticfleet:
- state: elasticfleet
tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
# healthcheck: eval, sensor, standalone only.
healthcheck:
- state: healthcheck
tgt: 'G@role:so-eval or G@role:so-sensor or G@role:so-standalone'
# influxdb: manager_roles exactly.
influxdb:
- state: influxdb
tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
# kafka: standalone, manager, managerhype, managersearch, searchnode, receiver.
kafka:
- state: kafka
tgt: 'G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
# kibana: manager_roles exactly.
kibana:
- state: kibana
tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
# logstash: 8 roles, no eval/import.
logstash:
- state: logstash
tgt: 'G@role:so-fleet or G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
# nginx: 10 specific roles. NOT receiver, idh, hypervisor, desktop.
nginx:
- state: nginx
tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
# redis: 6 roles. standalone, manager, managerhype, managersearch, heavynode, receiver.
# (NOT eval, NOT import, NOT searchnode.)
redis:
- state: redis
tgt: 'G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-standalone'
# soc: manager_roles exactly.
soc:
- state: soc
tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
# strelka: sensor-side only on pillar change (sensor_roles). strelka.manager is
# intentionally NOT fired on pillar changes -- YARA rule and strelka config
# pillar changes are consumed by the sensor-side strelka backend, and re-running
# strelka.manager on managers is both unnecessary and disruptive. strelka.manager
# is left to the 2-hour highstate.
strelka:
- state: strelka
tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-sensor or G@role:so-standalone'
# suricata: sensor_roles + so-import (5 roles).
suricata:
- state: suricata
tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
# telegraf: universal.
telegraf:
- state: telegraf
tgt: '*'
# zeek: sensor_roles + so-import (5 roles).
zeek:
- state: zeek
tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+170
View File
@@ -0,0 +1,170 @@
#!py
# Reactor invoked by the inotify beacon on pillar file changes under
# /opt/so/saltstack/local/pillar/.
#
# Two branches:
# A) per-minion override under pillar/minions/<id>.sls or adv_<id>.sls
# -> write an intent that runs state.highstate on just that minion.
# B) shared app pillar (pillar/<app>/...) -> look up <app> in
# pillar_push_map.yaml and write an intent with the entry's actions.
#
# Reactors never dispatch directly. The so-push-drainer schedule picks up
# ready intents, dedupes across pending files, and dispatches orch.push_batch.
# See plan /home/mreeves/.claude/plans/goofy-marinating-hummingbird.md.
import fcntl
import json
import logging
import os
import time
import salt.client
import yaml
LOG = logging.getLogger(__name__)
PENDING_DIR = '/opt/so/state/push_pending'
LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
MAX_PATHS = 20
PILLAR_ROOT = '/opt/so/saltstack/local/pillar/'
MINIONS_PREFIX = PILLAR_ROOT + 'minions/'
# The pillar_push_map.yaml is shipped via salt:// but the reactor runs on the
# master, which mounts the default saltstack tree at this path.
PUSH_MAP_PATH = '/opt/so/saltstack/default/salt/reactor/pillar_push_map.yaml'
_PUSH_MAP_CACHE = {'mtime': 0, 'data': None}
def _load_push_map():
try:
st = os.stat(PUSH_MAP_PATH)
except OSError:
LOG.warning('push_pillar: %s not found', PUSH_MAP_PATH)
return {}
if _PUSH_MAP_CACHE['mtime'] != st.st_mtime:
try:
with open(PUSH_MAP_PATH, 'r') as f:
_PUSH_MAP_CACHE['data'] = yaml.safe_load(f) or {}
except Exception:
LOG.exception('push_pillar: failed to load %s', PUSH_MAP_PATH)
_PUSH_MAP_CACHE['data'] = {}
_PUSH_MAP_CACHE['mtime'] = st.st_mtime
return _PUSH_MAP_CACHE['data'] or {}
def _push_enabled():
try:
caller = salt.client.Caller()
return bool(caller.cmd('pillar.get', 'global:push:enabled', True))
except Exception:
LOG.exception('push_pillar: pillar.get global:push:enabled failed, assuming enabled')
return True
def _write_intent(key, actions, path):
now = time.time()
try:
os.makedirs(PENDING_DIR, exist_ok=True)
except OSError:
LOG.exception('push_pillar: cannot create %s', PENDING_DIR)
return
intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
try:
fcntl.flock(lock_fd, fcntl.LOCK_EX)
intent = {}
if os.path.exists(intent_path):
try:
with open(intent_path, 'r') as f:
intent = json.load(f)
except (IOError, ValueError):
intent = {}
intent.setdefault('first_touch', now)
intent['last_touch'] = now
intent['actions'] = actions
paths = intent.get('paths', [])
if path and path not in paths:
paths.append(path)
paths = paths[-MAX_PATHS:]
intent['paths'] = paths
tmp_path = intent_path + '.tmp'
with open(tmp_path, 'w') as f:
json.dump(intent, f)
os.rename(tmp_path, intent_path)
except Exception:
LOG.exception('push_pillar: failed to write intent %s', intent_path)
finally:
try:
fcntl.flock(lock_fd, fcntl.LOCK_UN)
finally:
os.close(lock_fd)
def _minion_id_from_path(path):
# path is e.g. /opt/so/saltstack/local/pillar/minions/sensor1.sls
# or /opt/so/saltstack/local/pillar/minions/adv_sensor1.sls
filename = os.path.basename(path)
if not filename.endswith('.sls'):
return None
stem = filename[:-4]
if stem.startswith('adv_'):
stem = stem[4:]
return stem or None
def _app_from_path(path):
# path is e.g. /opt/so/saltstack/local/pillar/zeek/soc_zeek.sls -> 'zeek'
remainder = path[len(PILLAR_ROOT):]
if '/' not in remainder:
return None
return remainder.split('/', 1)[0] or None
def run():
if not _push_enabled():
LOG.info('push_pillar: push disabled, skipping')
return {}
path = data.get('data', {}).get('path', '') # noqa: F821 -- data provided by reactor
if not path or not path.startswith(PILLAR_ROOT):
LOG.debug('push_pillar: ignoring path outside pillar root: %s', path)
return {}
# Branch A: per-minion override
if path.startswith(MINIONS_PREFIX):
minion_id = _minion_id_from_path(path)
if not minion_id:
LOG.debug('push_pillar: ignoring non-sls path under minions/: %s', path)
return {}
actions = [{'highstate': True, 'tgt': minion_id, 'tgt_type': 'glob'}]
_write_intent('minion_{}'.format(minion_id), actions, path)
LOG.info('push_pillar: per-minion intent updated for %s (path=%s)', minion_id, path)
return {}
# Branch B: shared app pillar -> allowlist lookup
app = _app_from_path(path)
if not app:
LOG.debug('push_pillar: ignoring path with no app segment: %s', path)
return {}
push_map = _load_push_map()
entry = push_map.get(app)
if not entry:
LOG.warning(
'push_pillar: pillar dir "%s" is not in pillar_push_map.yaml; '
'change will be picked up at the next scheduled highstate (path=%s)',
app, path,
)
return {}
actions = list(entry) # copy to avoid mutating the cache
_write_intent('pillar_{}'.format(app), actions, path)
LOG.info('push_pillar: app intent updated for %s (path=%s)', app, path)
return {}
+96
View File
@@ -0,0 +1,96 @@
#!py
# Reactor invoked by the inotify beacon on rule file changes under
# /opt/so/saltstack/local/salt/strelka/rules/compiled/.
#
# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_strelka.json
# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
# across pending files, and dispatches orch.push_batch. Reactors never dispatch
# directly -- see plan /home/mreeves/.claude/plans/goofy-marinating-hummingbird.md.
import fcntl
import json
import logging
import os
import time
import salt.client
LOG = logging.getLogger(__name__)
PENDING_DIR = '/opt/so/state/push_pending'
LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
MAX_PATHS = 20
# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Sensor-side
# strelka runs on exactly these four roles; so-import gets strelka.manager
# instead, which is not fired on pillar changes.
SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
def _sensor_compound():
return ' or '.join('G@role:{}'.format(r) for r in SENSOR_ROLES)
def _push_enabled():
try:
caller = salt.client.Caller()
return bool(caller.cmd('pillar.get', 'global:push:enabled', True))
except Exception:
LOG.exception('push_strelka: pillar.get global:push:enabled failed, assuming enabled')
return True
def _write_intent(key, actions, path):
now = time.time()
try:
os.makedirs(PENDING_DIR, exist_ok=True)
except OSError:
LOG.exception('push_strelka: cannot create %s', PENDING_DIR)
return
intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
try:
fcntl.flock(lock_fd, fcntl.LOCK_EX)
intent = {}
if os.path.exists(intent_path):
try:
with open(intent_path, 'r') as f:
intent = json.load(f)
except (IOError, ValueError):
intent = {}
intent.setdefault('first_touch', now)
intent['last_touch'] = now
intent['actions'] = actions
paths = intent.get('paths', [])
if path and path not in paths:
paths.append(path)
paths = paths[-MAX_PATHS:]
intent['paths'] = paths
tmp_path = intent_path + '.tmp'
with open(tmp_path, 'w') as f:
json.dump(intent, f)
os.rename(tmp_path, intent_path)
except Exception:
LOG.exception('push_strelka: failed to write intent %s', intent_path)
finally:
try:
fcntl.flock(lock_fd, fcntl.LOCK_UN)
finally:
os.close(lock_fd)
def run():
if not _push_enabled():
LOG.info('push_strelka: push disabled, skipping')
return {}
path = data.get('data', {}).get('path', '') # noqa: F821 -- data provided by reactor
actions = [{'state': 'strelka', 'tgt': _sensor_compound()}]
_write_intent('rules_strelka', actions, path)
LOG.info('push_strelka: intent updated for path=%s', path)
return {}
+95
View File
@@ -0,0 +1,95 @@
#!py
# Reactor invoked by the inotify beacon on rule file changes under
# /opt/so/saltstack/local/salt/suricata/rules/.
#
# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_suricata.json
# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
# across pending files, and dispatches orch.push_batch. Reactors never dispatch
# directly -- see plan /home/mreeves/.claude/plans/goofy-marinating-hummingbird.md.
import fcntl
import json
import logging
import os
import time
import salt.client
LOG = logging.getLogger(__name__)
PENDING_DIR = '/opt/so/state/push_pending'
LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
MAX_PATHS = 20
# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Suricata also
# runs on so-import per salt/top.sls, so that role is appended below.
SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
def _sensor_compound_plus_import():
return ' or '.join('G@role:{}'.format(r) for r in SENSOR_ROLES) + ' or G@role:so-import'
def _push_enabled():
try:
caller = salt.client.Caller()
return bool(caller.cmd('pillar.get', 'global:push:enabled', True))
except Exception:
LOG.exception('push_suricata: pillar.get global:push:enabled failed, assuming enabled')
return True
def _write_intent(key, actions, path):
now = time.time()
try:
os.makedirs(PENDING_DIR, exist_ok=True)
except OSError:
LOG.exception('push_suricata: cannot create %s', PENDING_DIR)
return
intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
try:
fcntl.flock(lock_fd, fcntl.LOCK_EX)
intent = {}
if os.path.exists(intent_path):
try:
with open(intent_path, 'r') as f:
intent = json.load(f)
except (IOError, ValueError):
intent = {}
intent.setdefault('first_touch', now)
intent['last_touch'] = now
intent['actions'] = actions
paths = intent.get('paths', [])
if path and path not in paths:
paths.append(path)
paths = paths[-MAX_PATHS:]
intent['paths'] = paths
tmp_path = intent_path + '.tmp'
with open(tmp_path, 'w') as f:
json.dump(intent, f)
os.rename(tmp_path, intent_path)
except Exception:
LOG.exception('push_suricata: failed to write intent %s', intent_path)
finally:
try:
fcntl.flock(lock_fd, fcntl.LOCK_UN)
finally:
os.close(lock_fd)
def run():
if not _push_enabled():
LOG.info('push_suricata: push disabled, skipping')
return {}
path = data.get('data', {}).get('path', '') # noqa: F821 -- data provided by reactor
actions = [{'state': 'suricata', 'tgt': _sensor_compound_plus_import()}]
_write_intent('rules_suricata', actions, path)
LOG.info('push_suricata: intent updated for path=%s', path)
return {}