reduce highstate frequency with active push for rules and pillars

- schedule highstate every 2 hours (was 15 minutes); interval lives in global:push:highstate_interval_hours so the SOC admin UI can tune it and so-salt-minion-check derives its threshold as (interval + 1) * 3600
- add inotify beacon on the manager + master reactor + orch.push_batch that writes per-app intent files, with a so-push-drainer schedule on the manager that debounces, dedupes, and dispatches a single orchestration
- pillar_push_map.yaml allowlists the apps whose pillar changes trigger an immediate targeted state.apply (targets verified against salt/top.sls); edits under pillar/minions/ trigger a state.highstate on that one minion
- host-batch every push orchestration (batch: 25%, batch_wait: 15) so rule changes don't thundering-herd large fleets
- new global:push:enabled kill-switch tears down the beacon, reactor config, and drainer schedule on the next highstate for operators who want to keep highstate-only behavior
- set restart_policy: unless-stopped on 23 container states so docker recovers crashes without waiting for the next highstate; leave registry (always), strelka/backend (on-failure), kratos, and hydra alone with inline comments explaining why
2026-06-03 08:55:32 +02:00 · 2026-04-10 15:43:16 -04:00
parent 81afbd32d4
commit a0cf0489d6
42 changed files with 927 additions and 10 deletions
@@ -0,0 +1,128 @@
+# One pillar directory can map to multiple (state, tgt) actions.
+# tgt is a raw salt compound expression. tgt_type is always "compound".
+# Per-action `batch` / `batch_wait` override the orch defaults (25% / 15s).
+#
+# Notes:
+#   - `bpf` is a pillar-only dir (no state of its own) consumed by both
+#     zeek and suricata via macros, so a bpf pillar change re-applies both.
+#   - suricata/strelka/zeek/elasticsearch/redis/kafka/logstash etc. have
+#     their own pillar dirs AND their own state, so they map 1:1. strelka
+#     is split into init.sls / manager.sls, but only the sensor-side
+#     `strelka` state is pushed from here (see the strelka entry below).
+#   - `data` and `node_data` pillar dirs are intentionally omitted --
+#     they're pillar-only data consumed by many states; trying to handle
+#     them generically would amount to a highstate.
+#
+# The role sets here were verified line-by-line against salt/top.sls. If
+# salt/top.sls changes how an app is targeted, update the corresponding
+# compound here.
+
+# firewall: the one pillar everyone touches. Applied everywhere intentionally
+# because every host's iptables needs to know about every other host in the
+# grid. Salt's firewall state is idempotent (file.managed + iptables-restore
+# onchanges in salt/firewall/init.sls), so hosts whose rendered firewall is
+# unchanged do a file comparison and no-op without touching iptables -- actual
+# reload happens only on the hosts whose rules actually changed. Fleetwide
+# blast radius is intentional and matches the pre-plan behavior via highstate.
+# Adding N sensors in a burst coalesces into one dispatch via the drainer.
+firewall:
+  - state: firewall
+    tgt: '*'
+
+# bpf is pillar-only (no state); consumed by both zeek and suricata as macros.
+# Both states run on sensor_roles + so-import per salt/top.sls.
+bpf:
+  - state: zeek
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+  - state: suricata
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+
+# ca is applied universally.
+ca:
+  - state: ca
+    tgt: '*'
+
+# elastalert: eval, standalone, manager, managerhype, managersearch (NOT import).
+elastalert:
+  - state: elastalert
+    tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# elasticsearch: 8 roles.
+elasticsearch:
+  - state: elasticsearch
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-standalone'
+
+# elasticagent: so-heavynode only.
+elasticagent:
+  - state: elasticagent
+    tgt: 'G@role:so-heavynode'
+
+# elasticfleet: base state only on pillar change. elasticfleet.install_agent_grid
+# is a deploy/enrollment step, not a config reload; leave it to the next highstate.
+elasticfleet:
+  - state: elasticfleet
+    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# healthcheck: eval, sensor, standalone only.
+healthcheck:
+  - state: healthcheck
+    tgt: 'G@role:so-eval or G@role:so-sensor or G@role:so-standalone'
+
+# influxdb: manager_roles exactly.
+influxdb:
+  - state: influxdb
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# kafka: standalone, manager, managerhype, managersearch, searchnode, receiver.
+kafka:
+  - state: kafka
+    tgt: 'G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
+
+# kibana: manager_roles exactly.
+kibana:
+  - state: kibana
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# logstash: 8 roles, no eval/import.
+logstash:
+  - state: logstash
+    tgt: 'G@role:so-fleet or G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
+
+# nginx: 10 specific roles. NOT receiver, idh, hypervisor, desktop.
+nginx:
+  - state: nginx
+    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
+
+# redis: 6 roles. standalone, manager, managerhype, managersearch, heavynode, receiver.
+# (NOT eval, NOT import, NOT searchnode.)
+redis:
+  - state: redis
+    tgt: 'G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-standalone'
+
+# soc: manager_roles exactly.
+soc:
+  - state: soc
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# strelka: sensor-side only on pillar change (sensor_roles). strelka.manager is
+# intentionally NOT fired on pillar changes -- YARA rule and strelka config
+# pillar changes are consumed by the sensor-side strelka backend, and re-running
+# strelka.manager on managers is both unnecessary and disruptive. strelka.manager
+# is left to the 2-hour highstate.
+strelka:
+  - state: strelka
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-sensor or G@role:so-standalone'
+
+# suricata: sensor_roles + so-import (5 roles).
+suricata:
+  - state: suricata
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+
+# telegraf: universal.
+telegraf:
+  - state: telegraf
+    tgt: '*'
+
+# zeek: sensor_roles + so-import (5 roles).
+zeek:
+  - state: zeek
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
@@ -0,0 +1,170 @@
+#!py
+
+# Reactor invoked by the inotify beacon on pillar file changes under
+# /opt/so/saltstack/local/pillar/.
+#
+# Two branches:
+#   A) per-minion override under pillar/minions/<id>.sls or adv_<id>.sls
+#      -> write an intent that runs state.highstate on just that minion.
+#   B) shared app pillar (pillar/<app>/...) -> look up <app> in
+#      pillar_push_map.yaml and write an intent with the entry's actions.
+#
+# Reactors never dispatch directly. The so-push-drainer schedule picks up
+# ready intents, dedupes across pending files, and dispatches orch.push_batch.
+# See plan /home/mreeves/.claude/plans/goofy-marinating-hummingbird.md.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+import salt.client
+import yaml
+
+# Module-level logger; every message in this reactor is prefixed 'push_pillar:'.
+LOG = logging.getLogger(__name__)
+
+# Directory holding one '<key>.json' intent file per pending push.
+PENDING_DIR = '/opt/so/state/push_pending'
+# flock() target that serializes the read-modify-write of intent files.
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+# Upper bound on how many recently changed paths an intent remembers.
+MAX_PATHS = 20
+
+# Only events for paths under PILLAR_ROOT are handled; paths under
+# MINIONS_PREFIX are treated as per-minion overrides.
+PILLAR_ROOT = '/opt/so/saltstack/local/pillar/'
+MINIONS_PREFIX = PILLAR_ROOT + 'minions/'
+
+# The pillar_push_map.yaml is shipped via salt:// but the reactor runs on the
+# master, which mounts the default saltstack tree at this path.
+PUSH_MAP_PATH = '/opt/so/saltstack/default/salt/reactor/pillar_push_map.yaml'
+
+# Parsed push map plus the file mtime it was parsed at; _load_push_map()
+# re-reads the file only when the mtime differs.
+_PUSH_MAP_CACHE = {'mtime': 0, 'data': None}
+
+
+def _load_push_map():
+    try:
+        st = os.stat(PUSH_MAP_PATH)
+    except OSError:
+        LOG.warning('push_pillar: %s not found', PUSH_MAP_PATH)
+        return {}
+    if _PUSH_MAP_CACHE['mtime'] != st.st_mtime:
+        try:
+            with open(PUSH_MAP_PATH, 'r') as f:
+                _PUSH_MAP_CACHE['data'] = yaml.safe_load(f) or {}
+        except Exception:
+            LOG.exception('push_pillar: failed to load %s', PUSH_MAP_PATH)
+            _PUSH_MAP_CACHE['data'] = {}
+        _PUSH_MAP_CACHE['mtime'] = st.st_mtime
+    return _PUSH_MAP_CACHE['data'] or {}
+
+
+def _push_enabled():
+    try:
+        caller = salt.client.Caller()
+        return bool(caller.cmd('pillar.get', 'global:push:enabled', True))
+    except Exception:
+        LOG.exception('push_pillar: pillar.get global:push:enabled failed, assuming enabled')
+        return True
+
+
+def _write_intent(key, actions, path):
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_pillar: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths', [])
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_pillar: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def _minion_id_from_path(path):
+    # path is e.g. /opt/so/saltstack/local/pillar/minions/sensor1.sls
+    #          or /opt/so/saltstack/local/pillar/minions/adv_sensor1.sls
+    filename = os.path.basename(path)
+    if not filename.endswith('.sls'):
+        return None
+    stem = filename[:-4]
+    if stem.startswith('adv_'):
+        stem = stem[4:]
+    return stem or None
+
+
+def _app_from_path(path):
+    # path is e.g. /opt/so/saltstack/local/pillar/zeek/soc_zeek.sls -> 'zeek'
+    remainder = path[len(PILLAR_ROOT):]
+    if '/' not in remainder:
+        return None
+    return remainder.split('/', 1)[0] or None
+
+
+def run():
+    if not _push_enabled():
+        LOG.info('push_pillar: push disabled, skipping')
+        return {}
+
+    path = data.get('data', {}).get('path', '')  # noqa: F821 -- data provided by reactor
+    if not path or not path.startswith(PILLAR_ROOT):
+        LOG.debug('push_pillar: ignoring path outside pillar root: %s', path)
+        return {}
+
+    # Branch A: per-minion override
+    if path.startswith(MINIONS_PREFIX):
+        minion_id = _minion_id_from_path(path)
+        if not minion_id:
+            LOG.debug('push_pillar: ignoring non-sls path under minions/: %s', path)
+            return {}
+        actions = [{'highstate': True, 'tgt': minion_id, 'tgt_type': 'glob'}]
+        _write_intent('minion_{}'.format(minion_id), actions, path)
+        LOG.info('push_pillar: per-minion intent updated for %s (path=%s)', minion_id, path)
+        return {}
+
+    # Branch B: shared app pillar -> allowlist lookup
+    app = _app_from_path(path)
+    if not app:
+        LOG.debug('push_pillar: ignoring path with no app segment: %s', path)
+        return {}
+
+    push_map = _load_push_map()
+    entry = push_map.get(app)
+    if not entry:
+        LOG.warning(
+            'push_pillar: pillar dir "%s" is not in pillar_push_map.yaml; '
+            'change will be picked up at the next scheduled highstate (path=%s)',
+            app, path,
+        )
+        return {}
+
+    actions = list(entry)  # copy to avoid mutating the cache
+    _write_intent('pillar_{}'.format(app), actions, path)
+    LOG.info('push_pillar: app intent updated for %s (path=%s)', app, path)
+    return {}
@@ -0,0 +1,96 @@
+#!py
+
+# Reactor invoked by the inotify beacon on rule file changes under
+# /opt/so/saltstack/local/salt/strelka/rules/compiled/.
+#
+# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_strelka.json
+# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
+# across pending files, and dispatches orch.push_batch. Reactors never dispatch
+# directly -- see plan /home/mreeves/.claude/plans/goofy-marinating-hummingbird.md.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+import salt.client
+
+# Module-level logger; every message in this reactor is prefixed 'push_strelka:'.
+LOG = logging.getLogger(__name__)
+
+# Directory holding one '<key>.json' intent file per pending push.
+PENDING_DIR = '/opt/so/state/push_pending'
+# flock() target that serializes the read-modify-write of intent files.
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+# Upper bound on how many recently changed paths an intent remembers.
+MAX_PATHS = 20
+
+# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Sensor-side
+# strelka runs on exactly these four roles; so-import gets strelka.manager
+# instead, which this reactor does not push.
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
+
+
+def _sensor_compound():
+    return ' or '.join('G@role:{}'.format(r) for r in SENSOR_ROLES)
+
+
+def _push_enabled():
+    try:
+        caller = salt.client.Caller()
+        return bool(caller.cmd('pillar.get', 'global:push:enabled', True))
+    except Exception:
+        LOG.exception('push_strelka: pillar.get global:push:enabled failed, assuming enabled')
+        return True
+
+
+def _write_intent(key, actions, path):
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_strelka: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths', [])
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_strelka: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def run():
+    if not _push_enabled():
+        LOG.info('push_strelka: push disabled, skipping')
+        return {}
+
+    path = data.get('data', {}).get('path', '')  # noqa: F821 -- data provided by reactor
+    actions = [{'state': 'strelka', 'tgt': _sensor_compound()}]
+    _write_intent('rules_strelka', actions, path)
+    LOG.info('push_strelka: intent updated for path=%s', path)
+    return {}
@@ -0,0 +1,95 @@
+#!py
+
+# Reactor invoked by the inotify beacon on rule file changes under
+# /opt/so/saltstack/local/salt/suricata/rules/.
+#
+# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_suricata.json
+# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
+# across pending files, and dispatches orch.push_batch. Reactors never dispatch
+# directly -- see plan /home/mreeves/.claude/plans/goofy-marinating-hummingbird.md.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+import salt.client
+
+# Module-level logger; every message in this reactor is prefixed 'push_suricata:'.
+LOG = logging.getLogger(__name__)
+
+# Directory holding one '<key>.json' intent file per pending push.
+PENDING_DIR = '/opt/so/state/push_pending'
+# flock() target that serializes the read-modify-write of intent files.
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+# Upper bound on how many recently changed paths an intent remembers.
+MAX_PATHS = 20
+
+# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Suricata also
+# runs on so-import per salt/top.sls, so that role is appended below.
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
+
+
+def _sensor_compound_plus_import():
+    return ' or '.join('G@role:{}'.format(r) for r in SENSOR_ROLES) + ' or G@role:so-import'
+
+
+def _push_enabled():
+    try:
+        caller = salt.client.Caller()
+        return bool(caller.cmd('pillar.get', 'global:push:enabled', True))
+    except Exception:
+        LOG.exception('push_suricata: pillar.get global:push:enabled failed, assuming enabled')
+        return True
+
+
+def _write_intent(key, actions, path):
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_suricata: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths', [])
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_suricata: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def run():
+    if not _push_enabled():
+        LOG.info('push_suricata: push disabled, skipping')
+        return {}
+
+    path = data.get('data', {}).get('path', '')  # noqa: F821 -- data provided by reactor
+    actions = [{'state': 'suricata', 'tgt': _sensor_compound_plus_import()}]
+    _write_intent('rules_suricata', actions, path)
+    LOG.info('push_suricata: intent updated for path=%s', path)
+    return {}