From a0cf0489d6b1720ecb5b7c9eb0c5e1ab517bfb9e Mon Sep 17 00:00:00 2001 From: Mike Reeves Date: Fri, 10 Apr 2026 15:43:16 -0400 Subject: [PATCH] reduce highstate frequency with active push for rules and pillars - schedule highstate every 2 hours (was 15 minutes); interval lives in global:push:highstate_interval_hours so the SOC admin UI can tune it and so-salt-minion-check derives its threshold as (interval + 1) * 3600 - add inotify beacon on the manager + master reactor + orch.push_batch that writes per-app intent files, with a so-push-drainer schedule on the manager that debounces, dedupes, and dispatches a single orchestration - pillar_push_map.yaml allowlists the apps whose pillar changes trigger an immediate targeted state.apply (targets verified against salt/top.sls); edits under pillar/minions/ trigger a state.highstate on that one minion - host-batch every push orchestration (batch: 25%, batch_wait: 15) so rule changes don't thundering-herd large fleets - new global:push:enabled kill-switch tears down the beacon, reactor config, and drainer schedule on the next highstate for operators who want to keep highstate-only behavior - set restart_policy: unless-stopped on 23 container states so docker recovers crashes without waiting for the next highstate; leave registry (always), strelka/backend (on-failure), kratos, and hydra alone with inline comments explaining why --- .../tools/sbin_jinja/so-salt-minion-check | 5 +- salt/elastalert/enabled.sls | 1 + .../enabled.sls | 1 + salt/elasticagent/enabled.sls | 1 + salt/elasticfleet/enabled.sls | 1 + salt/elasticsearch/enabled.sls | 1 + salt/global/defaults.yaml | 9 +- salt/global/soc_global.yaml | 37 +++ salt/hydra/enabled.sls | 1 + salt/idh/enabled.sls | 1 + salt/influxdb/enabled.sls | 1 + salt/kafka/enabled.sls | 1 + salt/kibana/enabled.sls | 1 + salt/kratos/enabled.sls | 1 + salt/logstash/enabled.sls | 1 + salt/manager/tools/sbin/so-push-drainer | 233 ++++++++++++++++++ salt/nginx/enabled.sls | 1 + salt/orch/push_batch.sls | 37 +++ salt/reactor/pillar_push_map.yaml | 128 ++++++++++ salt/reactor/push_pillar.sls | 170 +++++++++++++ salt/reactor/push_strelka.sls | 96 ++++++++ salt/reactor/push_suricata.sls | 95 +++++++ salt/redis/enabled.sls | 1 + salt/registry/enabled.sls | 3 + salt/salt/beacons.sls | 22 +- salt/salt/files/beacons_pushstate.conf.jinja | 26 ++ salt/salt/files/reactor_pushstate.conf | 7 + salt/salt/master.sls | 17 ++ salt/salt/minion.defaults.yaml | 1 - salt/schedule.sls | 22 +- salt/sensoroni/enabled.sls | 1 + salt/soc/enabled.sls | 1 + salt/strelka/backend/enabled.sls | 4 + salt/strelka/coordinator/enabled.sls | 1 + salt/strelka/filestream/enabled.sls | 1 + salt/strelka/frontend/enabled.sls | 1 + salt/strelka/gatekeeper/enabled.sls | 1 + salt/strelka/manager/enabled.sls | 1 + salt/suricata/enabled.sls | 1 + salt/tcpreplay/init.sls | 1 + salt/telegraf/enabled.sls | 1 + salt/zeek/enabled.sls | 1 + 42 files changed, 927 insertions(+), 10 deletions(-) create mode 100644 salt/manager/tools/sbin/so-push-drainer create mode 100644 salt/orch/push_batch.sls create mode 100644 salt/reactor/pillar_push_map.yaml create mode 100644 salt/reactor/push_pillar.sls create mode 100644 salt/reactor/push_strelka.sls create mode 100644 salt/reactor/push_suricata.sls create mode 100644 salt/salt/files/beacons_pushstate.conf.jinja create mode 100644 salt/salt/files/reactor_pushstate.conf diff --git a/salt/common/tools/sbin_jinja/so-salt-minion-check b/salt/common/tools/sbin_jinja/so-salt-minion-check index 47d3bb7e1..3b2b32afe 100755 --- 
a/salt/common/tools/sbin_jinja/so-salt-minion-check +++ b/salt/common/tools/sbin_jinja/so-salt-minion-check @@ -1,5 +1,3 @@ -{% import_yaml 'salt/minion.defaults.yaml' as SALT_MINION_DEFAULTS -%} - #!/bin/bash # # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one @@ -25,7 +23,8 @@ SYSTEM_START_TIME=$(date -d "$('.format(len(actions))) + try: + result = subprocess.run( + cmd, check=True, capture_output=True, text=True, timeout=60, + ) + except subprocess.CalledProcessError as exc: + log.error('dispatch failed (rc=%s): stdout=%s stderr=%s', + exc.returncode, exc.stdout, exc.stderr) + return False + except subprocess.TimeoutExpired: + log.error('dispatch timed out after 60s') + return False + except Exception: + log.exception('dispatch raised') + return False + log.info('dispatch accepted: %s', (result.stdout or '').strip()) + return True + + +def main(): + log = _make_logger() + + if not os.path.isdir(PENDING_DIR): + # Nothing to do; reactors create the dir on first use. + return 0 + + try: + push = _load_push_cfg() + except Exception: + log.exception('failed to read global:push pillar; aborting drain pass') + return 1 + + if not push.get('enabled', True): + log.debug('push disabled; exiting') + return 0 + + debounce_seconds = int(push.get('debounce_seconds', 30)) + + os.makedirs(PENDING_DIR, exist_ok=True) + lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644) + try: + fcntl.flock(lock_fd, fcntl.LOCK_EX) + + intent_files = [ + p for p in sorted(glob.glob(os.path.join(PENDING_DIR, '*.json'))) + if os.path.basename(p) != '.lock' + ] + if not intent_files: + return 0 + + now = time.time() + ready = [] + skipped = 0 + broken = [] + for path in intent_files: + intent = _read_intent(path, log) + if not isinstance(intent, dict): + broken.append(path) + continue + last_touch = intent.get('last_touch', 0) + if now - last_touch < debounce_seconds: + skipped += 1 + continue + ready.append((path, intent)) + + for path in broken: + try: + os.unlink(path) + except OSError: + pass + + if not ready: + if skipped: + log.debug('no ready intents (%d still in debounce window)', skipped) + return 0 + + combined_actions = [] + oldest_first_touch = now + all_paths = [] + for path, intent in ready: + combined_actions.extend(intent.get('actions', []) or []) + first = intent.get('first_touch', now) + if first < oldest_first_touch: + oldest_first_touch = first + all_paths.extend(intent.get('paths', []) or []) + + deduped = _dedupe_actions(combined_actions) + if not deduped: + log.warning('%d intent(s) had no usable actions; clearing', len(ready)) + for path, _ in ready: + try: + os.unlink(path) + except OSError: + pass + return 0 + + debounce_duration = now - oldest_first_touch + log.info( + 'draining %d intent(s): %d action(s) after dedupe (raw=%d), ' + 'debounce_duration=%.1fs, paths=%s', + len(ready), len(deduped), len(combined_actions), + debounce_duration, all_paths[:20], + ) + + if not _dispatch(deduped, log): + log.warning('dispatch failed; leaving intent files in place for retry') + return 1 + + for path, _ in ready: + try: + os.unlink(path) + except OSError: + log.exception('failed to remove drained intent %s', path) + + return 0 + finally: + try: + fcntl.flock(lock_fd, fcntl.LOCK_UN) + finally: + os.close(lock_fd) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/salt/nginx/enabled.sls b/salt/nginx/enabled.sls index 2e4c9631c..c50ad8f8f 100644 --- a/salt/nginx/enabled.sls +++ b/salt/nginx/enabled.sls @@ -34,6 +34,7 @@ make-rule-dir-nginx: 
so-nginx: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-nginx:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - hostname: so-nginx - networks: - sobridge: diff --git a/salt/orch/push_batch.sls b/salt/orch/push_batch.sls new file mode 100644 index 000000000..9eb435cce --- /dev/null +++ b/salt/orch/push_batch.sls @@ -0,0 +1,37 @@ +{% from 'global/map.jinja' import GLOBALMERGED %} +{% set actions = salt['pillar.get']('actions', []) %} +{% set BATCH = GLOBALMERGED.push.batch %} +{% set BATCH_WAIT = GLOBALMERGED.push.batch_wait %} + +{% for action in actions %} +{% if action.get('highstate') %} +apply_highstate_{{ loop.index }}: + salt.state: + - tgt: '{{ action.tgt }}' + - tgt_type: {{ action.get('tgt_type', 'compound') }} + - highstate: True + - batch: {{ action.get('batch', BATCH) }} + - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }} + - kwarg: + queue: 2 +{% else %} +refresh_pillar_{{ loop.index }}: + salt.function: + - name: saltutil.refresh_pillar + - tgt: '{{ action.tgt }}' + - tgt_type: {{ action.get('tgt_type', 'compound') }} + +apply_{{ action.state | replace('.', '_') }}_{{ loop.index }}: + salt.state: + - tgt: '{{ action.tgt }}' + - tgt_type: {{ action.get('tgt_type', 'compound') }} + - sls: + - {{ action.state }} + - batch: {{ action.get('batch', BATCH) }} + - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }} + - kwarg: + queue: 2 + - require: + - salt: refresh_pillar_{{ loop.index }} +{% endif %} +{% endfor %} diff --git a/salt/reactor/pillar_push_map.yaml b/salt/reactor/pillar_push_map.yaml new file mode 100644 index 000000000..95fb0544e --- /dev/null +++ b/salt/reactor/pillar_push_map.yaml @@ -0,0 +1,128 @@ +# One pillar directory can map to multiple (state, tgt) actions. +# tgt is a raw salt compound expression. tgt_type is always "compound". +# Per-action `batch` / `batch_wait` override the orch defaults (25% / 15s). +# +# Notes: +# - `bpf` is a pillar-only dir (no state of its own) consumed by both +# zeek and suricata via macros, so a bpf pillar change re-applies both. +# - suricata/strelka/zeek/elasticsearch/redis/kafka/logstash etc. have +# their own pillar dirs AND their own state, so they map 1:1 (or 1:2 +# in strelka's case, because of the split init.sls / manager.sls). +# - `data` and `node_data` pillar dirs are intentionally omitted -- +# they're pillar-only data consumed by many states; trying to handle +# them generically would amount to a highstate. +# +# The role sets here were verified line-by-line against salt/top.sls. If +# salt/top.sls changes how an app is targeted, update the corresponding +# compound here. + +# firewall: the one pillar everyone touches. Applied everywhere intentionally +# because every host's iptables needs to know about every other host in the +# grid. Salt's firewall state is idempotent (file.managed + iptables-restore +# onchanges in salt/firewall/init.sls), so hosts whose rendered firewall is +# unchanged do a file comparison and no-op without touching iptables -- actual +# reload happens only on the hosts whose rules actually changed. Fleetwide +# blast radius is intentional and matches the pre-plan behavior via highstate. +# Adding N sensors in a burst coalesces into one dispatch via the drainer. +firewall: + - state: firewall + tgt: '*' + +# bpf is pillar-only (no state); consumed by both zeek and suricata as macros. +# Both states run on sensor_roles + so-import per salt/top.sls. 
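+# For illustration only (the drainer's actual command line may differ): a burst
+# of edits under pillar/bpf/ coalesces into one dispatch roughly equivalent to
+#   salt-run state.orchestrate orch.push_batch pillar='{"actions": [
+#     {"state": "zeek", "tgt": "<compound below>"},
+#     {"state": "suricata", "tgt": "<compound below>"}]}'
+# with batch/batch_wait falling back to the global:push defaults.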
+bpf: + - state: zeek + tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone' + - state: suricata + tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone' + +# ca is applied universally. +ca: + - state: ca + tgt: '*' + +# elastalert: eval, standalone, manager, managerhype, managersearch (NOT import). +elastalert: + - state: elastalert + tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone' + +# elasticsearch: 8 roles. +elasticsearch: + - state: elasticsearch + tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-standalone' + +# elasticagent: so-heavynode only. +elasticagent: + - state: elasticagent + tgt: 'G@role:so-heavynode' + +# elasticfleet: base state only on pillar change. elasticfleet.install_agent_grid +# is a deploy/enrollment step, not a config reload; leave it to the next highstate. +elasticfleet: + - state: elasticfleet + tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone' + +# healthcheck: eval, sensor, standalone only. +healthcheck: + - state: healthcheck + tgt: 'G@role:so-eval or G@role:so-sensor or G@role:so-standalone' + +# influxdb: manager_roles exactly. +influxdb: + - state: influxdb + tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone' + +# kafka: standalone, manager, managerhype, managersearch, searchnode, receiver. +kafka: + - state: kafka + tgt: 'G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone' + +# kibana: manager_roles exactly. +kibana: + - state: kibana + tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone' + +# logstash: 8 roles, no eval/import. +logstash: + - state: logstash + tgt: 'G@role:so-fleet or G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone' + +# nginx: 10 specific roles. NOT receiver, idh, hypervisor, desktop. +nginx: + - state: nginx + tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone' + +# redis: 6 roles. standalone, manager, managerhype, managersearch, heavynode, receiver. +# (NOT eval, NOT import, NOT searchnode.) +redis: + - state: redis + tgt: 'G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-standalone' + +# soc: manager_roles exactly. +soc: + - state: soc + tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone' + +# strelka: sensor-side only on pillar change (sensor_roles). strelka.manager is +# intentionally NOT fired on pillar changes -- YARA rule and strelka config +# pillar changes are consumed by the sensor-side strelka backend, and re-running +# strelka.manager on managers is both unnecessary and disruptive. 
strelka.manager +# is left to the 2-hour highstate. +strelka: + - state: strelka + tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-sensor or G@role:so-standalone' + +# suricata: sensor_roles + so-import (5 roles). +suricata: + - state: suricata + tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone' + +# telegraf: universal. +telegraf: + - state: telegraf + tgt: '*' + +# zeek: sensor_roles + so-import (5 roles). +zeek: + - state: zeek + tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone' diff --git a/salt/reactor/push_pillar.sls b/salt/reactor/push_pillar.sls new file mode 100644 index 000000000..a31fe9be4 --- /dev/null +++ b/salt/reactor/push_pillar.sls @@ -0,0 +1,170 @@ +#!py + +# Reactor invoked by the inotify beacon on pillar file changes under +# /opt/so/saltstack/local/pillar/. +# +# Two branches: +# A) per-minion override under pillar/minions/.sls or adv_.sls +# -> write an intent that runs state.highstate on just that minion. +# B) shared app pillar (pillar//...) -> look up in +# pillar_push_map.yaml and write an intent with the entry's actions. +# +# Reactors never dispatch directly. The so-push-drainer schedule picks up +# ready intents, dedupes across pending files, and dispatches orch.push_batch. +# See plan /home/mreeves/.claude/plans/goofy-marinating-hummingbird.md. + +import fcntl +import json +import logging +import os +import time + +import salt.client +import yaml + +LOG = logging.getLogger(__name__) + +PENDING_DIR = '/opt/so/state/push_pending' +LOCK_FILE = os.path.join(PENDING_DIR, '.lock') +MAX_PATHS = 20 + +PILLAR_ROOT = '/opt/so/saltstack/local/pillar/' +MINIONS_PREFIX = PILLAR_ROOT + 'minions/' + +# The pillar_push_map.yaml is shipped via salt:// but the reactor runs on the +# master, which mounts the default saltstack tree at this path. 
+PUSH_MAP_PATH = '/opt/so/saltstack/default/salt/reactor/pillar_push_map.yaml' + +_PUSH_MAP_CACHE = {'mtime': 0, 'data': None} + + +def _load_push_map(): + try: + st = os.stat(PUSH_MAP_PATH) + except OSError: + LOG.warning('push_pillar: %s not found', PUSH_MAP_PATH) + return {} + if _PUSH_MAP_CACHE['mtime'] != st.st_mtime: + try: + with open(PUSH_MAP_PATH, 'r') as f: + _PUSH_MAP_CACHE['data'] = yaml.safe_load(f) or {} + except Exception: + LOG.exception('push_pillar: failed to load %s', PUSH_MAP_PATH) + _PUSH_MAP_CACHE['data'] = {} + _PUSH_MAP_CACHE['mtime'] = st.st_mtime + return _PUSH_MAP_CACHE['data'] or {} + + +def _push_enabled(): + try: + caller = salt.client.Caller() + return bool(caller.cmd('pillar.get', 'global:push:enabled', True)) + except Exception: + LOG.exception('push_pillar: pillar.get global:push:enabled failed, assuming enabled') + return True + + +def _write_intent(key, actions, path): + now = time.time() + try: + os.makedirs(PENDING_DIR, exist_ok=True) + except OSError: + LOG.exception('push_pillar: cannot create %s', PENDING_DIR) + return + + intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key)) + lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644) + try: + fcntl.flock(lock_fd, fcntl.LOCK_EX) + + intent = {} + if os.path.exists(intent_path): + try: + with open(intent_path, 'r') as f: + intent = json.load(f) + except (IOError, ValueError): + intent = {} + + intent.setdefault('first_touch', now) + intent['last_touch'] = now + intent['actions'] = actions + paths = intent.get('paths', []) + if path and path not in paths: + paths.append(path) + paths = paths[-MAX_PATHS:] + intent['paths'] = paths + + tmp_path = intent_path + '.tmp' + with open(tmp_path, 'w') as f: + json.dump(intent, f) + os.rename(tmp_path, intent_path) + except Exception: + LOG.exception('push_pillar: failed to write intent %s', intent_path) + finally: + try: + fcntl.flock(lock_fd, fcntl.LOCK_UN) + finally: + os.close(lock_fd) + + +def _minion_id_from_path(path): + # path is e.g. /opt/so/saltstack/local/pillar/minions/sensor1.sls + # or /opt/so/saltstack/local/pillar/minions/adv_sensor1.sls + filename = os.path.basename(path) + if not filename.endswith('.sls'): + return None + stem = filename[:-4] + if stem.startswith('adv_'): + stem = stem[4:] + return stem or None + + +def _app_from_path(path): + # path is e.g. 
/opt/so/saltstack/local/pillar/zeek/soc_zeek.sls -> 'zeek' + remainder = path[len(PILLAR_ROOT):] + if '/' not in remainder: + return None + return remainder.split('/', 1)[0] or None + + +def run(): + if not _push_enabled(): + LOG.info('push_pillar: push disabled, skipping') + return {} + + path = data.get('data', {}).get('path', '') # noqa: F821 -- data provided by reactor + if not path or not path.startswith(PILLAR_ROOT): + LOG.debug('push_pillar: ignoring path outside pillar root: %s', path) + return {} + + # Branch A: per-minion override + if path.startswith(MINIONS_PREFIX): + minion_id = _minion_id_from_path(path) + if not minion_id: + LOG.debug('push_pillar: ignoring non-sls path under minions/: %s', path) + return {} + actions = [{'highstate': True, 'tgt': minion_id, 'tgt_type': 'glob'}] + _write_intent('minion_{}'.format(minion_id), actions, path) + LOG.info('push_pillar: per-minion intent updated for %s (path=%s)', minion_id, path) + return {} + + # Branch B: shared app pillar -> allowlist lookup + app = _app_from_path(path) + if not app: + LOG.debug('push_pillar: ignoring path with no app segment: %s', path) + return {} + + push_map = _load_push_map() + entry = push_map.get(app) + if not entry: + LOG.warning( + 'push_pillar: pillar dir "%s" is not in pillar_push_map.yaml; ' + 'change will be picked up at the next scheduled highstate (path=%s)', + app, path, + ) + return {} + + actions = list(entry) # copy to avoid mutating the cache + _write_intent('pillar_{}'.format(app), actions, path) + LOG.info('push_pillar: app intent updated for %s (path=%s)', app, path) + return {} diff --git a/salt/reactor/push_strelka.sls b/salt/reactor/push_strelka.sls new file mode 100644 index 000000000..b3ed30ed7 --- /dev/null +++ b/salt/reactor/push_strelka.sls @@ -0,0 +1,96 @@ +#!py + +# Reactor invoked by the inotify beacon on rule file changes under +# /opt/so/saltstack/local/salt/strelka/rules/compiled/. +# +# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_strelka.json +# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes +# across pending files, and dispatches orch.push_batch. Reactors never dispatch +# directly -- see plan /home/mreeves/.claude/plans/goofy-marinating-hummingbird.md. + +import fcntl +import json +import logging +import os +import time + +import salt.client + +LOG = logging.getLogger(__name__) + +PENDING_DIR = '/opt/so/state/push_pending' +LOCK_FILE = os.path.join(PENDING_DIR, '.lock') +MAX_PATHS = 20 + +# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Sensor-side +# strelka runs on exactly these four roles; so-import gets strelka.manager +# instead, which is not fired on pillar changes. 
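+# For reference, _sensor_compound() below joins this list into the compound
+# target 'G@role:so-eval or G@role:so-heavynode or G@role:so-sensor or
+# G@role:so-standalone' -- the same four roles the strelka entry in
+# pillar_push_map.yaml targets.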
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone'] + + +def _sensor_compound(): + return ' or '.join('G@role:{}'.format(r) for r in SENSOR_ROLES) + + +def _push_enabled(): + try: + caller = salt.client.Caller() + return bool(caller.cmd('pillar.get', 'global:push:enabled', True)) + except Exception: + LOG.exception('push_strelka: pillar.get global:push:enabled failed, assuming enabled') + return True + + +def _write_intent(key, actions, path): + now = time.time() + try: + os.makedirs(PENDING_DIR, exist_ok=True) + except OSError: + LOG.exception('push_strelka: cannot create %s', PENDING_DIR) + return + + intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key)) + lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644) + try: + fcntl.flock(lock_fd, fcntl.LOCK_EX) + + intent = {} + if os.path.exists(intent_path): + try: + with open(intent_path, 'r') as f: + intent = json.load(f) + except (IOError, ValueError): + intent = {} + + intent.setdefault('first_touch', now) + intent['last_touch'] = now + intent['actions'] = actions + paths = intent.get('paths', []) + if path and path not in paths: + paths.append(path) + paths = paths[-MAX_PATHS:] + intent['paths'] = paths + + tmp_path = intent_path + '.tmp' + with open(tmp_path, 'w') as f: + json.dump(intent, f) + os.rename(tmp_path, intent_path) + except Exception: + LOG.exception('push_strelka: failed to write intent %s', intent_path) + finally: + try: + fcntl.flock(lock_fd, fcntl.LOCK_UN) + finally: + os.close(lock_fd) + + +def run(): + if not _push_enabled(): + LOG.info('push_strelka: push disabled, skipping') + return {} + + path = data.get('data', {}).get('path', '') # noqa: F821 -- data provided by reactor + actions = [{'state': 'strelka', 'tgt': _sensor_compound()}] + _write_intent('rules_strelka', actions, path) + LOG.info('push_strelka: intent updated for path=%s', path) + return {} diff --git a/salt/reactor/push_suricata.sls b/salt/reactor/push_suricata.sls new file mode 100644 index 000000000..c9c6eee92 --- /dev/null +++ b/salt/reactor/push_suricata.sls @@ -0,0 +1,95 @@ +#!py + +# Reactor invoked by the inotify beacon on rule file changes under +# /opt/so/saltstack/local/salt/suricata/rules/. +# +# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_suricata.json +# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes +# across pending files, and dispatches orch.push_batch. Reactors never dispatch +# directly -- see plan /home/mreeves/.claude/plans/goofy-marinating-hummingbird.md. + +import fcntl +import json +import logging +import os +import time + +import salt.client + +LOG = logging.getLogger(__name__) + +PENDING_DIR = '/opt/so/state/push_pending' +LOCK_FILE = os.path.join(PENDING_DIR, '.lock') +MAX_PATHS = 20 + +# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Suricata also +# runs on so-import per salt/top.sls, so that role is appended below. 
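+# For reference, _sensor_compound_plus_import() below renders as
+# 'G@role:so-eval or G@role:so-heavynode or G@role:so-sensor or
+# G@role:so-standalone or G@role:so-import', matching the five-role suricata
+# entry in pillar_push_map.yaml.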
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone'] + + +def _sensor_compound_plus_import(): + return ' or '.join('G@role:{}'.format(r) for r in SENSOR_ROLES) + ' or G@role:so-import' + + +def _push_enabled(): + try: + caller = salt.client.Caller() + return bool(caller.cmd('pillar.get', 'global:push:enabled', True)) + except Exception: + LOG.exception('push_suricata: pillar.get global:push:enabled failed, assuming enabled') + return True + + +def _write_intent(key, actions, path): + now = time.time() + try: + os.makedirs(PENDING_DIR, exist_ok=True) + except OSError: + LOG.exception('push_suricata: cannot create %s', PENDING_DIR) + return + + intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key)) + lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644) + try: + fcntl.flock(lock_fd, fcntl.LOCK_EX) + + intent = {} + if os.path.exists(intent_path): + try: + with open(intent_path, 'r') as f: + intent = json.load(f) + except (IOError, ValueError): + intent = {} + + intent.setdefault('first_touch', now) + intent['last_touch'] = now + intent['actions'] = actions + paths = intent.get('paths', []) + if path and path not in paths: + paths.append(path) + paths = paths[-MAX_PATHS:] + intent['paths'] = paths + + tmp_path = intent_path + '.tmp' + with open(tmp_path, 'w') as f: + json.dump(intent, f) + os.rename(tmp_path, intent_path) + except Exception: + LOG.exception('push_suricata: failed to write intent %s', intent_path) + finally: + try: + fcntl.flock(lock_fd, fcntl.LOCK_UN) + finally: + os.close(lock_fd) + + +def run(): + if not _push_enabled(): + LOG.info('push_suricata: push disabled, skipping') + return {} + + path = data.get('data', {}).get('path', '') # noqa: F821 -- data provided by reactor + actions = [{'state': 'suricata', 'tgt': _sensor_compound_plus_import()}] + _write_intent('rules_suricata', actions, path) + LOG.info('push_suricata: intent updated for path=%s', path) + return {} diff --git a/salt/redis/enabled.sls b/salt/redis/enabled.sls index 4cea8d028..1c9d1afa0 100644 --- a/salt/redis/enabled.sls +++ b/salt/redis/enabled.sls @@ -17,6 +17,7 @@ include: so-redis: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - hostname: so-redis - user: socore - networks: diff --git a/salt/registry/enabled.sls b/salt/registry/enabled.sls index fc5021910..ab09d1f99 100644 --- a/salt/registry/enabled.sls +++ b/salt/registry/enabled.sls @@ -21,6 +21,9 @@ so-dockerregistry: - networks: - sobridge: - ipv4_address: {{ DOCKERMERGED.containers['so-dockerregistry'].ip }} + # Intentionally `always` (not unless-stopped) -- registry is critical infra + # and must come back up even if it was manually stopped. Do not homogenize + # to unless-stopped; see the container auto-restart section of the plan. 
- restart_policy: always - port_bindings: {% for BINDING in DOCKERMERGED.containers['so-dockerregistry'].port_bindings %} diff --git a/salt/salt/beacons.sls b/salt/salt/beacons.sls index df6198d01..f3e519edd 100644 --- a/salt/salt/beacons.sls +++ b/salt/salt/beacons.sls @@ -1,3 +1,5 @@ +{% from 'vars/globals.map.jinja' import GLOBALS %} +{% from 'global/map.jinja' import GLOBALMERGED %} {% set CHECKS = salt['pillar.get']('healthcheck:checks', {}) %} {% set ENABLED = salt['pillar.get']('healthcheck:enabled', False) %} {% set SCHEDULE = salt['pillar.get']('healthcheck:schedule', 30) %} @@ -14,12 +16,28 @@ salt_beacons: - defaults: CHECKS: {{ CHECKS }} SCHEDULE: {{ SCHEDULE }} - - watch_in: + - watch_in: - service: salt_minion_service {% else %} salt_beacons: file.absent: - name: /etc/salt/minion.d/beacons.conf - - watch_in: + - watch_in: + - service: salt_minion_service +{% endif %} + +{% if GLOBALS.is_manager and GLOBALMERGED.push.enabled %} +salt_beacons_pushstate: + file.managed: + - name: /etc/salt/minion.d/beacons_pushstate.conf + - source: salt://salt/files/beacons_pushstate.conf.jinja + - template: jinja + - watch_in: + - service: salt_minion_service +{% else %} +salt_beacons_pushstate: + file.absent: + - name: /etc/salt/minion.d/beacons_pushstate.conf + - watch_in: - service: salt_minion_service {% endif %} diff --git a/salt/salt/files/beacons_pushstate.conf.jinja b/salt/salt/files/beacons_pushstate.conf.jinja new file mode 100644 index 000000000..8d3f05864 --- /dev/null +++ b/salt/salt/files/beacons_pushstate.conf.jinja @@ -0,0 +1,26 @@ +beacons: + inotify: + - disable_during_state_run: True + - coalesce: True + - files: + /opt/so/saltstack/local/salt/suricata/rules/: + mask: + - close_write + - moved_to + - delete + recurse: True + auto_add: True + /opt/so/saltstack/local/salt/strelka/rules/compiled/: + mask: + - close_write + - moved_to + - delete + recurse: True + auto_add: True + /opt/so/saltstack/local/pillar/: + mask: + - close_write + - moved_to + - delete + recurse: True + auto_add: True diff --git a/salt/salt/files/reactor_pushstate.conf b/salt/salt/files/reactor_pushstate.conf new file mode 100644 index 000000000..7d3a5a0d7 --- /dev/null +++ b/salt/salt/files/reactor_pushstate.conf @@ -0,0 +1,7 @@ +reactor: + - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/suricata/rules/': + - salt://reactor/push_suricata.sls + - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/strelka/rules/compiled/': + - salt://reactor/push_strelka.sls + - 'salt/beacon/*/inotify//opt/so/saltstack/local/pillar/': + - salt://reactor/push_pillar.sls diff --git a/salt/salt/master.sls b/salt/salt/master.sls index 895150cd7..23af779dd 100644 --- a/salt/salt/master.sls +++ b/salt/salt/master.sls @@ -10,6 +10,7 @@ # software that is protected by the license key." 
{% from 'allowed_states.map.jinja' import allowed_states %} +{% from 'global/map.jinja' import GLOBALMERGED %} {% if sls in allowed_states %} include: @@ -62,6 +63,22 @@ engines_config: - name: /etc/salt/master.d/engines.conf - source: salt://salt/files/engines.conf +{% if GLOBALMERGED.push.enabled %} +reactor_pushstate_config: + file.managed: + - name: /etc/salt/master.d/reactor_pushstate.conf + - source: salt://salt/files/reactor_pushstate.conf + - watch_in: + - service: salt_master_service + - order: last +{% else %} +reactor_pushstate_config: + file.absent: + - name: /etc/salt/master.d/reactor_pushstate.conf + - watch_in: + - service: salt_master_service +{% endif %} + # update the bootstrap script when used for salt-cloud salt_bootstrap_cloud: file.managed: diff --git a/salt/salt/minion.defaults.yaml b/salt/salt/minion.defaults.yaml index 11f3dab41..b258e1c1e 100644 --- a/salt/salt/minion.defaults.yaml +++ b/salt/salt/minion.defaults.yaml @@ -2,4 +2,3 @@ salt: minion: version: '3006.19' - check_threshold: 3600 # in seconds, threshold used for so-salt-minion-check. any value less than 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default diff --git a/salt/schedule.sls b/salt/schedule.sls index c3b5d85ae..014d24460 100644 --- a/salt/schedule.sls +++ b/salt/schedule.sls @@ -1,10 +1,26 @@ -{% from 'vars/globals.map.jinja' import GLOBALS %} +{% from 'vars/globals.map.jinja' import GLOBALS %} +{% from 'global/map.jinja' import GLOBALMERGED %} highstate_schedule: schedule.present: - function: state.highstate - - minutes: 15 + - hours: {{ GLOBALMERGED.push.highstate_interval_hours }} - maxrunning: 1 {% if not GLOBALS.is_manager %} - - splay: 120 + - splay: 1800 +{% endif %} + +{% if GLOBALS.is_manager and GLOBALMERGED.push.enabled %} +push_drain_schedule: + schedule.present: + - function: cmd.run + - job_args: + - /usr/sbin/so-push-drainer + - seconds: {{ GLOBALMERGED.push.drain_interval }} + - maxrunning: 1 + - return_job: False +{% elif GLOBALS.is_manager %} +push_drain_schedule: + schedule.absent: + - name: push_drain_schedule {% endif %} diff --git a/salt/sensoroni/enabled.sls b/salt/sensoroni/enabled.sls index 7790574f6..db4b91dd0 100644 --- a/salt/sensoroni/enabled.sls +++ b/salt/sensoroni/enabled.sls @@ -14,6 +14,7 @@ include: so-sensoroni: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-soc:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - network_mode: host - binds: - /nsm/import:/nsm/import:rw diff --git a/salt/soc/enabled.sls b/salt/soc/enabled.sls index 1805bacaf..1c736eddd 100644 --- a/salt/soc/enabled.sls +++ b/salt/soc/enabled.sls @@ -18,6 +18,7 @@ include: so-soc: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-soc:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - hostname: soc - name: so-soc - networks: diff --git a/salt/strelka/backend/enabled.sls b/salt/strelka/backend/enabled.sls index ca3f0e6dc..8c71bdf68 100644 --- a/salt/strelka/backend/enabled.sls +++ b/salt/strelka/backend/enabled.sls @@ -47,6 +47,10 @@ strelka_backend: - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }} {% endfor %} {% endif %} + # Intentionally `on-failure` (not unless-stopped) -- strelka backend shuts + # down cleanly during rule reloads and we do not want those clean exits to + # trigger an auto-restart. Do not homogenize; see the container + # auto-restart section of the plan. 
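+    # (docker's on-failure policy only restarts a container that exits
+    # non-zero, so those clean shutdowns are left alone.)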
- restart_policy: on-failure - watch: - file: strelkasensorcompiledrules diff --git a/salt/strelka/coordinator/enabled.sls b/salt/strelka/coordinator/enabled.sls index 6756a324c..8393d7e68 100644 --- a/salt/strelka/coordinator/enabled.sls +++ b/salt/strelka/coordinator/enabled.sls @@ -15,6 +15,7 @@ include: strelka_coordinator: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - name: so-strelka-coordinator - networks: - sobridge: diff --git a/salt/strelka/filestream/enabled.sls b/salt/strelka/filestream/enabled.sls index b03faf4b1..c01171565 100644 --- a/salt/strelka/filestream/enabled.sls +++ b/salt/strelka/filestream/enabled.sls @@ -15,6 +15,7 @@ include: strelka_filestream: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - binds: - /opt/so/conf/strelka/filestream/:/etc/strelka/:ro - /nsm/strelka:/nsm/strelka diff --git a/salt/strelka/frontend/enabled.sls b/salt/strelka/frontend/enabled.sls index 58e703898..f2d0eecd1 100644 --- a/salt/strelka/frontend/enabled.sls +++ b/salt/strelka/frontend/enabled.sls @@ -15,6 +15,7 @@ include: strelka_frontend: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - binds: - /opt/so/conf/strelka/frontend/:/etc/strelka/:ro - /nsm/strelka/log/:/var/log/strelka/:rw diff --git a/salt/strelka/gatekeeper/enabled.sls b/salt/strelka/gatekeeper/enabled.sls index 45b6e467e..19d74a6d8 100644 --- a/salt/strelka/gatekeeper/enabled.sls +++ b/salt/strelka/gatekeeper/enabled.sls @@ -15,6 +15,7 @@ include: strelka_gatekeeper: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - name: so-strelka-gatekeeper - networks: - sobridge: diff --git a/salt/strelka/manager/enabled.sls b/salt/strelka/manager/enabled.sls index 7c73452d8..272767928 100644 --- a/salt/strelka/manager/enabled.sls +++ b/salt/strelka/manager/enabled.sls @@ -15,6 +15,7 @@ include: strelka_manager: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - binds: - /opt/so/conf/strelka/manager/:/etc/strelka/:ro {% if DOCKERMERGED.containers['so-strelka-manager'].custom_bind_mounts %} diff --git a/salt/suricata/enabled.sls b/salt/suricata/enabled.sls index d9d7f32ae..10c04e5b9 100644 --- a/salt/suricata/enabled.sls +++ b/salt/suricata/enabled.sls @@ -18,6 +18,7 @@ so-suricata: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-suricata:{{ GLOBALS.so_version }} - privileged: True + - restart_policy: unless-stopped - environment: - INTERFACE={{ GLOBALS.sensor.interface }} {% if DOCKERMERGED.containers['so-suricata'].extra_env %} diff --git a/salt/tcpreplay/init.sls b/salt/tcpreplay/init.sls index 7d739d00c..f5b1b05d9 100644 --- a/salt/tcpreplay/init.sls +++ b/salt/tcpreplay/init.sls @@ -7,6 +7,7 @@ so-tcpreplay: docker_container.running: - network_mode: "host" - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-tcpreplay:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - name: so-tcpreplay - user: root - interactive: True diff --git a/salt/telegraf/enabled.sls 
b/salt/telegraf/enabled.sls index fc9946149..6a063e08b 100644 --- a/salt/telegraf/enabled.sls +++ b/salt/telegraf/enabled.sls @@ -18,6 +18,7 @@ include: so-telegraf: docker_container.running: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-telegraf:{{ GLOBALS.so_version }} + - restart_policy: unless-stopped - user: 939 - group_add: 939,920 - environment: diff --git a/salt/zeek/enabled.sls b/salt/zeek/enabled.sls index ee78714c8..355e555b3 100644 --- a/salt/zeek/enabled.sls +++ b/salt/zeek/enabled.sls @@ -18,6 +18,7 @@ so-zeek: - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-zeek:{{ GLOBALS.so_version }} - start: True - privileged: True + - restart_policy: unless-stopped {% if DOCKERMERGED.containers['so-zeek'].ulimits %} - ulimits: {% for ULIMIT in DOCKERMERGED.containers['so-zeek'].ulimits %}