allow full highstate on manager while master locked

add some logging
move so-salt-minion-wait
2026-06-09 20:06:09 +02:00 · 2026-06-02 13:58:38 -04:00 · 2026-06-02 10:44:17 -04:00 · 2026-06-01 14:48:54 -04:00 · 2026-05-29 14:55:13 -04:00 · 2026-05-28 14:01:42 -04:00
71 changed files with 1441 additions and 117 deletions
@@ -11,6 +11,7 @@ body:
        -
        - 3.0.0
        - 3.1.0
+        - 3.2.0
        - Other (please provide detail below)
    validations:
      required: true
@@ -1 +1 @@
-20260528
+
@@ -1 +1 @@
-3.1.0
+3.2.0
@@ -0,0 +1,142 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Custom salt beacon that watches the SOC audit_settings table in postgres for
+# new settings changes and emits a beacon event per new row. This replaces the
+# inotify watch on /opt/so/saltstack/local/pillar -- instead of monitoring pillar
+# files on disk, we monitor the so_soc.audit_settings table that SOC writes to.
+#
+# Detection is poll-based with a monotonic `id` watermark persisted to
+# WATERMARK_FILE: each pass selects rows with id greater than the last id seen,
+# which makes it self-healing (a missed poll simply catches up on the next one).
+#
+# Each emitted event carries setting_id and node_id; the push_pillar reactor maps
+# setting_id -> app via pillar_push_map.yaml and writes a push intent, after which
+# the existing so-push-drainer / orch.push_batch pipeline takes over unchanged.
+
+import logging
+import os
+import subprocess
+
+log = logging.getLogger(__name__)
+
+WATERMARK_FILE = '/opt/so/state/pillar_db_watch.id'
+CONTAINER = 'so-postgres'
+DATABASE = 'so_soc'
+
+# Unaligned, tuples-only psql output with a field separator that cannot appear in
+# an id/setting_id/node_id, so we can split each row reliably.
+FIELD_SEP = '\x1f'
+
+
+def __virtual__():
+    # Unconditionally returns True; no platform or dependency checks are made.
+    return True
+
+
+def validate(config):
+    # Accepts any beacon configuration: `config` is not inspected and the
+    # result is always the tuple (True, 'valid').
+    return True, 'valid'
+
+
+def _read_watermark():
+    # Returns the last processed id, or None if the watermark has not been seeded.
+    # A file that cannot be opened or read (IOError) and empty or non-numeric
+    # contents (ValueError from int()) are both reported as None, so the caller
+    # treats every one of those cases as "not seeded yet".
+    try:
+        with open(WATERMARK_FILE, 'r') as f:
+            return int((f.read() or '').strip())
+    except (IOError, ValueError):
+        return None
+
+
+def _write_watermark(value):
+    # Persist `value` (coerced with int()) as the new watermark, creating the
+    # parent directory if needed. The number is written to WATERMARK_FILE + '.tmp'
+    # and that file is then renamed over WATERMARK_FILE, so the watermark file
+    # itself is never opened for writing.
+    # An OSError is logged with a traceback and swallowed; nothing is returned,
+    # so callers cannot tell whether the watermark was actually persisted.
+    try:
+        os.makedirs(os.path.dirname(WATERMARK_FILE), exist_ok=True)
+        tmp = WATERMARK_FILE + '.tmp'
+        with open(tmp, 'w') as f:
+            f.write(str(int(value)))
+        os.rename(tmp, WATERMARK_FILE)
+    except OSError:
+        log.exception('pillar_db beacon: failed to persist watermark to %s', WATERMARK_FILE)
+
+
+def _query(sql):
+    # Run a query against so_soc inside the so-postgres container over the unix
+    # socket (trust auth, no password). Returns stdout on success, or None on any
+    # failure so the caller can no-op and retry on the next interval.
+    #
+    # `sql` is handed to psql verbatim as the -c argument; callers are
+    # responsible for building it safely.
+    # psql flags: -t tuples only, -A unaligned output, -F field separator.
+    cmd = [
+        'docker', 'exec', CONTAINER,
+        'psql', '-U', 'postgres', '-d', DATABASE,
+        '-tA', '-F', FIELD_SEP, '-c', sql,
+    ]
+    try:
+        # The argument list is executed directly (no shell); 30s hard timeout.
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+    except subprocess.TimeoutExpired:
+        log.warning('pillar_db beacon: psql timed out')
+        return None
+    except Exception:
+        # Anything else raised while launching the command (e.g. docker missing).
+        log.exception('pillar_db beacon: failed to exec psql')
+        return None
+    if result.returncode != 0:
+        # Non-zero exit from docker/psql: log stderr and report failure.
+        log.warning('pillar_db beacon: psql failed (rc=%s): %s',
+                    result.returncode, (result.stderr or '').strip())
+        return None
+    return result.stdout
+
+
+def beacon(config):
+    # Poll audit_settings for rows newer than the persisted watermark and return
+    # one event dict per row with keys tag, id, setting_id and node_id.
+    # `config` is not used. An empty list is returned when there are no new rows,
+    # when the watermark was just seeded on this pass, or when the query failed.
+    retval = []
+
+    watermark = _read_watermark()
+
+    # First run / missing watermark: seed to the current MAX(id) and emit nothing
+    # so we never replay the entire settings history into a fleetwide push.
+    if watermark is None:
+        seed = _query('SELECT COALESCE(MAX(id), 0) FROM audit_settings;')
+        if seed is None:
+            return retval  # postgres not ready yet; retry next interval
+        try:
+            # Empty output is treated as 0; anything non-numeric is logged and
+            # the watermark stays unseeded so the next pass tries again.
+            _write_watermark(int((seed or '0').strip() or 0))
+        except ValueError:
+            log.warning('pillar_db beacon: could not parse MAX(id) seed: %r', seed)
+        return retval
+
+    # `watermark` is an int here (from _read_watermark), so %d formatting cannot
+    # inject arbitrary text into the statement.
+    rows = _query(
+        "SELECT id, setting_id, COALESCE(node_id, '') FROM audit_settings "
+        "WHERE id > %d ORDER BY id;" % watermark
+    )
+    if rows is None:
+        return retval
+
+    max_id = watermark
+    for line in rows.splitlines():
+        # strip() is used ONLY for the blank-line test below; the line itself is
+        # split unstripped. Python treats the \x1f field separator (and
+        # \x1c-\x1e) as whitespace, so stripping the line before splitting would
+        # eat an empty trailing node_id field and make the row look malformed.
+        if not line.strip():
+            continue
+        parts = line.split(FIELD_SEP)
+        if len(parts) < 3:
+            log.warning('pillar_db beacon: skipping malformed row: %r', line)
+            continue
+        try:
+            row_id = int(parts[0])
+        except ValueError:
+            log.warning('pillar_db beacon: skipping row with non-int id: %r', line)
+            continue
+        setting_id = parts[1]
+        node_id = parts[2]
+        # NOTE(review): Salt appears to fill in the event's `id` with the minion
+        # id only when the beacon has not set that key itself -- confirm that
+        # consumers of this event expect the audit row id in `id`.
+        retval.append({
+            'tag': 'audit_settings',
+            'id': row_id,
+            'setting_id': setting_id,
+            'node_id': node_id,
+        })
+        if row_id > max_id:
+            max_id = row_id
+
+    # Advance the watermark only when at least one parsable row was seen.
+    # _write_watermark logs (but does not raise) on failure, in which case the
+    # same rows would be selected again on the next pass.
+    if max_id > watermark:
+        _write_watermark(max_id)
+        log.info('pillar_db beacon: emitted %d change(s), watermark %d -> %d',
+                 len(retval), watermark, max_id)
+
+    return retval
@@ -26,33 +26,14 @@ commonpkgs:
      - net-tools
      - nmap-ncat
      - procps-ng
-{# OL10 test path: python3-docker / python3-m2crypto are not packaged in EPEL 10 and are not
-   referenced by SO code (salt uses its bundled docker module from salt/python_modules.sls).
-   python3-rich is also unavailable on EL10 (its pygments dep is not packaged), so it is
-   installed via pip below. Gate on the grain because GLOBALS/pillars are not available this
-   early (see header note). #}
-{% if grains['osmajorrelease']|int < 10 %}
      - python3-docker
      - python3-m2crypto
-      - python3-rich
-{% else %}
-      - python3-pip
-{% endif %}
      - python3-packaging
      - python3-pyyaml
+      - python3-rich
      - rsync
      - sqlite
      - tcpdump
      - unzip
      - wget
      - yum-utils
-
-{% if grains['osmajorrelease']|int >= 10 %}
-# OL10 test path: rich is not packaged for EL10; install it into the system python3 for so-status.
-commonpkgs_pip_rich:
-  cmd.run:
-    - name: python3 -m pip install rich
-    - unless: python3 -c "import rich"
-    - require:
-      - pkg: commonpkgs
-{% endif %}
@@ -354,12 +354,7 @@ gpg_rpm_import() {
 	else
 		local RPMKEYSLOC="$UPDATE_DIR/salt/repo/client/files/$OS/keys"
 	fi
-	if [[ "$OSVER" == "10" ]]; then
-		# OL10 test path uses public repos; the public oracle-epel-release and docker repos provide their own keys
-		RPMKEYS=('RPM-GPG-KEY-oracle' 'SALT-PROJECT-GPG-PUBKEY-2023.pub')
-	else
-		RPMKEYS=('RPM-GPG-KEY-oracle' 'RPM-GPG-KEY-EPEL-9' 'SALT-PROJECT-GPG-PUBKEY-2023.pub' 'docker.pub' 'securityonion.pub')
-	fi
+	RPMKEYS=('RPM-GPG-KEY-oracle' 'RPM-GPG-KEY-EPEL-9' 'SALT-PROJECT-GPG-PUBKEY-2023.pub' 'docker.pub' 'securityonion.pub')
 	for RPMKEY in "${RPMKEYS[@]}"; do
 		rpm --import $RPMKEYSLOC/$RPMKEY
 		echo "Imported $RPMKEY"
@@ -631,9 +626,9 @@ salt_minion_count() {
 }

 set_os() {
-	if [ -f /etc/oracle-release ] && grep -qE "release (9|10)\b" /etc/oracle-release; then
+	if [ -f /etc/redhat-release ] && grep -q "Red Hat Enterprise Linux release 9" /etc/redhat-release && [ -f /etc/oracle-release ]; then
 		OS=oracle
-		OSVER=$(grep -oE "release [0-9]+" /etc/oracle-release | grep -oE "[0-9]+")
+		OSVER=9
 		is_oracle=true
 		is_rpm=true
 	fi
@@ -112,23 +112,8 @@ update_docker_containers() {
  # does not include so-elastic-fleet since that container uses so-elastic-agent image
  local IMAGES_USING_ES_VERSION=("so-elasticsearch")

-  rm -rf $SIGNPATH >> "$LOG_FILE" 2>&1
-  mkdir -p $SIGNPATH >> "$LOG_FILE" 2>&1
-
-  # OL10 test path: GnuPG 2.4 enables the keybox daemon (keyboxd) by default, which deadlocks
-  # under the rapid sequential gpg --verify calls below ("waiting for lock ... keydb_search
-  # failed: Connection timed out ... No public key"). Editing the default homedir's common.conf
-  # is unreliable (gpg re-adds use-keyboxd when it re-initializes the homedir), so run all the
-  # image-signature gpg ops in a dedicated homedir whose pre-written common.conf leaves keyboxd
-  # off, forcing the classic keybox. Isolated from the system keyring and deterministic.
-  if [ "$OSVER" = "10" ]; then
-    export GNUPGHOME="$SIGNPATH/gnupg"
-    rm -rf "$GNUPGHOME" >> "$LOG_FILE" 2>&1
-    mkdir -p "$GNUPGHOME" >> "$LOG_FILE" 2>&1
-    chmod 700 "$GNUPGHOME"
-    echo "# keyboxd disabled for SO image signature verification on EL10" > "$GNUPGHOME/common.conf"
-    gpgconf --kill keyboxd gpg-agent >> "$LOG_FILE" 2>&1 || true
-  fi
+  rm -rf $SIGNPATH >> "$LOG_FILE" 2>&1 
+  mkdir -p $SIGNPATH >> "$LOG_FILE" 2>&1 

  # Let's make sure we have the public key
  run_check_net_err \
@@ -1,5 +1,3 @@
-{% import_yaml 'salt/minion.defaults.yaml' as SALT_MINION_DEFAULTS -%}
-
 #!/bin/bash
 #
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
@@ -25,7 +23,8 @@ SYSTEM_START_TIME=$(date -d "$(</proc/uptime awk '{print $1}') seconds ago" +%s)
 LAST_HIGHSTATE_END=$([ -e "/opt/so/log/salt/lasthighstate" ] && date -r /opt/so/log/salt/lasthighstate +%s || echo 0)
 LAST_HEALTHCHECK_STATE_APPLY=$([ -e "/opt/so/log/salt/state-apply-test" ] && date -r /opt/so/log/salt/state-apply-test +%s || echo 0)
 # SETTING THRESHOLD TO ANYTHING UNDER 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default
-THRESHOLD={{SALT_MINION_DEFAULTS.salt.minion.check_threshold}} #within how many seconds the file /opt/so/log/salt/state-apply-test must have been touched/modified before the salt minion is restarted
+# THRESHOLD is derived from the global push highstate interval + 1 hour, so the minion-check grace period tracks the schedule automatically.
+THRESHOLD=$(( ({{ salt['pillar.get']('global:push:highstate_interval_hours', 2) }} + 1) * 3600 )) #within how many seconds the file /opt/so/log/salt/state-apply-test must have been touched/modified before the salt minion is restarted
 THRESHOLD_DATE=$((LAST_HEALTHCHECK_STATE_APPLY+THRESHOLD))

 logCmd() {
@@ -18,18 +18,10 @@ dockergroup:
 dockerheldpackages:
  pkg.installed:
    - pkgs:
-{% if GLOBALS.os_version|int >= 10 %}
-      # OL10 test path: install latest Docker CE from the public repo (no .el9 builds available)
-      - containerd.io
-      - docker-ce
-      - docker-ce-cli
-      - docker-ce-rootless-extras
-{% else %}
      - containerd.io: 2.2.1-1.el9
      - docker-ce: 3:29.2.1-1.el9
      - docker-ce-cli: 1:29.2.1-1.el9
      - docker-ce-rootless-extras: 29.2.1-1.el9
-{% endif %}
    - hold: True
    - update_holds: True

@@ -9,7 +9,8 @@
 prune_images:
  cmd.run:
    - name: so-docker-prune
-    - order: last
+    - onlyif: command -v /usr/sbin/so-docker-prune >/dev/null 2>&1
+    - order: 9000

 {% else %}

@@ -19,6 +19,7 @@ wait_for_elasticsearch:
 so-elastalert:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastalert:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: elastalert
    - name: so-elastalert
    - user: so-elastalert
@@ -15,6 +15,7 @@ include:
 so-elastic-fleet-package-registry:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-fleet-package-registry:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-elastic-fleet-package-registry
    - hostname: Fleet-package-reg-{{ GLOBALS.hostname }}
    - detach: True
@@ -16,6 +16,7 @@ include:
 so-elastic-agent:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-agent:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-elastic-agent
    - hostname: {{ GLOBALS.hostname }}
    - detach: True
@@ -42,6 +42,7 @@ elasticagent_syncartifacts:
 so-elastic-fleet:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-agent:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-elastic-fleet
    - hostname: FleetServer-{{ GLOBALS.hostname }}
    - detach: True
@@ -24,6 +24,7 @@ include:
 so-elasticsearch:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elasticsearch:{{ ELASTICSEARCHMERGED.version }}
+    - restart_policy: unless-stopped
    - hostname: elasticsearch
    - name: so-elasticsearch
    - user: elasticsearch
@@ -1,3 +1,10 @@
 global:
  pcapengine: SURICATA
-  pipeline: REDIS
+  pipeline: REDIS
+  push:
+    enabled: true
+    highstate_interval_hours: 2
+    debounce_seconds: 30
+    drain_interval: 15
+    batch: '25%'
+    batch_wait: 15
@@ -59,4 +59,41 @@ global:
    description: Allows use of Endgame with Security Onion. This feature requires a license from Endgame.
    global: True
    advanced: True
+  push:
+    enabled:
+      description: Master kill-switch for the active push feature. When disabled, rule and pillar changes are picked up at the next scheduled highstate instead of being pushed immediately.
+      forcedType: bool
+      helpLink: push
+      global: True
+    highstate_interval_hours:
+      description: How often every minion in the grid runs a scheduled state.highstate, in hours. Lower values keep minions closer in sync at the cost of more load; higher values reduce load but increase worst-case latency for non-pushed changes. The salt-minion health check restarts a minion if its last highstate is older than this value plus one hour.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True
+    debounce_seconds:
+      description: Trailing-edge debounce window in seconds. A push intent must be quiet for this long before the drainer dispatches. Rapid bursts of edits within this window coalesce into one dispatch.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True
+    drain_interval:
+      description: How often the push drainer checks for ready intents, in seconds. Small values lower dispatch latency at the cost of more background work on the manager.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True
+    batch:
+      description: "Host batch size for push orchestrations. A number (e.g. '10') or a percentage (e.g. '25%'). Limits how many minions run the push state at once so large fleets don't thundering-herd."
+      helpLink: push
+      global: True
+      advanced: True
+      regex: '^([0-9]+%?)$'
+      regexFailureMessage: Enter a whole number or a whole-number percentage (e.g. 10 or 25%).
+    batch_wait:
+      description: Seconds to wait between host batches in a push orchestration. Gives the fleet time to breathe between waves.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True

@@ -58,6 +58,7 @@ so-hydra:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
+    # Intentionally unless-stopped -- matches the fleet default.
    - restart_policy: unless-stopped
    - watch:
      - file: hydraconfig
@@ -15,6 +15,7 @@ include:
 so-idh:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-idh:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-idh
    - detach: True
    - network_mode: host
@@ -18,6 +18,7 @@ include:
 so-influxdb:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-influxdb:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: influxdb
    - networks:
      - sobridge:
@@ -27,6 +27,7 @@ include:
 so-kafka:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-kafka:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-kafka
    - name: so-kafka
    - networks:
@@ -16,6 +16,7 @@ include:
 so-kibana:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-kibana:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: kibana
    - user: kibana
    - networks:
@@ -51,6 +51,7 @@ so-kratos:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
+    # Intentionally unless-stopped -- matches the fleet default.
    - restart_policy: unless-stopped
    - watch:
      - file: kratosschema
@@ -103,7 +103,7 @@ kratos:
  config:
    session:
      lifespan: 
-        description: Defines the length of a login session.
+        description: Defines how long a login session lasts before it times out and a new login is required.
        global: True
        helpLink: kratos
      whoami:
@@ -28,6 +28,7 @@ include:
 so-logstash:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-logstash:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-logstash
    - name: so-logstash
    - networks:
@@ -0,0 +1,21 @@
+{% from 'vars/globals.map.jinja' import GLOBALS %}
+{% from 'global/map.jinja' import GLOBALMERGED %}
+
+{# Manage /etc/salt/minion.d/beacons_pushstate.conf: render it from the jinja
+   template when this host is a manager and global push is enabled, otherwise
+   ensure the file is absent. Both branches share the same state id and use
+   watch_in so the salt_minion_service state (from the included salt.minion)
+   reacts when the file changes. #}
+include:
+  - salt.minion
+
+{% if GLOBALS.is_manager and GLOBALMERGED.push.enabled %}
+salt_beacons_pushstate:
+  file.managed:
+    - name: /etc/salt/minion.d/beacons_pushstate.conf
+    - source: salt://manager/files/beacons_pushstate.conf.jinja
+    - template: jinja
+    - watch_in:
+      - service: salt_minion_service
+{% else %}
+salt_beacons_pushstate:
+  file.absent:
+    - name: /etc/salt/minion.d/beacons_pushstate.conf
+    - watch_in:
+      - service: salt_minion_service
+{% endif %}
@@ -0,0 +1,41 @@
+{% from 'global/map.jinja' import GLOBALMERGED %}
+# Beacon configuration for the active-push feature.
+#   pillar_db: custom beacon, polled every global:push:drain_interval seconds.
+#   inotify:   watches the local suricata rules directory and the compiled
+#              strelka rules directory for written, moved-in or deleted files.
+# Both beacons are disabled while a state run is in progress.
+beacons:
+  pillar_db:
+    - interval: {{ GLOBALMERGED.push.drain_interval }}
+    - disable_during_state_run: True
+  inotify:
+    - disable_during_state_run: True
+    - coalesce: True
+    - files:
+        /opt/so/saltstack/local/salt/suricata/rules:
+          mask:
+            - close_write
+            - moved_to
+            - delete
+          recurse: True
+          auto_add: True
+          # Excludes are presumably editor artifacts: vim swap files, backup
+          # files ending in ~, vim's 4913 write-probe file and emacs .# lock
+          # files -- TODO confirm.
+          exclude:
+            - '\.sw[a-z]$':
+                regex: True
+            - '~$':
+                regex: True
+            - '/4913$':
+                regex: True
+            - '/\.#':
+                regex: True
+        /opt/so/saltstack/local/salt/strelka/rules/compiled:
+          mask:
+            - close_write
+            - moved_to
+            - delete
+          recurse: True
+          auto_add: True
+          # Same exclude list as the suricata rules watch above.
+          exclude:
+            - '\.sw[a-z]$':
+                regex: True
+            - '~$':
+                regex: True
+            - '/4913$':
+                regex: True
+            - '/\.#':
+                regex: True
@@ -15,6 +15,7 @@ include:
  - manager.elasticsearch
  - manager.kibana
  - manager.managed_soc_annotations
+  - manager.beacons

 repo_log_dir:
  file.directory:
@@ -231,6 +232,7 @@ surifiltersrules:
    - user: 939
    - group: 939

+
 {% else %}

 {{sls}}_state_not_allowed:
@@ -0,0 +1,232 @@
+#!/opt/saltstack/salt/bin/python3
+
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+"""
+so-push-drainer
+===============
+
+Scheduled drainer for the active-push feature. Runs on the manager every
+drain_interval seconds (default 15) via a salt schedule in salt/schedule.sls.
+
+For each intent file under /opt/so/state/push_pending/*.json whose last_touch
+is older than debounce_seconds, this script:
+  * concatenates the actions lists from every ready intent
+  * dedupes by (state or __highstate__, tgt, tgt_type)
+  * dispatches a single `salt-run state.orchestrate orch.push_batch --async`
+    with the deduped actions list passed as pillar kwargs
+  * deletes the contributed intent files on successful dispatch
+
+Reactor sls files (push_suricata, push_strelka, push_pillar) write intents
+but never dispatch directly; dispatching is left to this drainer.
+"""
+
+import fcntl
+import glob
+import json
+import logging
+import logging.handlers
+import os
+import subprocess
+import sys
+import time
+
+import salt.client
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+LOG_FILE = '/opt/so/log/salt/so-push-drainer.log'
+
+HIGHSTATE_SENTINEL = '__highstate__'
+
+
+def _make_logger():
+    """Return the 'so-push-drainer' logger, attaching its file handler once.
+
+    The logger level is set to INFO. A RotatingFileHandler on LOG_FILE
+    (5 MiB per file, 3 backups) is added only when the logger has no handlers
+    yet, so repeated calls do not duplicate output. The log directory is
+    created if it does not exist.
+    """
+    logger = logging.getLogger('so-push-drainer')
+    logger.setLevel(logging.INFO)
+    if not logger.handlers:
+        os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
+        handler = logging.handlers.RotatingFileHandler(
+            LOG_FILE, maxBytes=5 * 1024 * 1024, backupCount=3,
+        )
+        handler.setFormatter(logging.Formatter(
+            '%(asctime)s | %(levelname)s | %(message)s',
+        ))
+        logger.addHandler(handler)
+    return logger
+
+
+def _load_push_cfg():
+    """Read the global:push pillar subtree and return it as a dict.
+
+    Uses salt.client.Caller to run pillar.get with an empty-dict default.
+    Any non-dict result is replaced by an empty dict. Exceptions raised by the
+    Caller are not caught here.
+    """
+    caller = salt.client.Caller()
+    cfg = caller.cmd('pillar.get', 'global:push', {})
+    return cfg if isinstance(cfg, dict) else {}
+
+
+def _read_intent(path, log):
+    """Load the JSON intent file at *path*.
+
+    Returns whatever json.load produces (not necessarily a dict; the caller
+    checks the type), or None when the file cannot be read or parsed. Read and
+    parse errors are logged as warnings; any other exception is logged with a
+    traceback. Nothing is raised to the caller.
+    """
+    try:
+        with open(path, 'r') as f:
+            return json.load(f)
+    except (IOError, ValueError) as exc:
+        log.warning('cannot read intent %s: %s', path, exc)
+        return None
+    except Exception:
+        log.exception('unexpected error reading %s', path)
+        return None
+
+
+def _dedupe_actions(actions):
+    """Return *actions* with duplicates removed, preserving first-seen order.
+
+    Two actions are duplicates when they share (state, tgt, tgt_type), where
+    the state part is HIGHSTATE_SENTINEL for actions with a truthy 'highstate'
+    and tgt_type defaults to 'compound' for comparison only (the action dict
+    itself is not modified). Entries that are not dicts, or that lack a state
+    or tgt, are dropped.
+
+    NOTE(review): an unhashable tgt (e.g. a JSON list) would raise TypeError
+    at the membership test below -- confirm producers always write tgt as a
+    string.
+    """
+    seen = set()
+    deduped = []
+    for action in actions:
+        if not isinstance(action, dict):
+            continue
+        state_key = HIGHSTATE_SENTINEL if action.get('highstate') else action.get('state')
+        tgt = action.get('tgt')
+        tgt_type = action.get('tgt_type', 'compound')
+        if not state_key or not tgt:
+            continue
+        key = (state_key, tgt, tgt_type)
+        if key in seen:
+            continue
+        seen.add(key)
+        deduped.append(action)
+    return deduped
+
+
+def _dispatch(actions, log):
+    """Start the orch.push_batch orchestration for *actions* via salt-run.
+
+    The actions list is JSON-encoded and passed as the `pillar=` argument of
+    `salt-run state.orchestrate orch.push_batch --async`. Returns True when
+    salt-run exits 0 within 60 seconds, False on a non-zero exit, a timeout,
+    or any other exception (each case is logged). Nothing is raised.
+    """
+    pillar_arg = json.dumps({'actions': actions})
+    cmd = [
+        'salt-run',
+        'state.orchestrate',
+        'orch.push_batch',
+        'pillar={}'.format(pillar_arg),
+        '--async',
+    ]
+    # Log only the command prefix and the action count, not the pillar payload.
+    log.info('dispatching: %s', ' '.join(cmd[:3]) + ' pillar=<{} actions>'.format(len(actions)))
+    try:
+        result = subprocess.run(
+            cmd, check=True, capture_output=True, text=True, timeout=60,
+        )
+    except subprocess.CalledProcessError as exc:
+        log.error('dispatch failed (rc=%s): stdout=%s stderr=%s',
+                  exc.returncode, exc.stdout, exc.stderr)
+        return False
+    except subprocess.TimeoutExpired:
+        log.error('dispatch timed out after 60s')
+        return False
+    except Exception:
+        log.exception('dispatch raised')
+        return False
+    log.info('dispatch accepted: %s', (result.stdout or '').strip())
+    return True
+
+
+def main():
+    log = _make_logger()
+
+    if not os.path.isdir(PENDING_DIR):
+        # Nothing to do; reactors create the dir on first use.
+        return 0
+
+    try:
+        push = _load_push_cfg()
+    except Exception:
+        log.exception('failed to read global:push pillar; aborting drain pass')
+        return 1
+
+    if not push.get('enabled', True):
+        log.debug('push disabled; exiting')
+        return 0
+
+    debounce_seconds = int(push.get('debounce_seconds', 30))
+
+    os.makedirs(PENDING_DIR, exist_ok=True)
+    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent_files = [
+            p for p in sorted(glob.glob(os.path.join(PENDING_DIR, '*.json')))
+            if os.path.basename(p) != '.lock'
+        ]
+        if not intent_files:
+            return 0
+
+        now = time.time()
+        ready = []
+        skipped = 0
+        broken = []
+        for path in intent_files:
+            intent = _read_intent(path, log)
+            if not isinstance(intent, dict):
+                broken.append(path)
+                continue
+            last_touch = intent.get('last_touch', 0)
+            if now - last_touch < debounce_seconds:
+                skipped += 1
+                continue
+            ready.append((path, intent))
+
+        for path in broken:
+            try:
+                os.unlink(path)
+            except OSError:
+                pass
+
+        if not ready:
+            if skipped:
+                log.debug('no ready intents (%d still in debounce window)', skipped)
+            return 0
+
+        combined_actions = []
+        oldest_first_touch = now
+        all_paths = []
+        for path, intent in ready:
+            combined_actions.extend(intent.get('actions', []) or [])
+            first = intent.get('first_touch', now)
+            if first < oldest_first_touch:
+                oldest_first_touch = first
+            all_paths.extend(intent.get('paths', []) or [])
+
+        deduped = _dedupe_actions(combined_actions)
+        if not deduped:
+            log.warning('%d intent(s) had no usable actions; clearing', len(ready))
+            for path, _ in ready:
+                try:
+                    os.unlink(path)
+                except OSError:
+                    pass
+            return 0
+
+        debounce_duration = now - oldest_first_touch
+        log.info(
+            'draining %d intent(s): %d action(s) after dedupe (raw=%d), '
+            'debounce_duration=%.1fs, paths=%s',
+            len(ready), len(deduped), len(combined_actions),
+            debounce_duration, all_paths[:20],
+        )
+
+        if not _dispatch(deduped, log):
+            log.warning('dispatch failed; leaving intent files in place for retry')
+            return 1
+
+        for path, _ in ready:
+            try:
+                os.unlink(path)
+            except OSError:
+                log.exception('failed to remove drained intent %s', path)
+
+        return 0
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
@@ -350,10 +350,11 @@ highstate() {
 masterlock() {
  echo "Locking Salt Master"
  mv -v $TOPFILE $BACKUPTOPFILE
-  echo "base:" > $TOPFILE
-  echo "  $MINIONID:" >> $TOPFILE
-  echo "    - ca" >> $TOPFILE
-  echo "    - elasticsearch" >> $TOPFILE
+  # Render the real top file only for the host running soup; every other
+  # minion gets an empty top (no states) while the master is upgrading.
+  echo "{% if grains['id'] == '$MINIONID' %}" > $TOPFILE
+  cat $BACKUPTOPFILE >> $TOPFILE
+  echo "{% endif %}" >> $TOPFILE
 }

 masterunlock() {
@@ -370,8 +371,9 @@ preupgrade_changes() {
    # This function is to add any new pillar items if needed.
    echo "Checking to see if changes are needed."

-    [[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0   
+    [[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0
    [[ "$INSTALLEDVERSION" == "3.0.0" ]] && up_to_3.1.0
+    [[ "$INSTALLEDVERSION" == "3.1.0" ]] && up_to_3.2.0
    true
 }

@@ -381,6 +383,7 @@ postupgrade_changes() {

    [[ "$POSTVERSION" =~ ^2\.4\.21[0-9]+$ ]] && post_to_3.0.0
    [[ "$POSTVERSION" == "3.0.0" ]] && post_to_3.1.0
+    [[ "$POSTVERSION" == "3.1.0" ]] && post_to_3.2.0
    true
 }

@@ -742,6 +745,42 @@ post_to_3.1.0() {

 ### 3.1.0 End ###

+### 3.2.0 Scripts ###
+
+bootstrap_so_soc_database() {
+  # Ensure the so_soc database exists on upgraded hosts by re-running init-db.sh
+  # inside the so-postgres container. Always returns 0: a failure is queued in
+  # FINAL_MESSAGE_QUEUE as a warning instead of being returned to the caller.
+  #
+  # init-db.sh is mounted into so-postgres at /docker-entrypoint-initdb.d/init-db.sh
+  # and runs automatically only on a fresh data directory. Hosts upgrading from
+  # 3.1.0 already have /nsm/postgres populated, so the so_soc bootstrap block
+  # added in 3.2 never fires. Re-run the script explicitly; it's idempotent.
+  echo "Bootstrapping so_soc database via init-db.sh."
+  # The postgres image has no USER directive, so `docker exec` defaults to
+  # root, and the container env intentionally omits POSTGRES_USER (the upstream
+  # entrypoint defaults it transiently during first-init only). Recreate both
+  # so psql inside init-db.sh resolves the connect user correctly.
+  local exec_cmd="docker exec -u postgres -e POSTGRES_USER=postgres so-postgres bash /docker-entrypoint-initdb.d/init-db.sh"
+  # Do not attempt the bootstrap until so-postgres-wait reports success.
+  if ! /usr/sbin/so-postgres-wait; then
+    FINAL_MESSAGE_QUEUE+=("WARNING: so-postgres was not ready during the 3.2.0 upgrade; the so_soc database may not have been bootstrapped. Re-run manually: $exec_cmd")
+    return 0
+  fi
+  # Deliberately unquoted: exec_cmd is a single string that must word-split
+  # into the command and its arguments.
+  if ! $exec_cmd; then
+    FINAL_MESSAGE_QUEUE+=("WARNING: init-db.sh failed inside so-postgres during the 3.2.0 upgrade; the so_soc database may not have been bootstrapped. Re-run manually: $exec_cmd")
+    return 0
+  fi
+  echo "so_soc bootstrap complete."
+}
+
+up_to_3.2.0() {
+  # No pre-upgrade changes are made for 3.2.0; only advance INSTALLEDVERSION.
+  INSTALLEDVERSION=3.2.0
+}
+
+post_to_3.2.0() {
+  # Post-upgrade step for 3.2.0: create the so_soc database on hosts whose
+  # postgres data directory already existed, then advance POSTVERSION.
+  bootstrap_so_soc_database
+
+  POSTVERSION=3.2.0
+}
+
+### 3.2.0 End ###
+

 repo_sync() {
  echo "Sync the local repo."
@@ -1725,6 +1764,9 @@ main() {

    enable_highstate

+    echo "salt-call state.show_top"
+    salt-call state.show_top
+
    echo ""
    echo "Running a highstate. This could take several minutes."
    set +e
@@ -1732,6 +1774,9 @@ main() {
    highstate
    set -e

+    echo "salt-call saltutil.running"
+    salt-call saltutil.running
+
    stop_salt_master

    masterunlock
@@ -1754,6 +1799,9 @@ main() {
    # ensure the mine is updated and populated before highstates run, following the salt-master restart
    update_salt_mine

+    echo "salt-call state.show_top"
+    salt-call state.show_top
+
    highstate
    check_saltmaster_status
    postupgrade_changes
@@ -34,6 +34,7 @@ make-rule-dir-nginx:
 so-nginx:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-nginx:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-nginx
    - networks:
      - sobridge:
@@ -0,0 +1,37 @@
+{% from 'global/map.jinja' import GLOBALMERGED %}
+{% set actions = salt['pillar.get']('actions', []) %}
+{% set BATCH = GLOBALMERGED.push.batch %}
+{% set BATCH_WAIT = GLOBALMERGED.push.batch_wait %}
+
+{# Renders one orchestration step per entry in the `actions` pillar list.
+   An action with a truthy `highstate` becomes a single batched salt.state
+   highstate. Any other action becomes a saltutil.refresh_pillar on the target
+   followed by a batched salt.state for action.state, which requires the
+   refresh. Per-action batch / batch_wait override the global push values, and
+   tgt_type defaults to compound. State ids are suffixed with loop.index. #}
+{% for action in actions %}
+{%   if action.get('highstate') %}
+apply_highstate_{{ loop.index }}:
+  salt.state:
+    - tgt: '{{ action.tgt }}'
+    - tgt_type: {{ action.get('tgt_type', 'compound') }}
+    - highstate: True
+    - batch: {{ action.get('batch', BATCH) }}
+    - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }}
+    - kwarg:
+        queue: 2
+{%   else %}
+refresh_pillar_{{ loop.index }}:
+  salt.function:
+    - name: saltutil.refresh_pillar
+    - tgt: '{{ action.tgt }}'
+    - tgt_type: {{ action.get('tgt_type', 'compound') }}
+
+apply_{{ action.state | replace('.', '_') }}_{{ loop.index }}:
+  salt.state:
+    - tgt: '{{ action.tgt }}'
+    - tgt_type: {{ action.get('tgt_type', 'compound') }}
+    - sls:
+      - {{ action.state }}
+    - batch: {{ action.get('batch', BATCH) }}
+    - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }}
+    - kwarg:
+        queue: 2
+    - require:
+      - salt: refresh_pillar_{{ loop.index }}
+{%   endif %}
+{% endfor %}
@@ -32,3 +32,8 @@ EOSQL
 if ! psql -U "$POSTGRES_USER" -tAc "SELECT 1 FROM pg_database WHERE datname='so_telegraf'" | grep -q 1; then
    psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -c "CREATE DATABASE so_telegraf"
 fi
+
+# Bootstrap the SOC database.
+if ! psql -U "$POSTGRES_USER" -tAc "SELECT 1 FROM pg_database WHERE datname='so_soc'" | grep -q 1; then
+    psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -c "CREATE DATABASE so_soc"
+fi
@@ -18,26 +18,12 @@ include:
 {% set TG_OUT = TELEGRAFMERGED.output | upper %}
 {% if TG_OUT in ['POSTGRES', 'BOTH'] %}

-# docker_container.running returns as soon as the container starts, but on
-# first-init docker-entrypoint.sh starts a temporary postgres with
-# `listen_addresses=''` to run /docker-entrypoint-initdb.d scripts, then
-# shuts it down before exec'ing the real CMD. A default pg_isready check
-# (Unix socket) passes during that ephemeral phase and races the shutdown
-# with "the database system is shutting down". Checking TCP readiness on
-# 127.0.0.1 only succeeds after the final postgres binds the port.
 postgres_wait_ready:
  cmd.run:
-    - name: |
-        for i in $(seq 1 60); do
-          if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then
-            exit 0
-          fi
-          sleep 2
-        done
-        echo "so-postgres did not accept TCP connections within 120s" >&2
-        exit 1
+    - name: /usr/sbin/so-postgres-wait
    - require:
      - docker_container: so-postgres
+      - file: postgres_sbin

 # Ensure the shared Telegraf database exists. init-db.sh only runs on a
 # fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Wait for the so-postgres container to accept TCP connections.
+#
+# docker_container.running returns as soon as the container starts, but on
+# first-init docker-entrypoint.sh starts a temporary postgres with
+# `listen_addresses=''` to run /docker-entrypoint-initdb.d scripts, then
+# shuts it down before exec'ing the real CMD. A default pg_isready check
+# (Unix socket) passes during that ephemeral phase and races the shutdown
+# with "the database system is shutting down". Checking TCP readiness on
+# 127.0.0.1 only succeeds after the final postgres binds the port.
+#
+# Usage: so-postgres-wait [iterations] [sleep_seconds]
+# Default: 60 iterations, 2s sleep (~120s total).
+
+ITERATIONS=${1:-60}
+SLEEP_SECONDS=${2:-2}
+
+for i in $(seq 1 "$ITERATIONS"); do
+  if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then
+    exit 0
+  fi
+  sleep "$SLEEP_SECONDS"
+done
+
+echo "so-postgres did not accept TCP connections within $((ITERATIONS * SLEEP_SECONDS))s" >&2
+exit 1
@@ -0,0 +1,240 @@
+# One pillar directory can map to multiple (state, tgt) actions.
+# tgt is a raw salt compound expression. tgt_type is always "compound".
+# Per-action `batch` / `batch_wait` override the orch defaults (25% / 15s).
+# An action with `highstate: True` triggers state.highstate instead of
+# state.apply -- see salt/orch/push_batch.sls.
+#
+# Notes:
+#   - `bpf` is a pillar-only dir (no state of its own) consumed by both
+#     zeek and suricata via macros, so a bpf pillar change re-applies both.
+#   - suricata/strelka/zeek/elasticsearch/redis/kafka/logstash etc. have
+#     their own pillar dirs AND their own state, so they map 1:1 (or 1:2
+#     in strelka's case, because of the split init.sls / manager.sls).
+#
+# Intentional omissions (these will log a "not in pillar_push_map.yaml"
+# warning in push_pillar.sls and wait for the next scheduled highstate):
+#   - `data` and `node_data`: pillar-only data consumed by many states;
+#     handling them generically would amount to a fleetwide highstate.
+#   - `host`: soc_host describes mainint/mainip; a change is a re-IP and
+#     needs a coordinated procedure, not an immediate state push.
+#   - `hypervisor`: state changes touch libvirt and are disruptive; leave
+#     to the next scheduled highstate.
+#   - `sensor`: every field in soc_sensor.yaml is `readonly: True` or
+#     per-minion (`node: True`). Per-minion edits are persisted under
+#     pillar/minions/<id>.sls and are handled by Branch A of push_pillar.sls
+#     (per-minion highstate intent), not by this app-pillar map.
+#
+# The role sets here were verified line-by-line against salt/top.sls. If
+# salt/top.sls changes how an app is targeted, update the corresponding
+# compound here.
+
+# firewall: the one pillar everyone touches. Applied everywhere intentionally
+# because every host's iptables needs to know about every other host in the
+# grid. Salt's firewall state is idempotent (file.managed + iptables-restore
+# onchanges in salt/firewall/init.sls), so hosts whose rendered firewall is
+# unchanged do a file comparison and no-op without touching iptables -- actual
+# reload happens only on the hosts whose rules actually changed. Fleetwide
+# blast radius is intentional and matches the pre-plan behavior via highstate.
+# Adding N sensors in a burst coalesces into one dispatch via the drainer.
+firewall:
+  - state: firewall
+    tgt: '*'
+
+# backup: backup.config_backup runs on eval, standalone, manager, managerhype,
+# managersearch (NOT import -- the backup pillar is included on import per
+# pillar/top.sls but the backup state is not run there per salt/top.sls).
+backup:
+  - state: backup.config_backup
+    tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# bpf is pillar-only (no state); consumed by both zeek and suricata as macros.
+# Both states run on sensor_roles + so-import per salt/top.sls.
+bpf:
+  - state: zeek
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+  - state: suricata
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+
+# ca is applied universally.
+ca:
+  - state: ca
+    tgt: '*'
+
+# docker: universal. The docker state is in both the all-non-managers and
+# all-managers branches of salt/top.sls.
+docker:
+  - state: docker
+    tgt: '*'
+
+# elastalert: eval, standalone, manager, managerhype, managersearch (NOT import).
+elastalert:
+  - state: elastalert
+    tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# elastic-fleet-package-registry: manager_roles exactly.
+elastic-fleet-package-registry:
+  - state: elastic-fleet-package-registry
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# elasticsearch: 8 roles.
+elasticsearch:
+  - state: elasticsearch
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-standalone'
+
+# elasticagent: so-heavynode only.
+elasticagent:
+  - state: elasticagent
+    tgt: 'G@role:so-heavynode'
+
+# elasticfleet: base state only on pillar change. elasticfleet.install_agent_grid
+# is a deploy/enrollment step, not a config reload; leave it to the next highstate.
+elasticfleet:
+  - state: elasticfleet
+    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# global: fanout to a fleetwide highstate. The global pillar (soc_global.sls)
+# carries cross-cutting settings (pipeline, url_base, imagerepo, mdengine, ...)
+# that are consumed by virtually every state, so a targeted re-apply isn't
+# meaningful. The drainer's batch/batch_wait throttling controls blast radius.
+global:
+  - highstate: True
+    tgt: '*'
+
+# healthcheck: eval, sensor, standalone only.
+healthcheck:
+  - state: healthcheck
+    tgt: 'G@role:so-eval or G@role:so-sensor or G@role:so-standalone'
+
+# hydra: manager_roles exactly.
+hydra:
+  - state: hydra
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# idh: so-idh only.
+idh:
+  - state: idh
+    tgt: 'G@role:so-idh'
+
+# influxdb: manager_roles exactly.
+influxdb:
+  - state: influxdb
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# kafka: standalone, manager, managerhype, managersearch, searchnode, receiver.
+kafka:
+  - state: kafka
+    tgt: 'G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
+
+# kibana: manager_roles exactly.
+kibana:
+  - state: kibana
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# kratos: manager_roles exactly.
+kratos:
+  - state: kratos
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# logrotate: universal (top-of-file '*' branch in salt/top.sls).
+logrotate:
+  - state: logrotate
+    tgt: '*'
+
+# logstash: 8 roles, no eval/import.
+logstash:
+  - state: logstash
+    tgt: 'G@role:so-fleet or G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
+
+# manager: manager_roles exactly. The manager state is also referenced under
+# *_sensor / *_heavynode top.sls blocks via `sensor`, but the standalone
+# `manager` state itself runs only on manager_roles.
+manager:
+  - state: manager
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# nginx: 10 specific roles. NOT receiver, idh, hypervisor, desktop.
+nginx:
+  - state: nginx
+    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
+
+# ntp: universal (top-of-file '*' branch in salt/top.sls).
+ntp:
+  - state: ntp
+    tgt: '*'
+
+# patch: universal. soc_patch carries the OS update schedule, applied via
+# patch.os.schedule on every node (it's in both the all-non-managers and
+# all-managers branches of salt/top.sls).
+patch:
+  - state: patch.os.schedule
+    tgt: '*'
+
+# postgres: manager_roles exactly.
+postgres:
+  - state: postgres
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# redis: 6 roles. standalone, manager, managerhype, managersearch, heavynode, receiver.
+# (NOT eval, NOT import, NOT searchnode.)
+redis:
+  - state: redis
+    tgt: 'G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-standalone'
+
+# registry: manager_roles exactly.
+registry:
+  - state: registry
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# sensoroni: universal.
+sensoroni:
+  - state: sensoroni
+    tgt: '*'
+
+# soc: manager_roles exactly.
+soc:
+  - state: soc
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# stig: broad. Runs on standalone, manager, managerhype, managersearch,
+# searchnode, sensor, receiver, fleet, hypervisor, desktop.
+# NOT eval, NOT import, NOT heavynode, NOT idh (the *_idh block in
+# salt/top.sls intentionally omits stig).
+stig:
+  - state: stig
+    tgt: 'G@role:so-desktop or G@role:so-fleet or G@role:so-hypervisor or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
+
+# strelka: sensor-side only on pillar change (sensor_roles). strelka.manager is
+# intentionally NOT fired on pillar changes -- YARA rule and strelka config
+# pillar changes are consumed by the sensor-side strelka backend, and re-running
+# strelka.manager on managers is both unnecessary and disruptive. strelka.manager
+# is left to the 2-hour highstate.
+strelka:
+  - state: strelka
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-sensor or G@role:so-standalone'
+
+# suricata: sensor_roles + so-import (5 roles).
+suricata:
+  - state: suricata
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+
+# telegraf: universal.
+telegraf:
+  - state: telegraf
+    tgt: '*'
+
+# versionlock: universal (top-of-file '*' branch in salt/top.sls).
+versionlock:
+  - state: versionlock
+    tgt: '*'
+
+# vm: libvirt-driver hypervisors only. Matched by the salt-cloud:driver:libvirt
+# grain (compound supports nested grain matching via G@<key>:<subkey>:<value>).
+# pillar/vm/soc_vm.sls write path is referenced at salt/_runners/setup_hypervisor.py:856.
+vm:
+  - state: vm
+    tgt: 'G@salt-cloud:driver:libvirt'
+
+# zeek: sensor_roles + so-import (5 roles).
+zeek:
+  - state: zeek
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
@@ -0,0 +1,176 @@
+#!py
+
+# Reactor invoked by the pillar_db beacon when SOC records settings changes in
+# the so_soc.audit_settings table (see salt/_beacons/pillar_db.py). The beacon
+# emits one event per new row carrying setting_id and node_id.
+#
+# Two branches, keyed on node_id:
+#   A) node_id populated -> the change is scoped to that one minion. Look up the
+#      app in pillar_push_map.yaml and write an intent that runs the app's mapped
+#      state(s) targeted to just that node.
+#   B) node_id empty -> grid-wide app change. Look up the app in
+#      pillar_push_map.yaml and write an intent with the entry's actions as-is.
+#
+# The app name is the first dotted segment of setting_id (e.g. "telegraf.output"
+# -> "telegraf"), which matches the pillar_push_map.yaml keys 1:1.
+#
+# Reactors never dispatch directly. The so-push-drainer schedule picks up
+# ready intents, dedupes across pending files, and dispatches orch.push_batch.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+from salt.client import Caller
+import yaml
+
+LOG = logging.getLogger(__name__)
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+MAX_PATHS = 20
+
+# The pillar_push_map.yaml is shipped via salt:// but the reactor runs on the
+# master, which mounts the default saltstack tree at this path.
+PUSH_MAP_PATH = '/opt/so/saltstack/default/salt/reactor/pillar_push_map.yaml'
+
+_PUSH_MAP_CACHE = {'mtime': 0, 'data': None}
+
+
+def _load_push_map():
+    """Return the parsed pillar_push_map.yaml as a dict, or {} when unusable.
+
+    The parsed map is kept in _PUSH_MAP_CACHE together with the file's mtime
+    and is only re-read when the mtime differs from the cached one. A missing
+    file, a file that fails to load, and a file whose top level is not a
+    mapping all yield {} so callers can always call dict methods on the result.
+    """
+    try:
+        st = os.stat(PUSH_MAP_PATH)
+    except OSError:
+        LOG.warning('push_pillar: %s not found', PUSH_MAP_PATH)
+        return {}
+    if _PUSH_MAP_CACHE['mtime'] != st.st_mtime:
+        try:
+            with open(PUSH_MAP_PATH, 'r') as f:
+                loaded = yaml.safe_load(f) or {}
+        except Exception:
+            LOG.exception('push_pillar: failed to load %s', PUSH_MAP_PATH)
+            loaded = {}
+        if not isinstance(loaded, dict):
+            # run() calls .get() on the result; a list or scalar top level
+            # would raise there, so treat it like a failed load.
+            LOG.error(
+                'push_pillar: %s top level is %s, expected a mapping',
+                PUSH_MAP_PATH, type(loaded).__name__,
+            )
+            loaded = {}
+        _PUSH_MAP_CACHE['data'] = loaded
+        # The mtime is recorded even after a failed load, so a bad file is not
+        # re-parsed until it changes again.
+        _PUSH_MAP_CACHE['mtime'] = st.st_mtime
+    return _PUSH_MAP_CACHE['data'] or {}
+
+
+def _push_enabled():
+    """Return the truthiness of the global:push:enabled pillar (default True).
+
+    Any failure while querying the pillar is logged and treated as enabled.
+    """
+    try:
+        return bool(Caller().cmd('pillar.get', 'global:push:enabled', True))
+    except Exception:
+        LOG.exception('push_pillar: pillar.get global:push:enabled failed, assuming enabled')
+    return True
+
+
+def _write_intent(key, actions, path):
+    """Create or update the pending push intent file for ``key``.
+
+    The intent lives at PENDING_DIR/<key>.json. An existing intent keeps its
+    ``first_touch`` time; ``last_touch`` and ``actions`` are overwritten and
+    ``path`` is appended to ``paths`` (skipped if already present, trimmed to
+    the newest MAX_PATHS entries). The read-modify-write runs under an
+    exclusive flock on LOCK_FILE and the result is written to a temp file and
+    renamed into place.
+
+    Every failure is logged and swallowed; the function returns None.
+    """
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_pillar: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    try:
+        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    except OSError:
+        # Keep the log-and-return contract instead of raising out of the reactor.
+        LOG.exception('push_pillar: cannot open lock file %s', LOCK_FILE)
+        return
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+        if not isinstance(intent, dict):
+            # Valid JSON that is not an object (e.g. "[]" or "null") would make
+            # every later update for this key fail; start from a fresh intent.
+            intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths')
+        if not isinstance(paths, list):
+            paths = []
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_pillar: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def _app_from_setting(setting_id):
+    """Return the first dotted segment of ``setting_id``, or None if it is empty.
+
+    e.g. 'telegraf.output' -> 'telegraf', 'ntp.config.servers' -> 'ntp'.
+    """
+    if setting_id:
+        app, _, _ = setting_id.partition('.')
+        if app:
+            return app
+    return None
+
+
+def _node_actions(entry, node_id):
+    """Return copies of the dict actions in ``entry`` retargeted at ``node_id``.
+
+    Each copy keeps its other keys (state/highstate, batch, batch_wait, ...)
+    and gets ``tgt`` set to ``node_id`` and ``tgt_type`` set to 'glob'.
+    Non-dict items are dropped; the input actions are not modified.
+    """
+    return [
+        dict(action, tgt=node_id, tgt_type='glob')
+        for action in entry
+        if isinstance(action, dict)
+    ]
+
+
+def run():
+    """Reactor entry point: turn one audit_settings event into a push intent.
+
+    Reads ``setting_id`` and ``node_id`` from the event, maps the first dotted
+    segment of ``setting_id`` to an entry in the push map, and writes a pending
+    intent via _write_intent. Events are skipped (with a log line) when push is
+    disabled, when no app segment can be derived, or when the app has no entry
+    in the push map. Always returns {}.
+    """
+    if not _push_enabled():
+        LOG.info('push_pillar: push disabled, skipping')
+        return {}
+
+    # The pillar_db beacon nests its payload under data['data']; fall back to the
+    # top level so the reactor is robust to either shape.
+    event = data.get('data', data)  # noqa: F821 -- data provided by reactor
+    setting_id = event.get('setting_id', '')
+    # A missing or None node_id collapses to '' and selects Branch B below.
+    node_id = (event.get('node_id') or '').strip()
+
+    app = _app_from_setting(setting_id)
+    if not app:
+        LOG.debug('push_pillar: ignoring event with no app segment: setting_id=%s', setting_id)
+        return {}
+
+    push_map = _load_push_map()
+    entry = push_map.get(app)
+    if not entry:
+        LOG.warning(
+            'push_pillar: app "%s" is not in pillar_push_map.yaml; change will be '
+            'picked up at the next scheduled highstate (setting_id=%s)',
+            app, setting_id,
+        )
+        return {}
+
+    # Branch A: per-node change -> retarget the app's states to just that node.
+    # NOTE(review): node_id is used verbatim in the intent file name and as a
+    # glob target -- presumably always a plain minion id; confirm it is
+    # validated upstream.
+    if node_id:
+        actions = _node_actions(entry, node_id)
+        if not actions:
+            LOG.warning('push_pillar: no usable actions for app "%s" (setting_id=%s)', app, setting_id)
+            return {}
+        _write_intent(
+            'node_{}_{}'.format(node_id, app), actions,
+            'audit:{}@{}'.format(setting_id, node_id),
+        )
+        LOG.info('push_pillar: per-node intent updated for %s on %s (setting_id=%s)',
+                 app, node_id, setting_id)
+        return {}
+
+    # Branch B: grid-wide app change -> use the map entry's actions as-is.
+    actions = list(entry)  # copy to avoid mutating the cache
+    _write_intent('pillar_{}'.format(app), actions, 'audit:{}'.format(setting_id))
+    LOG.info('push_pillar: app intent updated for %s (setting_id=%s)', app, setting_id)
+    return {}
@@ -0,0 +1,96 @@
+#!py
+
+# Reactor invoked by the inotify beacon on rule file changes under
+# /opt/so/saltstack/local/salt/strelka/rules/compiled/.
+#
+# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_strelka.json
+# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
+# across pending files, and dispatches orch.push_batch. Reactors never dispatch
+# directly -- the drainer owns dispatch (see salt/orch/push_batch.sls).
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+from salt.client import Caller
+
+LOG = logging.getLogger(__name__)
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+MAX_PATHS = 20
+
+# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Sensor-side
+# strelka runs on exactly these four roles; so-import gets strelka.manager
+# instead, which is not fired on pillar changes.
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
+
+
+def _sensor_compound():
+    """Build the compound target that matches any role in SENSOR_ROLES."""
+    role_matchers = ['G@role:{}'.format(role) for role in SENSOR_ROLES]
+    return ' or '.join(role_matchers)
+
+
+def _push_enabled():
+    """Return the truthiness of the global:push:enabled pillar (default True).
+
+    Any failure while querying the pillar is logged and treated as enabled.
+    """
+    try:
+        return bool(Caller().cmd('pillar.get', 'global:push:enabled', True))
+    except Exception:
+        LOG.exception('push_strelka: pillar.get global:push:enabled failed, assuming enabled')
+    return True
+
+
+def _write_intent(key, actions, path):
+    """Create or update the pending push intent file for ``key``.
+
+    The intent lives at PENDING_DIR/<key>.json. An existing intent keeps its
+    ``first_touch`` time; ``last_touch`` and ``actions`` are overwritten and
+    ``path`` is appended to ``paths`` (skipped if already present, trimmed to
+    the newest MAX_PATHS entries). The read-modify-write runs under an
+    exclusive flock on LOCK_FILE and the result is written to a temp file and
+    renamed into place.
+
+    Every failure is logged and swallowed; the function returns None.
+    """
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_strelka: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    try:
+        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    except OSError:
+        # Keep the log-and-return contract instead of raising out of the reactor.
+        LOG.exception('push_strelka: cannot open lock file %s', LOCK_FILE)
+        return
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+        if not isinstance(intent, dict):
+            # Valid JSON that is not an object (e.g. "[]" or "null") would make
+            # every later update for this key fail; start from a fresh intent.
+            intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths')
+        if not isinstance(paths, list):
+            paths = []
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_strelka: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def run():
+    """Reactor entry point: record a pending strelka push intent.
+
+    Does nothing when push is disabled. Otherwise writes or updates the
+    'rules_strelka' intent with a single strelka action targeting the
+    SENSOR_ROLES compound, recording the event's 'path' value. Always
+    returns {}.
+    """
+    if not _push_enabled():
+        LOG.info('push_strelka: push disabled, skipping')
+        return {}
+
+    path = data.get('path', '')  # noqa: F821 -- data provided by reactor
+    actions = [{'state': 'strelka', 'tgt': _sensor_compound()}]
+    _write_intent('rules_strelka', actions, path)
+    LOG.info('push_strelka: intent updated for path=%s', path)
+    return {}
@@ -0,0 +1,95 @@
+#!py
+
+# Reactor invoked by the inotify beacon on rule file changes under
+# /opt/so/saltstack/local/salt/suricata/rules/.
+#
+# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_suricata.json
+# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
+# across pending files, and dispatches orch.push_batch. Reactors never dispatch
+# directly -- the drainer owns dispatch (see salt/orch/push_batch.sls).
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+from salt.client import Caller
+
+LOG = logging.getLogger(__name__)
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+MAX_PATHS = 20
+
+# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Suricata also
+# runs on so-import per salt/top.sls, so that role is appended below.
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
+
+
+def _sensor_compound_plus_import():
+    """Build the compound target matching SENSOR_ROLES plus so-import."""
+    sensor_part = ' or '.join('G@role:{}'.format(role) for role in SENSOR_ROLES)
+    return '{} or G@role:so-import'.format(sensor_part)
+
+
+def _push_enabled():
+    """Return the truthiness of the global:push:enabled pillar (default True).
+
+    Any failure while querying the pillar is logged and treated as enabled.
+    """
+    try:
+        return bool(Caller().cmd('pillar.get', 'global:push:enabled', True))
+    except Exception:
+        LOG.exception('push_suricata: pillar.get global:push:enabled failed, assuming enabled')
+    return True
+
+
+def _write_intent(key, actions, path):
+    """Create or update the pending push intent file for ``key``.
+
+    The intent lives at PENDING_DIR/<key>.json. An existing intent keeps its
+    ``first_touch`` time; ``last_touch`` and ``actions`` are overwritten and
+    ``path`` is appended to ``paths`` (skipped if already present, trimmed to
+    the newest MAX_PATHS entries). The read-modify-write runs under an
+    exclusive flock on LOCK_FILE and the result is written to a temp file and
+    renamed into place.
+
+    Every failure is logged and swallowed; the function returns None.
+    """
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_suricata: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    try:
+        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    except OSError:
+        # Keep the log-and-return contract instead of raising out of the reactor.
+        LOG.exception('push_suricata: cannot open lock file %s', LOCK_FILE)
+        return
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+        if not isinstance(intent, dict):
+            # Valid JSON that is not an object (e.g. "[]" or "null") would make
+            # every later update for this key fail; start from a fresh intent.
+            intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths')
+        if not isinstance(paths, list):
+            paths = []
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_suricata: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def run():
+    """Reactor entry point: record a pending suricata push intent.
+
+    Does nothing when push is disabled. Otherwise writes or updates the
+    'rules_suricata' intent with a single suricata action targeting the
+    SENSOR_ROLES-plus-so-import compound, recording the event's 'path' value.
+    Always returns {}.
+    """
+    if not _push_enabled():
+        LOG.info('push_suricata: push disabled, skipping')
+        return {}
+
+    path = data.get('path', '')  # noqa: F821 -- data provided by reactor
+    actions = [{'state': 'suricata', 'tgt': _sensor_compound_plus_import()}]
+    _write_intent('rules_suricata', actions, path)
+    LOG.info('push_suricata: intent updated for path=%s', path)
+    return {}
@@ -17,6 +17,7 @@ include:
 so-redis:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-redis
    - user: socore
    - networks:
@@ -21,6 +21,9 @@ so-dockerregistry:
    - networks:
      - sobridge:
        - ipv4_address: {{ DOCKERMERGED.containers['so-dockerregistry'].ip }}
+    # Intentionally `always` (not unless-stopped) -- registry is critical infra
+    # and must come back up even if it was manually stopped. Do not homogenize
+    # to unless-stopped; see the container auto-restart section of the plan.
    - restart_policy: always
    - port_bindings:
      {% for BINDING in DOCKERMERGED.containers['so-dockerregistry'].port_bindings %}
@@ -1,6 +1,5 @@
 {% from 'vars/globals.map.jinja' import GLOBALS %}
-{# OL10 test path uses public repos; skip the SO repo state (which removes public repos and points at /nsm/repo) #}
-{% if GLOBALS.os == 'OEL' and GLOBALS.os_version|int == 9 %}
+{% if GLOBALS.os == 'OEL' %}
 include:
  - repo.client.oracle
 {% endif %}
@@ -3,7 +3,7 @@
 {% set SCHEDULE = salt['pillar.get']('healthcheck:schedule', 30) %}

 include:
-  - salt
+  - salt.minion

 {% if CHECKS and ENABLED %}
 salt_beacons:
@@ -14,12 +14,13 @@ salt_beacons:
    - defaults:
        CHECKS: {{ CHECKS }}
        SCHEDULE: {{ SCHEDULE }}
-    - watch_in: 
+    - watch_in:
      - service: salt_minion_service
 {% else %}
 salt_beacons:
  file.absent:
    - name: /etc/salt/minion.d/beacons.conf
-    - watch_in: 
+    - watch_in:
      - service: salt_minion_service
 {% endif %}
+
@@ -0,0 +1,11 @@
+reactor:
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/suricata/rules':
+    - salt://reactor/push_suricata.sls
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/suricata/rules/*':
+    - salt://reactor/push_suricata.sls
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/strelka/rules/compiled':
+    - salt://reactor/push_strelka.sls
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/strelka/rules/compiled/*':
+    - salt://reactor/push_strelka.sls
+  - 'salt/beacon/*/pillar_db/audit_settings':
+    - salt://reactor/push_pillar.sls
@@ -5,3 +5,11 @@ salt_bootstrap:
    - source: salt://salt/scripts/bootstrap-salt.sh
    - mode: 755
    - show_changes: False
+
+salt_sbin:
+  file.recurse:
+    - name: /usr/sbin
+    - source: salt://salt/tools/sbin
+    - user: 939
+    - group: 939
+    - file_mode: 755
@@ -1,4 +1,4 @@
 lasthighstate:
  file.touch:
    - name: /opt/so/log/salt/lasthighstate
-    - order: last
+    - order: 9001
@@ -10,10 +10,12 @@
 #    software that is protected by the license key."

 {% from 'allowed_states.map.jinja' import allowed_states %}
+{% from 'global/map.jinja' import GLOBALMERGED %}
 {% if sls in allowed_states %}

 include:
  - salt.minion
+  - salt.master.pyinotify
 {%   if 'vrt' in salt['pillar.get']('features', []) %}
  - salt.cloud
  - salt.cloud.reactor_config_hypervisor
@@ -62,6 +64,21 @@ engines_config:
    - name: /etc/salt/master.d/engines.conf
    - source: salt://salt/files/engines.conf

+{% if GLOBALMERGED.push.enabled %}
+reactor_pushstate_config:
+  file.managed:
+    - name: /etc/salt/master.d/reactor_pushstate.conf
+    - source: salt://salt/files/reactor_pushstate.conf
+    - watch_in:
+      - service: salt_master_service
+{% else %}
+reactor_pushstate_config:
+  file.absent:
+    - name: /etc/salt/master.d/reactor_pushstate.conf
+    - watch_in:
+      - service: salt_master_service
+{% endif %}
+
 # update the bootstrap script when used for salt-cloud
 salt_bootstrap_cloud:
  file.managed:
@@ -77,7 +94,7 @@ salt_master_service:
      - file: checkmine_engine
      - file: pillarWatch_engine
      - file: engines_config
-    - order: last
+    - order: 9002

 {% else %}

@@ -0,0 +1,20 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+pyinotify_module_package:
+  file.recurse:
+    - name: /opt/so/conf/salt/module_packages/pyinotify
+    - source: salt://salt/module_packages/pyinotify
+    - clean: True
+    - makedirs: True
+
+pyinotify_python_module_install:
+  cmd.run:
+    - name: /opt/saltstack/salt/bin/python3.10 -m pip install pyinotify --no-index --find-links=/opt/so/conf/salt/module_packages/pyinotify/ --upgrade
+    - onchanges:
+      - file: pyinotify_module_package
+    - failhard: True
+    - watch_in:
+      - service: salt_minion_service
@@ -2,4 +2,3 @@
 salt:
  minion:
    version: '3006.19'
-    check_threshold: 3600 # in seconds, threshold used for so-salt-minion-check. any value less than 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default
@@ -88,13 +88,17 @@ enable_startup_states:

 {% endif %}

-# this has to be outside the if statement above since there are <requisite>_in calls to this state
+# this has to be outside the if statement above since there are <requisite>_in calls to this state.
+# uses watch (not listen) so the restart fires in-state and its result lands on this state's
+# running entry; that is what lets wait_for_salt_minion_ready below detect any restart
+# uniformly via onchanges, regardless of whether the trigger came from these files or from
+# external watch_in's (e.g. beacons, master/pyinotify).
 salt_minion_service:
  service.running:
    - name: salt-minion
    - enable: True
    - onlyif: test "{{INSTALLEDSALTVERSION}}" == "{{SALTVERSION}}"
-    - listen:
+    - watch:
      - file: mine_functions
 {% if INSTALLEDSALTVERSION|string == SALTVERSION|string %}
      - file: set_log_levels
@@ -103,3 +107,17 @@ salt_minion_service:
      - file: signing_policy
 {% endif %}
    - order: last
+
+# block until the just-restarted salt-minion is back and can execute modules locally, so
+# follow-on jobs and the next highstate iteration do not race the restart. onchanges +
+# require on salt_minion_service catches every restart trigger uniformly because watch
+# mod_watch results replace the service state's running entry. wait logic lives in
+# /usr/sbin/so-salt-minion-wait (deployed by common_sbin from common/tools/sbin/).
+wait_for_salt_minion_ready:
+  cmd.run:
+    - name: /usr/sbin/so-salt-minion-wait
+    - onchanges:  # run only when salt_minion_service reported changes (i.e. a restart happened)
+      - service: salt_minion_service
+    - require:  # and only after salt_minion_service itself succeeded
+      - service: salt_minion_service
+    - order: last
@@ -0,0 +1,45 @@
+#!/bin/bash
+#
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Block until the local salt-minion service is back up and can execute modules locally.
+# Invoked from the wait_for_salt_minion_ready state in salt/minion/init.sls after
+# salt_minion_service fires its watch-driven mod_watch (a non-blocking systemctl restart),
+# so follow-on jobs and the next highstate iteration do not race the in-flight restart.
+#
+# Exit status: 0 once salt-minion is active and a local test.ping succeeds; 1 if that has
+# not happened within TIMEOUT seconds of wall-clock time.
+
+. /usr/sbin/so-common
+
+# Initial sleep gives the systemctl restart (--no-block by default for salt-minion on
+# >=3006.15) time to begin tearing down the old process before we probe for readiness.
+INITIAL_SLEEP=3
+# Overall budget in seconds, covering the initial sleep and the time spent inside probes.
+TIMEOUT=120
+# Value handed to salt-call --timeout for each readiness probe.
+PING_TIMEOUT=5
+
+# Track real elapsed time with bash's SECONDS counter instead of counting loop iterations:
+# a single probe can take several seconds, so adding 1 per iteration would let the loop
+# run well past TIMEOUT and report a misleading "ready after" figure.
+wait_start="$SECONDS"
+
+sleep "$INITIAL_SLEEP"
+
+elapsed=$((SECONDS - wait_start))
+while [ "$elapsed" -lt "$TIMEOUT" ]; do
+  if systemctl is-active --quiet salt-minion \
+     && salt-call --local --timeout="$PING_TIMEOUT" --out=quiet test.ping >/dev/null 2>&1; then
+    echo "salt-minion ready after $((SECONDS - wait_start))s"
+    exit 0
+  fi
+  sleep 1
+  elapsed=$((SECONDS - wait_start))
+done
+
+echo "salt-minion did not become ready within ${TIMEOUT}s" >&2
+exit 1
@@ -1,10 +1,26 @@
-{%   from 'vars/globals.map.jinja' import GLOBALS %}
+{% from 'vars/globals.map.jinja' import GLOBALS %}
+{% from 'global/map.jinja' import GLOBALMERGED %}

 highstate_schedule:
   schedule.present:
     - function: state.highstate
-    - minutes: 15
+    - hours: {{ GLOBALMERGED.push.highstate_interval_hours }}  # interval is read from the merged global push settings
     - maxrunning: 1
 {% if not GLOBALS.is_manager %}
-    - splay: 120
+    - splay: 1800  # non-managers only: random start delay of up to 1800 seconds
+{% endif %}
+
+{% if GLOBALS.is_manager and GLOBALMERGED.push.enabled %}
+push_drain_schedule:  # manager with push enabled: run so-push-drainer on a recurring schedule
+  schedule.present:
+    - function: cmd.run
+    - job_args:
+      - /usr/sbin/so-push-drainer
+    - seconds: {{ GLOBALMERGED.push.drain_interval }}  # interval between drainer runs, from the merged global push settings
+    - maxrunning: 1
+    - return_job: False  # do not send a job return to the master for each run
+{% elif GLOBALS.is_manager %}
+push_drain_schedule:  # manager with push disabled: make sure the drainer schedule is removed
+  schedule.absent:
+    - name: push_drain_schedule
 {% endif %}
@@ -14,6 +14,7 @@ include:
 so-sensoroni:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-soc:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - network_mode: host
    - binds:
      - /nsm/import:/nsm/import:rw
@@ -1519,6 +1519,12 @@ soc:
              serviceAccountJSON: ""
              serviceAccountLocation: ""
              healthTimeoutSeconds: 5
+        onionconfig:
+          saltstackDir: /opt/so/saltstack
+          bypassEnabled: false
+        postgres:
+          host:
+          password:
        salt:
          queueDir: /opt/sensoroni/queue
          timeoutMs: 45000
@@ -18,6 +18,7 @@ include:
 so-soc:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-soc:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: soc
    - name: so-soc
    - networks:
@@ -16,6 +16,13 @@
 {% do SOCMERGED.config.server.update({'additionalCA': MANAGERMERGED.additionalCA}) %}
 {% do SOCMERGED.config.server.update({'insecureSkipVerify': MANAGERMERGED.insecureSkipVerify}) %}

+{% if not SOCMERGED.config.server.modules.postgres.host %}
+{%   do SOCMERGED.config.server.modules.postgres.update({'host': GLOBALS.manager}) %}
+{% endif %}
+{% if not SOCMERGED.config.server.modules.postgres.password %}
+{%   do SOCMERGED.config.server.modules.postgres.update({'password': salt['pillar.get']('secrets:postgres_pass', '')}) %}
+{% endif %}
+
 {# if SOCMERGED.config.server.modules.cases == httpcase details come from the soc pillar #}
 {% if SOCMERGED.config.server.modules.cases != 'soc' %}
 {%   do SOCMERGED.config.server.modules.elastic.update({'casesEnabled': false}) %}
@@ -453,6 +453,26 @@ soc:
            description: Duration (in milliseconds) that must elapse after a grid node fails to check-in before the node will be marked offline (fault).
            global: True
            advanced: True
+        onionconfig:
+          saltstackDir:
+            description: Root directory containing the SaltStack tree that SOC reads and writes configuration from. Should not be changed under normal circumstances.
+            global: True
+            advanced: True
+          bypassEnabled:
+            description: When enabled, errors encountered while reading the SaltStack pillar tree (missing files, unreadable directories, etc.) are logged but do not prevent SOC from starting or serving settings. Intended for advanced troubleshooting and recovery scenarios when the pillar tree is partially unreadable.
+            global: True
+            advanced: True
+            forcedType: bool
+        postgres:
+          host:
+            description: Hostname or IP address of the PostgreSQL server used by SOC. Defaults to the manager hostname.
+            global: True
+            advanced: True
+          password:
+            description: Password used by SOC to authenticate to the PostgreSQL server. Defaults to the postgres superuser password seeded in the secrets pillar.
+            global: True
+            sensitive: True
+            advanced: True
        salt:
          longRelayTimeoutMs:
            description: Duration (in milliseconds) to wait for a response from the Salt API when executing tasks known for being long running before giving up and showing an error on the SOC UI.
@@ -818,6 +838,7 @@ soc:
          description: List of available external tools visible in the SOC UI. Each tool is defined in JSON object notation, and must include the "name" key and "link" key, where the link is the tool's URL.
          global: True
          advanced: True
+          multiline: True
          forcedType: "[]{}"
        exportNodeId:
          description: The node ID on which export jobs will be executed.
@@ -47,6 +47,10 @@ strelka_backend:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
+    # Intentionally `on-failure` (not unless-stopped) -- strelka backend shuts
+    # down cleanly during rule reloads and we do not want those clean exits to
+    # trigger an auto-restart. Do not homogenize; see the container
+    # auto-restart section of the plan.
    - restart_policy: on-failure
    - watch:
      - file: strelkasensorcompiledrules
@@ -15,6 +15,7 @@ include:
 strelka_coordinator:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-strelka-coordinator
    - networks:
      - sobridge:
@@ -15,6 +15,7 @@ include:
 strelka_filestream:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - binds:
      - /opt/so/conf/strelka/filestream/:/etc/strelka/:ro
      - /nsm/strelka:/nsm/strelka
@@ -15,6 +15,7 @@ include:
 strelka_frontend:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - binds:
      - /opt/so/conf/strelka/frontend/:/etc/strelka/:ro
      - /nsm/strelka/log/:/var/log/strelka/:rw
@@ -15,6 +15,7 @@ include:
 strelka_gatekeeper:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-strelka-gatekeeper
    - networks:
      - sobridge:
@@ -15,6 +15,7 @@ include:
 strelka_manager:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - binds:
      - /opt/so/conf/strelka/manager/:/etc/strelka/:ro
      {% if DOCKERMERGED.containers['so-strelka-manager'].custom_bind_mounts %}
@@ -18,6 +18,7 @@ so-suricata:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-suricata:{{ GLOBALS.so_version }}
    - privileged: True
+    - restart_policy: unless-stopped
    - environment:
      - INTERFACE={{ GLOBALS.sensor.interface }}
      {% if DOCKERMERGED.containers['so-suricata'].extra_env %}
@@ -7,6 +7,7 @@ so-tcpreplay:
  docker_container.running:
    - network_mode: "host"
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-tcpreplay:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-tcpreplay
    - user: root
    - interactive: True
@@ -18,6 +18,7 @@ include:
 so-telegraf:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-telegraf:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - user: 939
    - group_add: 939,920
    - environment:
@@ -31,7 +31,6 @@
    'so_model': INIT.GRAINS.get('sosmodel',''),
    'sensoroni_key': INIT.PILLAR.sensoroni.config.sensoronikey,
    'os': INIT.GRAINS.os,
-    'os_version': INIT.GRAINS.osmajorrelease,
    'os_family': INIT.GRAINS.os_family,
    'application_urls': {},
    'manager_roles': [
@@ -18,6 +18,7 @@ so-zeek:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-zeek:{{ GLOBALS.so_version }}
    - start: True
    - privileged: True
+    - restart_policy: unless-stopped
    {% if DOCKERMERGED.containers['so-zeek'].ulimits %}
    - ulimits:
    {%   for ULIMIT in DOCKERMERGED.containers['so-zeek'].ulimits %}
@@ -903,14 +903,14 @@ detect_cloud() {

 detect_os() {
 	title "Detecting Base OS"
-	if [ -f /etc/oracle-release ] && grep -qE "release (9|10)\b" /etc/oracle-release; then
+	if [ -f /etc/redhat-release ] && grep -q "Red Hat Enterprise Linux release 9" /etc/redhat-release && [ -f /etc/oracle-release ]; then
 		OS=oracle
-		OSVER=$(grep -oE "release [0-9]+" /etc/oracle-release | grep -oE "[0-9]+")
+		OSVER=9
 		is_oracle=true
 		is_rpm=true
 		is_supported=true
 	else
-		info "This OS is not supported. Security Onion requires Oracle Linux 9 or 10."
+		info "This OS is not supported. Security Onion requires Oracle Linux 9."
 		fail_setup
 	fi

@@ -1783,15 +1783,6 @@ ensure_pyyaml() {
 # - securityonion/salt/salt/minion.defaults.yaml

 securityonion_repo() {
-	if [[ "$OSVER" == "10" ]]; then
-		# TEST PATH: Oracle Linux 10 uses the public OL10 + EPEL + Docker CE repos.
-		# Keep the stock /etc/yum.repos.d/* in place, skip the SO mirror and local reposync.
-		gpg_rpm_import
-		logCmd "dnf -y install oracle-epel-release-el10"
-		logCmd "dnf -y config-manager --add-repo https://download.docker.com/linux/rhel/docker-ce.repo"
-		logCmd "dnf repolist"
-		return
-	fi
 	# Remove all the current repos
 	logCmd "dnf -v clean all"
 	logCmd "mkdir -vp /root/oldrepos"
@@ -1886,19 +1877,12 @@ saltify() {
 	info "Installing Salt $SALTVERSION"
 	chmod u+x ../salt/salt/scripts/bootstrap-salt.sh

-	# Normally Salt packages come from the SO mirror, so -r disables the bootstrap's own repo setup.
-	# On the OL10 test path there is no SO mirror, so let bootstrap configure the public Salt repo.
-	local saltrepoflag="-r"
-	if [[ "$OSVER" == "10" ]]; then
-		saltrepoflag=""
-	fi
-
 	if [[ $waitforstate ]]; then
 		# install all for a manager
-		retry 30 10 "bash ../salt/salt/scripts/bootstrap-salt.sh $saltrepoflag -M -X stable $SALTVERSION" || fail_setup
+		retry 30 10 "bash ../salt/salt/scripts/bootstrap-salt.sh -r -M -X stable $SALTVERSION" || fail_setup
 	else
 		# just a minion
-		retry 30 10 "bash ../salt/salt/scripts/bootstrap-salt.sh $saltrepoflag -X stable $SALTVERSION" || fail_setup
+		retry 30 10 "bash ../salt/salt/scripts/bootstrap-salt.sh -r -X stable $SALTVERSION" || fail_setup
 	fi

 	salt_install_module_deps
Author	SHA1	Message	Date
Josh Patterson	487e433589	allow full highstate on manager while master locked	2026-06-02 13:58:38 -04:00
Josh Patterson	3328ff362d	add some logging	2026-06-02 10:44:17 -04:00
Josh Patterson	8c17ae0f66	move so-salt-minion-wait	2026-06-01 14:48:54 -04:00
Josh Patterson	f54939b444	Replace inotify pillar watch with postgres audit_settings beacon The active-push feature detected pillar/settings changes via an inotify beacon on the manager watching /opt/so/saltstack/local/pillar. Replace that pillar watch with a custom salt beacon (pillar_db) that polls the SOC so_soc.audit_settings table on a monotonic id watermark, so changes made through SOC drive immediate pushes from the database instead of the files. The suricata/strelka rule inotify watches (and pyinotify) are kept unchanged, since rule-file edits are not recorded in audit_settings. - salt/_beacons/pillar_db.py: new beacon. Polls audit_settings via `docker exec so-postgres psql` (unix-socket trust auth), tracks the last processed id in /opt/so/state/pillar_db_watch.id, seeds to MAX(id) on first run (no history replay), and emits one event per new row. - salt/reactor/push_pillar.sls: consume setting_id/node_id from the beacon event instead of a file path. App = first dotted segment of setting_id, looked up in pillar_push_map.yaml. Empty node_id -> grid-wide actions as is; populated node_id -> the app's state(s) retargeted to that one node. - salt/manager/files/beacons_pushstate.conf.jinja: drop the pillar inotify block, add the pillar_db beacon (interval = push.drain_interval); keep the suricata/strelka inotify watches. - salt/salt/files/reactor_pushstate.conf: map salt/beacon/*/pillar_db/ audit_settings to push_pillar.sls; remove the pillar inotify reactor lines; keep suricata/strelka. The intent -> so-push-drainer -> orch.push_batch pipeline is unchanged. Verified end-to-end on a standalone: a grid-wide telegraf.output change re-applied telegraf fleetwide (container replaced), and a per-host ntp.config.servers change applied ntp to only that node.	2026-05-29 14:55:13 -04:00
Josh Patterson	d48a22e37e	Merge pull request #15944 from Security-Onion-Solutions/jertel/wip Jertel/wip	2026-05-28 14:01:42 -04:00
Josh Patterson	9a70a06b3b	Merge remote-tracking branch 'origin/3/dev' into jertel/wip	2026-05-28 13:55:12 -04:00
Mike Reeves	526d739b3b	Merge pull request #15940 from Security-Onion-Solutions/TOoSmOotH-patch-4 Remove outdated HOTFIX version number	2026-05-28 10:25:28 -04:00
Mike Reeves	68d783e760	Remove outdated HOTFIX version number	2026-05-28 10:24:47 -04:00
Mike Reeves	1e9b6b0975	Merge pull request #15939 from Security-Onion-Solutions/3/main main to dev for hotfix	2026-05-28 10:24:21 -04:00
Josh Patterson	bb8ae91d91	fix so-soc postgres bootstrap	2026-05-27 16:39:52 -04:00
Josh Patterson	93ffce98d7	add onionconfig and postgres modules to soc config	2026-05-27 15:07:25 -04:00
Josh Patterson	79987f3659	bootstrap so-soc db in postgres during soup	2026-05-27 13:55:30 -04:00
Josh Patterson	16055c4d88	Merge remote-tracking branch 'origin/3/dev' into jertel/wip	2026-05-27 09:18:33 -04:00
Josh Patterson	6393d08e86	merge	2026-05-27 08:59:28 -04:00
Mike Reeves	ffd34d4e0e	Merge pull request #15919 from Security-Onion-Solutions/TOoSmOotH-patch-2 Add 3.2.0 option to discussion template	2026-05-21 15:58:28 -04:00
Mike Reeves	aa78978740	Add 3.2.0 option to discussion template	2026-05-21 15:57:57 -04:00
Mike Reeves	75d4f5e496	Merge pull request #15918 from Security-Onion-Solutions/TOoSmOotH-patch-1 Bump version from 3.1.0 to 3.2.0	2026-05-21 15:49:08 -04:00
Mike Reeves	89a28d2cfe	Bump version from 3.1.0 to 3.2.0	2026-05-21 15:45:58 -04:00
Josh Patterson	730c828bec	Merge remote-tracking branch 'origin/jertel/wip' into saltthangs	2026-05-19 10:23:45 -04:00
Jason Ertel	e45ad45d73	Merge branch '3/dev' into jertel/wip	2026-05-14 18:33:40 -04:00
Jason Ertel	907f699721	state rename	2026-05-14 11:03:08 -04:00
Jason Ertel	e7a7047f71	Merge branch '3/dev' into jertel/wip	2026-05-14 11:01:36 -04:00
Josh Patterson	b4e5171415	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-14 08:03:45 -04:00
Jason Ertel	936295f1c4	Merge branch '3/dev' into jertel/wip	2026-05-13 17:28:25 -04:00
Jason Ertel	61ca60a94c	prep for soc db config	2026-05-13 17:28:07 -04:00
Josh Patterson	84decc1db6	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-13 14:09:15 -04:00
Josh Patterson	7d4d6a0756	prune images if so-docker-prune exists	2026-05-08 10:13:15 -04:00
Josh Patterson	66c0a662fc	convert wait to script	2026-05-08 09:26:42 -04:00
Josh Patterson	778cc055ea	wait for salt-minion service to be ready before finishing state run	2026-05-07 17:01:20 -04:00
Josh Patterson	932deab751	update the push map	2026-05-07 10:51:53 -04:00
Josh Patterson	1281f0ee37	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-06 09:46:12 -04:00
Josh Patterson	f774334b6c	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-06 08:16:41 -04:00
Josh Patterson	7fcace34c4	add sensoroni to push map	2026-04-30 16:09:08 -04:00
Josh Patterson	9541024eb7	fix broken things	2026-04-30 15:35:24 -04:00
Josh Patterson	0d166ef732	remove trailing slashes	2026-04-30 09:53:00 -04:00
Josh Patterson	f7d2994f8b	filter temp files	2026-04-30 09:16:22 -04:00
Josh Patterson	8f0757606d	include salt..minion	2026-04-29 16:42:19 -04:00
Josh Patterson	0a8f2e01a0	install pyinotify	2026-04-29 16:41:56 -04:00
Josh Patterson	4546d7bc52	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-29 14:28:19 -04:00
Josh Patterson	17849d8758	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-28 15:49:22 -04:00
Josh Patterson	d3d30a587c	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-28 15:30:31 -04:00
Josh Patterson	034711d148	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-28 10:47:29 -04:00
Mike Reeves	a0cf0489d6	reduce highstate frequency with active push for rules and pillars - schedule highstate every 2 hours (was 15 minutes); interval lives in global:push:highstate_interval_hours so the SOC admin UI can tune it and so-salt-minion-check derives its threshold as (interval + 1) * 3600 - add inotify beacon on the manager + master reactor + orch.push_batch that writes per-app intent files, with a so-push-drainer schedule on the manager that debounces, dedupes, and dispatches a single orchestration - pillar_push_map.yaml allowlists the apps whose pillar changes trigger an immediate targeted state.apply (targets verified against salt/top.sls); edits under pillar/minions/ trigger a state.highstate on that one minion - host-batch every push orchestration (batch: 25%, batch_wait: 15) so rule changes don't thundering-herd large fleets - new global:push:enabled kill-switch tears down the beacon, reactor config, and drainer schedule on the next highstate for operators who want to keep highstate-only behavior - set restart_policy: unless-stopped on 23 container states so docker recovers crashes without waiting for the next highstate; leave registry (always), strelka/backend (on-failure), kratos, and hydra alone with inline comments explaining why	2026-04-10 15:43:16 -04:00
Jason Ertel	613d31c8a6	merge	2026-03-05 11:52:09 -05:00
@@ -1 +1 @@
 .1.0
 .2.0