Merge remote-tracking branch 'origin/3/dev' into soupmod

Merge pull request #15982 from Security-Onion-Solutions/reyesj2/wip
don't create stack trace when set -e is disabled
2026-06-23 02:38:09 +02:00 · 2026-06-22 09:41:16 -04:00 · 2026-06-18 15:25:41 -05:00 · 2026-06-18 14:56:29 -05:00 · 2026-06-18 14:28:09 -04:00 · 2026-06-17 16:47:49 -05:00
67 changed files with 2280 additions and 145 deletions
@@ -0,0 +1,142 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Custom salt beacon that watches the SOC audit_settings table in postgres for
+# new settings changes and emits a beacon event per new row. This replaces the
+# inotify watch on /opt/so/saltstack/local/pillar -- instead of monitoring pillar
+# files on disk, we monitor the so_soc.audit_settings table that SOC writes to.
+#
+# Detection is poll-based with a monotonic `id` watermark persisted to
+# WATERMARK_FILE: each pass selects rows with id greater than the last id seen,
+# which makes it self-healing (a missed poll simply catches up on the next one).
+#
+# Each emitted event carries setting_id and node_id; the push_pillar reactor maps
+# setting_id -> app via pillar_push_map.yaml and writes a push intent, after which
+# the existing so-push-drainer / orch.push_batch pipeline takes over unchanged.
+
+import logging
+import os
+import subprocess
+
+log = logging.getLogger(__name__)
+
+WATERMARK_FILE = '/opt/so/state/pillar_db_watch.id'
+CONTAINER = 'so-postgres'
+DATABASE = 'so_soc'
+
+# Unaligned, tuples-only psql output with a field separator that cannot appear in
+# an id/setting_id/node_id, so we can split each row reliably.
+FIELD_SEP = '\x1f'
+
+
+def __virtual__():
+    # Loader hook: unconditionally reports this beacon module as available; no
+    # dependency or platform checks are performed here.
+    return True
+
+
+def validate(config):
+    # Accepts every beacon configuration: `config` is not inspected and the
+    # result is always the pair (True, 'valid').
+    return True, 'valid'
+
+
+def _read_watermark():
+    # Returns the last processed id, or None if the watermark has not been seeded.
+    try:
+        with open(WATERMARK_FILE, 'r') as f:
+            return int((f.read() or '').strip())
+    except (IOError, ValueError):
+        return None
+
+
+def _write_watermark(value):
+    try:
+        os.makedirs(os.path.dirname(WATERMARK_FILE), exist_ok=True)
+        tmp = WATERMARK_FILE + '.tmp'
+        with open(tmp, 'w') as f:
+            f.write(str(int(value)))
+        os.rename(tmp, WATERMARK_FILE)
+    except OSError:
+        log.exception('pillar_db beacon: failed to persist watermark to %s', WATERMARK_FILE)
+
+
+def _query(sql):
+    # Run a query against so_soc inside the so-postgres container over the unix
+    # socket (trust auth, no password). Returns stdout on success, or None on any
+    # failure so the caller can no-op and retry on the next interval.
+    cmd = [
+        'docker', 'exec', CONTAINER,
+        'psql', '-U', 'postgres', '-d', DATABASE,
+        '-tA', '-F', FIELD_SEP, '-c', sql,
+    ]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+    except subprocess.TimeoutExpired:
+        log.warning('pillar_db beacon: psql timed out')
+        return None
+    except Exception:
+        log.exception('pillar_db beacon: failed to exec psql')
+        return None
+    if result.returncode != 0:
+        log.warning('pillar_db beacon: psql failed (rc=%s): %s',
+                    result.returncode, (result.stderr or '').strip())
+        return None
+    return result.stdout
+
+
+def beacon(config):
+    retval = []
+
+    watermark = _read_watermark()
+
+    # First run / missing watermark: seed to the current MAX(id) and emit nothing
+    # so we never replay the entire settings history into a fleetwide push.
+    if watermark is None:
+        seed = _query('SELECT COALESCE(MAX(id), 0) FROM audit_settings;')
+        if seed is None:
+            return retval  # postgres not ready yet; retry next interval
+        try:
+            _write_watermark(int((seed or '0').strip() or 0))
+        except ValueError:
+            log.warning('pillar_db beacon: could not parse MAX(id) seed: %r', seed)
+        return retval
+
+    rows = _query(
+        "SELECT id, setting_id, COALESCE(node_id, '') FROM audit_settings "
+        "WHERE id > %d ORDER BY id;" % watermark
+    )
+    if rows is None:
+        return retval
+
+    max_id = watermark
+    for line in rows.splitlines():
+        # Do NOT str.strip() the whole line: Python treats the \x1f field
+        # separator (and \x1c-\x1e) as whitespace, so stripping would eat an
+        # empty trailing node_id field and make the row look malformed.
+        if not line.strip():
+            continue
+        parts = line.split(FIELD_SEP)
+        if len(parts) < 3:
+            log.warning('pillar_db beacon: skipping malformed row: %r', line)
+            continue
+        try:
+            row_id = int(parts[0])
+        except ValueError:
+            log.warning('pillar_db beacon: skipping row with non-int id: %r', line)
+            continue
+        setting_id = parts[1]
+        node_id = parts[2]
+        retval.append({
+            'tag': 'audit_settings',
+            'id': row_id,
+            'setting_id': setting_id,
+            'node_id': node_id,
+        })
+        if row_id > max_id:
+            max_id = row_id
+
+    if max_id > watermark:
+        _write_watermark(max_id)
+        log.info('pillar_db beacon: emitted %d change(s), watermark %d -> %d',
+                 len(retval), watermark, max_id)
+
+    return retval
@@ -130,6 +130,17 @@ common_sbin:
      - so-pcap-import
 {% endif %}

+# Pin physical NIC names by MAC (run-once) so a kernel upgrade can't renumber the
+# interfaces SO binds by name. The marker keeps it a one-time setup; an admin can
+# pre-create the marker to opt out.
+#
+# The command is skipped whenever /opt/so/state/nic_names_pinned already exists
+# (the `unless` check), and it requires the common_sbin and statedir file states.
+pin_nic_names:
+  cmd.run:
+    - name: /usr/sbin/so-nic-pin
+    - unless: 'test -e /opt/so/state/nic_names_pinned'
+    - require:
+      - file: common_sbin
+      - file: statedir
+
+
 common_sbin_jinja:
  file.recurse:
    - name: /usr/sbin
@@ -142,6 +142,11 @@ check_elastic_license() {
 	fi  
 }

+check_elasticsearch_responsive() {
+    retry 3 15 "so-elasticsearch-query / --output /dev/null --fail" ||
+        fail "Elasticsearch is not responding. Please review Elasticsearch logs /opt/so/log/elasticsearch/securityonion.log for more details. Additionally, consider running so-elasticsearch-troubleshoot."
+}
+
 check_salt_master_status() {
 	local count=0
    local attempts="${1:- 10}"
@@ -0,0 +1,76 @@
+#!/bin/bash
+#
+# so-nic-pin — pin physical NIC names by permanent MAC via classic by-MAC udev
+#              rules, so a kernel upgrade can't renumber them.
+#
+# Security Onion binds its management and monitor interfaces BY NAME in pillar
+# (host:mainint, sensor:mainint, and bond0 is built on a specific physical NIC).
+# A kernel upgrade can change the kernel/systemd-udevd predictable-naming output
+# and renumber those NICs (e.g. enp1s0 -> enp2s0), which breaks the grid: the
+# pillar references a name that no longer exists and bond/bridge bring-up fails.
+#
+# This writes /etc/udev/rules.d/70-persistent-net.rules pinning each PHYSICAL NIC
+# to its CURRENT name by its PERMANENT MAC, freezing the names across future kernel
+# changes. It only writes the rules file; it does NOT live-trigger a rename (the
+# rules apply on the next boot/kernel, and a live rename would be disruptive).
+#
+# Run-once: gated by the drop file /opt/so/state/nic_names_pinned. If the marker is
+# present the script does nothing, so an admin can pre-create it to opt out. Invoked
+# from the common state on every highstate; the marker keeps it a one-time setup.
+
+NET_RULES_FILE="/etc/udev/rules.d/70-persistent-net.rules"
+MARKER="/opt/so/state/nic_names_pinned"
+
+log() { echo -e "[so-nic-pin] $*"; }
+
+# Print one "<name> <permanent-mac>" line for every PHYSICAL NIC under /sys/class/net.
+# An interface counts as physical when it is backed by a real device (it has a
+# device/driver entry); that test leaves out bond0/sobridge/docker0/veth*, and lo is
+# skipped by name. Those virtual interfaces have dynamic MACs and must never be pinned.
+# The PERMANENT MAC is reported rather than the current one: an enslaved bond member's
+# current MAC is rewritten to the bond's, so matching on it would be wrong/ambiguous.
+# Lookup order: ethtool -P, then bonding_slave/perm_hwaddr, then the address attribute.
+physical_nics() {
+    local sys_path iface hwaddr
+    for sys_path in /sys/class/net/*; do
+        iface="${sys_path##*/}"
+        if [ "$iface" = "lo" ]; then
+            continue
+        fi
+        if [ ! -e "${sys_path}/device/driver" ]; then        # real device only
+            continue
+        fi
+        hwaddr="$(ethtool -P "$iface" 2>/dev/null | awk '/Permanent address/{print $NF}')"
+        # Each fallback runs only while the value so far is empty or all zeros.
+        if [ -z "$hwaddr" ] || [ "$hwaddr" = "00:00:00:00:00:00" ]; then
+            hwaddr="$(cat "${sys_path}/bonding_slave/perm_hwaddr" 2>/dev/null)"
+        fi
+        if [ -z "$hwaddr" ] || [ "$hwaddr" = "00:00:00:00:00:00" ]; then
+            hwaddr="$(cat "${sys_path}/address" 2>/dev/null)"
+        fi
+        # Still nothing usable: leave this interface out of the list.
+        if [ -z "$hwaddr" ] || [ "$hwaddr" = "00:00:00:00:00:00" ]; then
+            continue
+        fi
+        echo "$iface $hwaddr"
+    done
+}
+
+# Read "<name> <mac>" lines from stdin and print a classic by-MAC persistent-net udev
+# rules file on stdout: two header comment lines, then one rule per input line. Lines
+# whose name field is empty are ignored.
+render_net_rules() {
+    local iface hwaddr
+    local rule_fmt='SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="%s", NAME="%s"\n'
+    echo "# Generated by so-nic-pin: pin NIC names by MAC so kernel upgrades can't renumber them."
+    echo "# Security Onion binds its management/monitor interfaces by name; do not hand-edit."
+    while read -r iface hwaddr; do
+        if [ -z "$iface" ]; then
+            continue
+        fi
+        # The rule matches on the MAC (first %s) and assigns the name (second %s).
+        printf "$rule_fmt" "$hwaddr" "$iface"
+    done
+}
+
+# salt runs us as root; bail quietly otherwise
+if ! [ "$(id -u)" -eq 0 ]; then
+    exit 0
+fi
+
+# run-once guard (mirrors the state's unless)
+if [ -e "${MARKER}" ]; then
+    exit 0
+fi
+
+nics="$(physical_nics)"
+[ -n "${nics}" ] || {
+    log "no physical NICs detected — nothing to pin (will retry on next highstate)"
+    exit 0                                         # do NOT drop the marker; let it retry later
+}
+
+log "pinning physical NICs by permanent MAC:"
+echo "${nics}" | sed 's/^/    /'
+
+# Keep a copy of any existing rules file before it is overwritten.
+if [ -f "${NET_RULES_FILE}" ]; then
+    cp -f "${NET_RULES_FILE}" "${NET_RULES_FILE}.bak"
+fi
+
+if ! echo "${nics}" | render_net_rules > "${NET_RULES_FILE}"; then
+    log "ERROR: failed to write ${NET_RULES_FILE}"
+    exit 1
+fi
+
+# Drop the marker only after the rules file has been written.
+mkdir -p "$(dirname "${MARKER}")" && touch "${MARKER}"
+pinned_count="$(grep -c '^SUBSYSTEM' "${NET_RULES_FILE}")"
+log "wrote ${NET_RULES_FILE} (${pinned_count} NIC(s) pinned); dropped ${MARKER}"
@@ -1,5 +1,3 @@
-{% import_yaml 'salt/minion.defaults.yaml' as SALT_MINION_DEFAULTS -%}
-
 #!/bin/bash
 #
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
@@ -25,7 +23,8 @@ SYSTEM_START_TIME=$(date -d "$(</proc/uptime awk '{print $1}') seconds ago" +%s)
 LAST_HIGHSTATE_END=$([ -e "/opt/so/log/salt/lasthighstate" ] && date -r /opt/so/log/salt/lasthighstate +%s || echo 0)
 LAST_HEALTHCHECK_STATE_APPLY=$([ -e "/opt/so/log/salt/state-apply-test" ] && date -r /opt/so/log/salt/state-apply-test +%s || echo 0)
 # SETTING THRESHOLD TO ANYTHING UNDER 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default
-THRESHOLD={{SALT_MINION_DEFAULTS.salt.minion.check_threshold}} #within how many seconds the file /opt/so/log/salt/state-apply-test must have been touched/modified before the salt minion is restarted
+# THRESHOLD is derived from the global push highstate interval + 1 hour, so the minion-check grace period tracks the schedule automatically.
+THRESHOLD=$(( ({{ salt['pillar.get']('global:push:highstate_interval_hours', 2) }} + 1) * 3600 )) #within how many seconds the file /opt/so/log/salt/state-apply-test must have been touched/modified before the salt minion is restarted
 THRESHOLD_DATE=$((LAST_HEALTHCHECK_STATE_APPLY+THRESHOLD))

 logCmd() {
@@ -9,7 +9,8 @@
 prune_images:
  cmd.run:
    - name: so-docker-prune
-    - order: last
+    - onlyif: command -v /usr/sbin/so-docker-prune >/dev/null 2>&1
+    - order: 9000

 {% else %}

@@ -19,6 +19,7 @@ wait_for_elasticsearch:
 so-elastalert:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastalert:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: elastalert
    - name: so-elastalert
    - user: so-elastalert
@@ -15,6 +15,7 @@ include:
 so-elastic-fleet-package-registry:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-fleet-package-registry:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-elastic-fleet-package-registry
    - hostname: Fleet-package-reg-{{ GLOBALS.hostname }}
    - detach: True
@@ -16,6 +16,7 @@ include:
 so-elastic-agent:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-agent:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-elastic-agent
    - hostname: {{ GLOBALS.hostname }}
    - detach: True
@@ -9,7 +9,6 @@

 {% set CORE_ESFLEET_PACKAGES = ELASTICFLEETDEFAULTS.get('elasticfleet', {}).get('packages', {}) %}
 {% set ADDON_CONTENT_INTEGRATION_DEFAULTS = {} %}
-{% set DEBUG_STUFF = {} %}

 {% for pkg in ADDON_CONTENT_PACKAGE_COMPONENTS %}
 {%   if pkg.name in CORE_ESFLEET_PACKAGES %}
@@ -42,6 +42,7 @@ elasticagent_syncartifacts:
 so-elastic-fleet:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-agent:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-elastic-fleet
    - hostname: FleetServer-{{ GLOBALS.hostname }}
    - detach: True
@@ -9,7 +9,6 @@

 {% set CORE_ESFLEET_PACKAGES = ELASTICFLEETDEFAULTS.get('elasticfleet', {}).get('packages', {}) %}
 {% set ADDON_INPUT_INTEGRATION_DEFAULTS = {} %}
-{% set DEBUG_STUFF = {} %}

 {% for pkg in ADDON_INPUT_PACKAGE_COMPONENTS %}
 {%   if pkg.name in CORE_ESFLEET_PACKAGES %}
@@ -116,7 +115,6 @@


 {%         do ADDON_INPUT_INTEGRATION_DEFAULTS.update({integration_key: integration_defaults}) %}
-{%         do DEBUG_STUFF.update({integration_key: "Generating defaults for "+ pkg.name })%}
 {%       endfor %}
 {%     endif %}
 {%   endif %}
@@ -133,6 +133,18 @@ so-elasticsearch-templates:
      - docker_container: so-elasticsearch
      - file: elasticsearch_sbin_jinja

+# Run /usr/sbin/so-elasticsearch-dlm-apply from /opt/so. It requires the so-elasticsearch
+# container, the elasticsearch_sbin_jinja file state, and the so-elasticsearch-templates
+# command. On failure the state is retried: 3 attempts with a 10 second interval.
+# NOTE(review): presumably the script applies the data_stream_lifecycle retention
+# settings to Elasticsearch -- confirm against the script itself.
+so-elasticsearch-dlm-apply:
+  cmd.run:
+    - name: /usr/sbin/so-elasticsearch-dlm-apply
+    - cwd: /opt/so
+    - require:
+      - docker_container: so-elasticsearch
+      - file: elasticsearch_sbin_jinja
+      - cmd: so-elasticsearch-templates
+    - retry:
+        attempts: 3
+        interval: 10
+
+
 so-elasticsearch-pipelines:
  cmd.run:
    - name: /usr/sbin/so-elasticsearch-pipelines {{ GLOBALS.hostname }}
@@ -153,7 +165,8 @@ so-elasticsearch-roles-load:
 {%    set ap = "absent" %}
 {%  endif %}
 {%  if grains.role in ['so-eval', 'so-standalone', 'so-heavynode'] %}
-{%    if ELASTICSEARCHMERGED.index_clean %}
+{#    Remove so-elasticsearch-indices-delete script when using DLM #}
+{%    if ELASTICSEARCHMERGED.index_clean and ELASTICSEARCHMERGED.data_retention_method == "ILM" %}
 {%      set ap = "present" %}
 {%    else %}
 {%      set ap = "absent" %}
@@ -2,6 +2,7 @@ elasticsearch:
  enabled: false
  version: 9.3.3
  index_clean: true
+  data_retention_method: DLM
  vm:
    max_map_count: 1048576
  config:
@@ -63,6 +64,8 @@ elasticsearch:
            verification_mode: none
  index_settings:
    global_overrides:
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        template:
          settings:
@@ -143,6 +146,8 @@ elasticsearch:
                order: desc
    so-common:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -304,6 +309,8 @@ elasticsearch:
              number_of_shards: 1
    so-assistant-chat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: ""
      index_template:
        composed_of:
        - assistant-chat-mappings
@@ -344,6 +351,8 @@ elasticsearch:
            min_age: 0ms
    so-assistant-session:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: ""
      index_template:
        composed_of:
        - assistant-session-mappings
@@ -497,6 +506,8 @@ elasticsearch:
            min_age: 30d
    so-idh:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -605,6 +616,8 @@ elasticsearch:
            min_age: 30d
    so-import:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -787,6 +800,8 @@ elasticsearch:
            min_age: 0ms
    so-kismet:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - kismet-mappings
@@ -836,6 +851,8 @@ elasticsearch:
            min_age: 30d
    so-kratos:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -904,6 +921,8 @@ elasticsearch:
            min_age: 30d
    so-hydra:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -1049,6 +1068,8 @@ elasticsearch:
            min_age: 0ms
    so-logs:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - so-data-streams-mappings
@@ -1129,6 +1150,8 @@ elasticsearch:
            min_age: 30d
    so-logs-detections_x_alerts:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - so-data-streams-mappings
@@ -1192,6 +1215,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1307,6 +1332,8 @@ elasticsearch:
            min_age: 30d
    so-elastic-agent-monitor:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1369,6 +1396,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_apm_server:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.apm_server@package
@@ -1433,6 +1462,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_auditbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.auditbeat@package
@@ -1497,6 +1528,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_cloudbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.cloudbeat@package
@@ -1561,6 +1594,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_endpoint_security:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1620,6 +1655,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_filebeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1679,6 +1716,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_fleet_server:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1735,6 +1774,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_heartbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.heartbeat@package
@@ -1799,6 +1840,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_metricbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1858,6 +1901,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_osquerybeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1917,6 +1962,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_packetbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.packetbeat@package
@@ -1981,6 +2028,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elasticsearch_x_server:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elasticsearch.server@package
@@ -2045,10 +2094,13 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_actions:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - .logs-endpoint.actions@package
        - .logs-endpoint.actions@custom
+        - endpoint@custom
        - event-mappings
        - so-fleet_integrations.ip_mappings-1
        - so-fleet_globals-1
@@ -2058,8 +2110,9 @@ elasticsearch:
          hidden: false
        ignore_missing_component_templates:
        - .logs-endpoint.actions@custom
+        - endpoint@custom
        index_patterns:
-        - logs-endpoint.actions-*
+        - .logs-endpoint.actions-*
        priority: 501
        template:
          settings:
@@ -2104,10 +2157,13 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_action_x_responses:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - .logs-endpoint.action.responses@package
        - .logs-endpoint.action.responses@custom
+        - endpoint@custom
        - event-mappings
        - so-fleet_integrations.ip_mappings-1
        - so-fleet_globals-1
@@ -2117,14 +2173,15 @@ elasticsearch:
          hidden: false
        ignore_missing_component_templates:
        - .logs-endpoint.action.responses@custom
+        - endpoint@custom
        index_patterns:
-        - logs-endpoint.action.responses-*
+        - .logs-endpoint.action.responses-*
        priority: 501
        template:
          settings:
            index:
              lifecycle:
-                name: so-logs-endpoint.actions-logs
+                name: so-logs-endpoint.action.responses-logs
              mapping:
                total_fields:
                  limit: 5000
@@ -2163,6 +2220,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_alerts:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.alerts@package
@@ -2222,6 +2281,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_diagnostic_x_collection:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - .logs-endpoint.diagnostic.collection@package
@@ -2297,6 +2358,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_api:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.api@package
@@ -2356,6 +2419,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_file:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.file@package
@@ -2415,6 +2480,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_library:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.library@package
@@ -2474,6 +2541,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_network:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.network@package
@@ -2533,6 +2602,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_process:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.process@package
@@ -2592,6 +2663,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_registry:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.registry@package
@@ -2651,6 +2724,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_security:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.security@package
@@ -2710,6 +2785,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_heartbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - .logs-endpoint.heartbeat@package
@@ -2769,6 +2846,8 @@ elasticsearch:
            min_age: 30d
    so-logs-http_endpoint_x_generic:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-http_endpoint.generic@package
@@ -2817,6 +2896,8 @@ elasticsearch:
            min_age: 30d
    so-logs-httpjson_x_generic:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-httpjson.generic@package
@@ -2882,6 +2963,8 @@ elasticsearch:
              number_of_replicas: 0
    so-logs-osquery-manager_x_action_x_responses:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        _meta:
          managed: true
@@ -2953,6 +3036,8 @@ elasticsearch:
              number_of_replicas: 0
    so-logs-osquery-manager_x_result:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        _meta:
          managed: true
@@ -3005,6 +3090,8 @@ elasticsearch:
            min_age: 30d
    so-logs-soc:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -3113,6 +3200,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_application:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3162,6 +3251,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_auth:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3211,6 +3302,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_security:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3260,6 +3353,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_syslog:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3309,6 +3404,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_system:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3358,6 +3455,8 @@ elasticsearch:
            min_age: 30d
    so-logs-windows_x_forwarded:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-windows.forwarded@package
@@ -3405,6 +3504,8 @@ elasticsearch:
            min_age: 30d
    so-logs-windows_x_powershell:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-windows.powershell@package
@@ -3452,6 +3553,8 @@ elasticsearch:
            min_age: 30d
    so-logs-windows_x_powershell_operational:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-windows.powershell_operational@package
@@ -3499,6 +3602,8 @@ elasticsearch:
            min_age: 30d
    so-logs-windows_x_sysmon_operational:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-windows.sysmon_operational@package
@@ -3546,6 +3651,8 @@ elasticsearch:
            min_age: 30d
    so-logs-winlog_x_winlog:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-winlog.winlog@package
@@ -3594,6 +3701,8 @@ elasticsearch:
            min_age: 30d
    so-logstash:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -3709,6 +3818,8 @@ elasticsearch:
            min_age: 30d
    so-metrics-endpoint_x_metadata:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics-endpoint.metadata@package
@@ -3756,6 +3867,8 @@ elasticsearch:
            min_age: 30d
    so-metrics-endpoint_x_metrics:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics-endpoint.metrics@package
@@ -3803,6 +3916,8 @@ elasticsearch:
            min_age: 30d
    so-metrics-endpoint_x_policy:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics-endpoint.policy@package
@@ -3850,6 +3965,8 @@ elasticsearch:
            min_age: 30d
    so-metrics-fleet_server_x_agent_status:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics@tsdb-settings
@@ -3874,6 +3991,8 @@ elasticsearch:
              number_of_replicas: 0
    so-metrics-fleet_server_x_agent_versions:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics@tsdb-settings
@@ -3898,6 +4017,8 @@ elasticsearch:
              number_of_replicas: 0
    so-redis:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -3958,13 +4079,10 @@ elasticsearch:
        - vulnerability-mappings
        - common-settings
        - common-dynamic-mappings
-        - logs-redis.log@package
-        - logs-redis.log@custom
        data_stream:
          allow_custom_routing: false
          hidden: false
-        ignore_missing_component_templates:
-        - logs-redis.log@custom
+        ignore_missing_component_templates: []
        index_patterns:
        - logs-redis.log*
        priority: 501
@@ -4016,6 +4134,8 @@ elasticsearch:
            min_age: 30d
    so-strelka:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -4133,6 +4253,8 @@ elasticsearch:
            min_age: 30d
    so-suricata:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -4249,6 +4371,8 @@ elasticsearch:
            min_age: 30d
    so-suricata_x_alerts:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -4365,6 +4489,8 @@ elasticsearch:
            min_age: 30d
    so-syslog:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -4481,6 +4607,8 @@ elasticsearch:
            min_age: 30d
    so-zeek:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -24,6 +24,7 @@ include:
 so-elasticsearch:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elasticsearch:{{ ELASTICSEARCHMERGED.version }}
+    - restart_policy: unless-stopped
    - hostname: elasticsearch
    - name: so-elasticsearch
    - user: elasticsearch
@@ -4,6 +4,13 @@ elasticsearch:
    forcedType: bool
    advanced: True
    helpLink: elasticsearch
+  data_retention_method:
+    description: Method for data retention. Options are ILM or DLM. For single-node deployments and most distributed grid users, DLM is the recommended option because it simplifies management. Those with more complex use cases may prefer ILM, which allows more granular control but requires more management overhead.
+    options:
+    - ILM
+    - DLM
+    forcedType: string
+    global: True
  version:
    description: "This specifies the version of the following containers: so-elastic-fleet-package-registry, so-elastic-agent, so-elastic-fleet, so-kibana, so-logstash and so-elasticsearch. Modifying this value in the Elasticsearch defaults.yaml will result in catastrophic grid failure."
    readonly: True
@@ -13,7 +20,7 @@ elasticsearch:
    description: Specify the memory heap size in (m)egabytes for Elasticsearch.
    helpLink: elasticsearch
  index_clean:
-    description: Determines if indices should be considered for deletion by available disk space in the cluster. Otherwise, indices will only be deleted by the age defined in the ILM settings. This setting only applies to EVAL, STANDALONE, and HEAVY NODE installations. Other installations can only use ILM settings.
+    description: Determines if indices should be considered for deletion by available disk space in the cluster. Otherwise, data is retained by the configured lifecycle settings. This setting only applies to EVAL, STANDALONE, and HEAVY NODE installations. Other installations use lifecycle settings only.
    forcedType: bool
    helpLink: elasticsearch
  vm:
@@ -139,6 +146,23 @@ elasticsearch:
    custom010: *pipelines
  index_settings:
    global_overrides:
+      data_stream_lifecycle:
+        data_retention:
+          description: |
+            The retention period for all data streams. Retention does not define when the data will be removed, only the minimum period of time it will be kept.
+
+            Use a number followed by a time unit, such as 7d. Leave blank for indefinite retention where supported.
+
+            Configured retention period also affects the frequency of rolling over data streams.
+              - If retention is less than or equal to 1 day, max_age will be 1 hour
+              - If retention is less than or equal to 14 days, max_age will be 1 day
+              - If retention is less than or equal to 90 days, max_age will be 7 days
+              - If retention is greater than 90 days, max_age will be 30 days
+          forcedType: string
+          allowedNodeTypes:
+            - heavynode
+          regex: ^$|^[0-9]{1,5}(?:d|h|m|s)$
+          regexFailureMessage: Must be blank or a number followed by d, h, m, or s, such as 7d.
      index_template:
        template:
          settings:
@@ -311,13 +335,30 @@ elasticsearch:
              forcedType: string
              global: True
              helpLink: elasticsearch
-    so-logs: &indexSettings
+    so-logs: &dataStreamSettings
      index_sorting:
        description: Sorts the index by event time, at the cost of additional processing resource consumption.
        forcedType: bool
        global: True
        advanced: True
        helpLink: elasticsearch
+      data_stream_lifecycle:
+        data_retention:
+          description: |
+            The retention period for this data stream. Retention does not define when the data will be removed, only the minimum period of time it will be kept.
+
+            Use a number followed by a time unit, such as 7d. Leave blank for indefinite retention where supported.
+
+            Configured retention period also affects the frequency of rolling over this data stream.
+              - If retention is less than or equal to 1 day, max_age will be 1 hour
+              - If retention is less than or equal to 14 days, max_age will be 1 day
+              - If retention is less than or equal to 90 days, max_age will be 7 days
+              - If retention is greater than 90 days, max_age will be 30 days
+          forcedType: string
+          allowedNodeTypes:
+            - heavynode
+          regex: ^$|^[0-9]{1,5}(?:d|h|m|s)$
+          regexFailureMessage: Must be blank or a number followed by d, h, m, or s, such as 7d.
      index_template:
        index_patterns:
          description: Patterns for matching multiple indices or tables.
@@ -335,6 +376,14 @@ elasticsearch:
                global: True
                advanced: True
                helpLink: elasticsearch
+              auto_expand_replicas:
+                description: Automatically expand the number of replicas based on the number of data nodes in the cluster. This can help ensure high availability as the cluster scales up or down.
+                forcedType: string
+                regex: "^(0-[1-9]|1-[2-9]|2-[3-9]|3-[4-9]|4-[5-9]|5-[6-9]|6-[7-9]|7-[89]|8-9|[0-9]-all|false)$"
+                regexFailureMessage: Must be in the format of "x-y" where x is minimum number of replicas and y is maximum number of replicas, or "0-all" to specify a minimum of 0 and no maximum, or "false" to disable automatic replica expansion.
+                global: True
+                advanced: True
+                helpLink: elasticsearch
              mapping:
                total_fields:
                  limit:
@@ -596,65 +645,349 @@ elasticsearch:
            global: True
            advanced: True
            helpLink: elasticsearch
-    so-logs-system_x_auth: *indexSettings
-    so-logs-system_x_syslog: *indexSettings
-    so-logs-system_x_system: *indexSettings
-    so-logs-system_x_application: *indexSettings
-    so-logs-system_x_security: *indexSettings
-    so-logs-windows_x_forwarded: *indexSettings
-    so-logs-windows_x_powershell: *indexSettings
-    so-logs-windows_x_powershell_operational: *indexSettings
-    so-logs-windows_x_sysmon_operational: *indexSettings
-    so-logs-winlog_x_winlog: *indexSettings
-    so-logs-detections_x_alerts: *indexSettings
-    so-logs-http_endpoint_x_generic: *indexSettings
-    so-logs-httpjson_x_generic: *indexSettings
-    so-logs-osquery-manager-actions: *indexSettings
-    so-logs-osquery-manager-action_x_responses: *indexSettings
-    so-logs-osquery-manager_x_action_x_responses: *indexSettings
-    so-logs-osquery-manager_x_result: *indexSettings
-    so-logs-elastic_agent_x_apm_server: *indexSettings
-    so-logs-elastic_agent_x_auditbeat: *indexSettings
-    so-logs-elastic_agent_x_cloudbeat: *indexSettings
-    so-logs-elastic_agent_x_endpoint_security: *indexSettings
-    so-logs-endpoint_x_alerts: *indexSettings
-    so-logs-endpoint_x_events_x_api: *indexSettings
-    so-logs-endpoint_x_events_x_file: *indexSettings
-    so-logs-endpoint_x_events_x_library: *indexSettings
-    so-logs-endpoint_x_events_x_network: *indexSettings
-    so-logs-endpoint_x_events_x_process: *indexSettings
-    so-logs-endpoint_x_events_x_registry: *indexSettings
-    so-logs-endpoint_x_events_x_security: *indexSettings
-    so-logs-elastic_agent_x_filebeat: *indexSettings
-    so-logs-elastic_agent_x_fleet_server: *indexSettings
-    so-logs-elastic_agent_x_heartbeat: *indexSettings
-    so-logs-elastic_agent: *indexSettings
-    so-logs-elastic_agent_x_metricbeat: *indexSettings
-    so-logs-elastic_agent_x_osquerybeat: *indexSettings
-    so-logs-elastic_agent_x_packetbeat: *indexSettings
-    so-logs-elasticsearch_x_server: *indexSettings
-    so-metrics-endpoint_x_metadata: *indexSettings
-    so-metrics-endpoint_x_metrics: *indexSettings
-    so-metrics-endpoint_x_policy: *indexSettings
-    so-metrics-nginx_x_stubstatus: *indexSettings
-    so-metrics-vsphere_x_datastore: *indexSettings
-    so-metrics-vsphere_x_host: *indexSettings
-    so-metrics-vsphere_x_virtualmachine: *indexSettings
-    so-case: *indexSettings
-    so-common: *indexSettings
-    so-endgame: *indexSettings
-    so-idh: *indexSettings
-    so-suricata: *indexSettings
-    so-suricata_x_alerts: *indexSettings
-    so-import: *indexSettings
-    so-kratos: *indexSettings
-    so-hydra: *indexSettings
-    so-kismet: *indexSettings
-    so-logstash: *indexSettings
-    so-redis: *indexSettings
-    so-strelka: *indexSettings
-    so-syslog: *indexSettings
-    so-zeek: *indexSettings
+    so-logs-system_x_auth: *dataStreamSettings
+    so-logs-system_x_syslog: *dataStreamSettings
+    so-logs-system_x_system: *dataStreamSettings
+    so-logs-system_x_application: *dataStreamSettings
+    so-logs-system_x_security: *dataStreamSettings
+    so-logs-windows_x_forwarded: *dataStreamSettings
+    so-logs-windows_x_powershell: *dataStreamSettings
+    so-logs-windows_x_powershell_operational: *dataStreamSettings
+    so-logs-windows_x_sysmon_operational: *dataStreamSettings
+    so-logs-winlog_x_winlog: *dataStreamSettings
+    so-logs-detections_x_alerts: *dataStreamSettings
+    so-logs-http_endpoint_x_generic: *dataStreamSettings
+    so-logs-httpjson_x_generic: *dataStreamSettings
+    so-logs-osquery-manager-actions: *dataStreamSettings
+    so-logs-osquery-manager-action_x_responses: *dataStreamSettings
+    so-logs-osquery-manager_x_action_x_responses: *dataStreamSettings
+    so-logs-osquery-manager_x_result: *dataStreamSettings
+    so-logs-elastic_agent_x_apm_server: *dataStreamSettings
+    so-logs-elastic_agent_x_auditbeat: *dataStreamSettings
+    so-logs-elastic_agent_x_cloudbeat: *dataStreamSettings
+    so-logs-elastic_agent_x_endpoint_security: *dataStreamSettings
+    so-logs-endpoint_x_alerts: *dataStreamSettings
+    so-logs-endpoint_x_events_x_api: *dataStreamSettings
+    so-logs-endpoint_x_events_x_file: *dataStreamSettings
+    so-logs-endpoint_x_events_x_library: *dataStreamSettings
+    so-logs-endpoint_x_events_x_network: *dataStreamSettings
+    so-logs-endpoint_x_events_x_process: *dataStreamSettings
+    so-logs-endpoint_x_events_x_registry: *dataStreamSettings
+    so-logs-endpoint_x_events_x_security: *dataStreamSettings
+    so-logs-elastic_agent_x_filebeat: *dataStreamSettings
+    so-logs-elastic_agent_x_fleet_server: *dataStreamSettings
+    so-logs-elastic_agent_x_heartbeat: *dataStreamSettings
+    so-logs-elastic_agent: *dataStreamSettings
+    so-logs-elastic_agent_x_metricbeat: *dataStreamSettings
+    so-logs-elastic_agent_x_osquerybeat: *dataStreamSettings
+    so-logs-elastic_agent_x_packetbeat: *dataStreamSettings
+    so-logs-elasticsearch_x_server: *dataStreamSettings
+    so-metrics-endpoint_x_metadata: *dataStreamSettings
+    so-metrics-endpoint_x_metrics: *dataStreamSettings
+    so-metrics-endpoint_x_policy: *dataStreamSettings
+    so-metrics-nginx_x_stubstatus: *dataStreamSettings
+    so-metrics-vsphere_x_datastore: *dataStreamSettings
+    so-metrics-vsphere_x_host: *dataStreamSettings
+    so-metrics-vsphere_x_virtualmachine: *dataStreamSettings
+    so-common: *dataStreamSettings
+    so-endgame: *dataStreamSettings
+    so-idh: *dataStreamSettings
+    so-suricata: *dataStreamSettings
+    so-suricata_x_alerts: *dataStreamSettings
+    so-import: *dataStreamSettings
+    so-kratos: *dataStreamSettings
+    so-hydra: *dataStreamSettings
+    so-kismet: *dataStreamSettings
+    so-logstash: *dataStreamSettings
+    so-redis: *dataStreamSettings
+    so-strelka: *dataStreamSettings
+    so-syslog: *dataStreamSettings
+    so-zeek: *dataStreamSettings
+    # Managed SOC integration annotations are inserted below this line. Referencing '*dataStreamSettings'
+    so-case: &indexSettings
+      index_sorting:
+        description: Sorts the index by event time, at the cost of additional processing resource consumption.
+        forcedType: bool
+        global: True
+        advanced: True
+        helpLink: elasticsearch
+      index_template:
+        index_patterns:
+          description: Patterns for matching multiple indices or tables.
+          forcedType: "[]string"
+          multiline: True
+          global: True
+          advanced: True
+          helpLink: elasticsearch
+        template:
+          settings:
+            index:
+              number_of_replicas:
+                description: Number of replicas required for this index. Multiple replicas protects against data loss, but also increases storage costs.
+                forcedType: int
+                global: True
+                advanced: True
+                helpLink: elasticsearch
+              auto_expand_replicas:
+                description: Automatically expand the number of replicas based on the number of data nodes in the cluster. This can help ensure high availability as the cluster scales up or down.
+                forcedType: string
+                regex: "^(0-[1-9]|1-[2-9]|2-[3-9]|3-[4-9]|4-[5-9]|5-[6-9]|6-[7-9]|7-[89]|8-9|[0-9]-all|false)$"
+                regexFailureMessage: Must be in the format of "x-y" where x is minimum number of replicas and y is maximum number of replicas, or "0-all" to specify a minimum of 0 and no maximum, or "false" to disable automatic replica expansion.
+                global: True
+                advanced: True
+                helpLink: elasticsearch
+              mapping:
+                total_fields:
+                  limit:
+                    description: Max number of fields that can exist on a single index. Larger values will consume more resources.
+                    global: True
+                    advanced: True
+                    helpLink: elasticsearch
+              refresh_interval:
+                description: Seconds between index refreshes. Shorter intervals can cause query performance to suffer since this is a synchronous and resource-intensive operation.
+                global: True
+                advanced: True
+                helpLink: elasticsearch
+              number_of_shards:
+                description: Number of shards required for this index. Using multiple shards increases fault tolerance, but also increases storage and network costs.
+                global: True
+                advanced: True
+                helpLink: elasticsearch
+              sort:
+                field:
+                  description: The field to sort by. Must set index_sorting to True.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+                order:
+                  description: The order to sort by. Must set index_sorting to True.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+          mappings:
+            _meta:
+              package:
+                name:
+                  description: Meta settings for the mapping.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              managed_by:
+                  description: Meta settings for the mapping.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              managed:
+                  description: Meta settings for the mapping.
+                  forcedType: bool
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+        composed_of:
+          description: The index template is composed of these component templates.
+          forcedType: "[]string"
+          global: True
+          advanced: True
+          helpLink: elasticsearch
+        priority:
+          description: The priority of the index template.
+          forcedType: int
+          global: True
+          advanced: True
+          helpLink: elasticsearch
+      policy:
+        phases:
+          hot:
+            min_age:
+              description: Minimum age of index. This determines when the index should be moved to the hot tier.
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+            actions:
+              set_priority:
+                priority:
+                  description: Priority of index. This is used for recovery after a node restart. Indices with higher priorities are recovered before indices with lower priorities.
+                  forcedType: int
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              rollover:
+                max_age:
+                  description: Maximum age of index.  Once an index reaches this limit, it will be rolled over into a new index.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+                max_primary_shard_size:
+                  description: Maximum primary shard size. Once an index reaches this limit, it will be rolled over into a new index.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              shrink:
+                method:
+                  description: Shrink the index to a new index with fewer primary shards. Shrink operation is by count or size.
+                  options:
+                  - COUNT
+                  - SIZE
+                  global: True
+                  advanced: True
+                  forcedType: string
+                number_of_shards:
+                  title: shard count
+                  description: Desired shard count. Note that this value is only used when the shrink method selected is 'COUNT'.
+                  global: True
+                  forcedType: int
+                  advanced: True
+                max_primary_shard_size:
+                  title: max shard size
+                  description: Desired shard size in gb/tb/pb eg. 100gb. Note that this value is only used when the shrink method selected is 'SIZE'.
+                  regex: ^[0-9]+(?:gb|tb|pb)$
+                  global: True
+                  forcedType: string
+                  advanced: True
+                allow_write_after_shrink:
+                  description: Allow writes after shrink.
+                  global: True
+                  forcedType: bool
+                  default: False
+                  advanced: True
+              forcemerge:
+                max_num_segments:
+                  description: Reduce the number of segments in each index shard and clean up deleted documents.
+                  global: True
+                  forcedType: int
+                  advanced: True
+                index_codec:
+                  title: compression
+                  description: Use higher compression for stored fields at the cost of slower performance.
+                  forcedType: bool
+                  global: True
+                  default: False
+                  advanced: True
+          warm:
+            min_age:
+              description: Minimum age of index. ex. 30d - This determines when the index should be moved to the warm tier. Nodes in the warm tier generally don’t need to be as fast as those in the hot tier. It’s important to note that this is calculated relative to the rollover date (NOT the original creation date of the index). For example, if you have an index that is set to rollover after 30 days and warm min_age set to 30 then there will be 30 days from index creation to rollover and then an additional 30 days before moving to warm tier.
+              regex: ^[0-9]{1,5}d$
+              forcedType: string
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+            actions:
+              set_priority:
+                priority:
+                  description: Priority of index. This is used for recovery after a node restart. Indices with higher priorities are recovered before indices with lower priorities.
+                  forcedType: int
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              rollover:
+                max_age:
+                  description: Maximum age of index.  Once an index reaches this limit, it will be rolled over into a new index.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+                max_primary_shard_size:
+                  description: Maximum primary shard size. Once an index reaches this limit, it will be rolled over into a new index.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              shrink:
+                method:
+                  description: Shrink the index to a new index with fewer primary shards. Shrink operation is by count or size.
+                  options:
+                  - COUNT
+                  - SIZE
+                  global: True
+                  advanced: True
+                number_of_shards:
+                  title: shard count
+                  description: Desired shard count. Note that this value is only used when the shrink method selected is 'COUNT'.
+                  global: True
+                  forcedType: int
+                  advanced: True
+                max_primary_shard_size:
+                  title: max shard size
+                  description: Desired shard size in gb/tb/pb eg. 100gb. Note that this value is only used when the shrink method selected is 'SIZE'.
+                  regex: ^[0-9]+(?:gb|tb|pb)$
+                  global: True
+                  forcedType: string
+                  advanced: True
+                allow_write_after_shrink:
+                  description: Allow writes after shrink.
+                  global: True
+                  forcedType: bool
+                  default: False
+                  advanced: True
+              forcemerge:
+                max_num_segments:
+                  description: Reduce the number of segments in each index shard and clean up deleted documents.
+                  global: True
+                  forcedType: int
+                  advanced: True
+                index_codec:
+                  title: compression
+                  description: Use higher compression for stored fields at the cost of slower performance.
+                  forcedType: bool
+                  global: True
+                  default: False
+                  advanced: True
+              allocate:
+                number_of_replicas:
+                  description: Set the number of replicas. Remains the same as the previous phase by default.
+                  forcedType: int
+                  global: True
+                  advanced: True
+          cold:
+            min_age:
+              description: Minimum age of index. ex. 60d - This determines when the index should be moved to the cold tier.  While still searchable, this tier is typically optimized for lower storage costs rather than search speed. It’s important to note that this is calculated relative to the rollover date (NOT the original creation date of the index). For example, if you have an index that is set to rollover after 30 days and cold min_age set to 60 then there will be 30 days from index creation to rollover and then an additional 60 days before moving to cold tier.
+              regex: ^[0-9]{1,5}d$
+              forcedType: string
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+            actions:
+              set_priority:
+                priority:
+                  description: Used for index recovery after a node restart. Indices with higher priorities are recovered before indices with lower priorities.
+                  forcedType: int
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              allocate:
+                number_of_replicas:
+                  description: Set the number of replicas. Remains the same as the previous phase by default.
+                  forcedType: int
+                  global: True
+                  advanced: True
+          delete:
+            min_age:
+              description: Minimum age of index. ex. 90d - This determines when the index should be deleted. It’s important to note that this is calculated relative to the rollover date (NOT the original creation date of the index). For example, if you have an index that is set to rollover after 30 days and delete min_age set to 90 then there will be 30 days from index creation to rollover and then an additional 90 days before deletion.
+              regex: ^[0-9]{1,5}d$
+              forcedType: string
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+        _meta:
+          package:
+            name:
+              description: Meta settings for the mapping.
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+          managed_by:
+            description: Meta settings for the mapping.
+            global: True
+            advanced: True
+            helpLink: elasticsearch
+          managed:
+            description: Meta settings for the mapping.
+            forcedType: bool
+            global: True
+            advanced: True
+            helpLink: elasticsearch
+    sos-backup: *indexSettings
+    so-detection: *indexSettings
+    so-assistant-chat: *indexSettings
+    so-assistant-session: *indexSettings
    so-metrics-fleet_server_x_agent_status: &fleetMetricsSettings
      index_sorting:
        description: Sorts the index by event time, at the cost of additional processing resource consumption.
@@ -4,7 +4,11 @@
   Elastic License 2.0. #}

 {% import_yaml 'elasticsearch/defaults.yaml' as ELASTICSEARCHDEFAULTS %}
+{# ELASTICSEARCHMERGED is only used here to read data_retention_method. The rest of this file intentionally works with ELASTICSEARCHDEFAULTS. #}
+{% from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
+
 {% set DEFAULT_GLOBAL_OVERRIDES = ELASTICSEARCHDEFAULTS.elasticsearch.index_settings.pop('global_overrides') %}
+{% set DATA_RETENTION_METHOD = ELASTICSEARCHMERGED.data_retention_method %}

 {% set PILLAR_GLOBAL_OVERRIDES = {} %}
 {% set ES_INDEX_PILLAR = salt['pillar.get']('elasticsearch:index_settings', {}) %}
@@ -105,6 +109,17 @@
 {%     if not settings.get('index_sorting', False) | to_bool and settings.index_template.template.settings.index.sort is defined %}
 {%       do settings.index_template.template.settings.index.pop('sort') %}
 {%     endif %}
+{# DLM handling: only applies when the retention method is DLM, the index template defines data_stream, and the settings define data_stream_lifecycle. #}
+{%     if DATA_RETENTION_METHOD == 'DLM' and settings.index_template.data_stream is defined and settings.data_stream_lifecycle is defined %}
+{# A non-empty data_retention is copied into template.lifecycle; otherwise template.lifecycle is set to an empty object (no data_retention). #}
+{%       if settings.data_stream_lifecycle.data_retention is defined and settings.data_stream_lifecycle.data_retention %}
+{%         do settings.index_template.template.update({'lifecycle': {'data_retention': settings.data_stream_lifecycle.data_retention}}) %}
+{%       else %}
+{%         do settings.index_template.template.update({'lifecycle': {}}) %}
+{%       endif %}
+{# Make sure settings.index.lifecycle exists before prefer_ilm is set to false on it below. #}
+{%       if settings.index_template.template.settings.index.lifecycle is not defined %}
+{%         do settings.index_template.template.settings.index.update({'lifecycle': {}}) %}
+{%       endif %}
+{%       do settings.index_template.template.settings.index.lifecycle.update({'prefer_ilm': false}) %}
+{%     endif %}
 {%   endif %}

 {# advanced ilm actions #}
@@ -125,14 +125,6 @@ load_component_templates() {
    done
 }

-check_elasticsearch_responsive() {
-    # Cannot load templates if Elasticsearch is not responding.
-    #  NOTE: Slightly faster exit w/ failure than previous "retry 240 1" if there is a problem with Elasticsearch the
-    #    script should exit sooner rather than hang at the 'so-elasticsearch-templates' salt state.
-    retry 3 15 "so-elasticsearch-query / --output /dev/null --fail" ||
-        fail "Elasticsearch is not responding. Please review Elasticsearch logs /opt/so/log/elasticsearch/securityonion.log for more details. Additionally, consider running so-elasticsearch-troubleshoot."
-}
-
 index_templates_exist() {
    local templates_dir="$1"

@@ -0,0 +1,175 @@
+#!/bin/bash
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+. /usr/sbin/so-common
+
+{%   from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
+{%- set DATA_RETENTION_METHOD = ELASTICSEARCHMERGED.data_retention_method %}
+
+ELASTICSEARCH_TEMPLATES_DIR="${ELASTICSEARCH_TEMPLATES_DIR:-/opt/so/conf/elasticsearch/templates}"
+TEMPLATE_DIRS=(
+    "${ELASTICSEARCH_TEMPLATES_DIR}/index"
+    "${ELASTICSEARCH_TEMPLATES_DIR}/addon-index"
+)
+DATA_RETENTION_METHOD=$(cat <<'EOF'
+{{ DATA_RETENTION_METHOD }}
+EOF
+)
+DLM_FAILURES=0
+DLM_FAILURE_NAMES=()
+
+if [[ "$DATA_RETENTION_METHOD" != "DLM" && "$DATA_RETENTION_METHOD" != "ILM" ]]; then
+    echo "Unsupported data retention method $DATA_RETENTION_METHOD. Expected DLM or ILM."
+    exit 1
+fi
+
+validate_template_file() {
+    local template_file="$1"
+
+    if ! jq -e 'type == "object" and (.data_stream == null or (.data_stream | type == "object")) and (.template.lifecycle == null or (.template.lifecycle | type == "object")) and (.template.lifecycle.data_retention == null or (.template.lifecycle.data_retention | type == "string"))' >/dev/null 2>&1 "$template_file"; then
+        echo "Invalid index template JSON: $template_file"
+        return 1
+    fi
+}
+
+# Succeeds (exit 0) when the template file in $1 has a .data_stream object.
+# All jq output is discarded; only the exit status is meaningful.
+is_data_stream_template() {
+    jq -e '.data_stream | type == "object"' >/dev/null 2>&1 "$1"
+}
+
+# Succeeds (exit 0) when the template file in $1 has a .template.lifecycle object.
+# All jq output is discarded; only the exit status is meaningful.
+has_data_stream_lifecycle() {
+    jq -e '.template.lifecycle | type == "object"' >/dev/null 2>&1 "$1"
+}
+
+# Print .template.lifecycle.data_retention from the template file in $1, or an
+# empty string when that value is missing, null, or false.
+get_data_retention() {
+    jq -r '.template.lifecycle.data_retention // ""' "$1"
+}
+
+# Look for "<template>-template.json" in each TEMPLATE_DIRS entry, in array order.
+# Prints the first existing path and returns 0; returns 1 without output when
+# none of the directories contains the file.
+find_template_file() {
+    local wanted="${1}-template.json"
+    local search_dir
+
+    for search_dir in "${TEMPLATE_DIRS[@]}"; do
+        [[ -f "${search_dir}/${wanted}" ]] || continue
+        echo "${search_dir}/${wanted}"
+        return 0
+    done
+
+    return 1
+}
+
+# Apply data stream lifecycle settings to one data stream.
+#   $1 - data stream name
+#   $2 - data_retention value; an empty string requests indefinite retention
+# Consults the global $data_streams JSON first and skips the PUT when the stream
+# already reports an enabled lifecycle with the same retention.
+# Returns 1 when the lifecycle PUT fails, 0 otherwise.
+set_data_stream_lifecycle() {
+    local data_stream="$1"
+    local data_retention="$2"
+    local body
+    local output
+    local retention_label
+    local current_filter
+
+    if [[ -z "$data_retention" ]]; then
+        retention_label="indefinite retention"
+        current_filter='.data_streams[]? | select(.name == $data_stream and .lifecycle.enabled == true and (.lifecycle.data_retention == null))'
+    else
+        retention_label="data_retention $data_retention"
+        current_filter='.data_streams[]? | select(.name == $data_stream and .lifecycle.enabled == true and .lifecycle.data_retention == $data_retention)'
+    fi
+
+    # select() emits nothing when the stream does not match, which makes jq -e fail.
+    if jq -e --arg data_stream "$data_stream" --arg data_retention "$data_retention" "$current_filter" >/dev/null 2>&1 <<< "$data_streams"; then
+        echo "DLM lifecycle already set for $data_stream with ${retention_label}, skipping."
+        return 0
+    fi
+
+    # An empty JSON object sets indefinite retention.
+    body='{}'
+    if [[ -n "$data_retention" ]]; then
+        body=$(jq -cn --arg data_retention "$data_retention" '{data_retention: $data_retention}')
+    fi
+
+    if ! output=$(so-elasticsearch-query "_data_stream/${data_stream}/_lifecycle" -XPUT -d "$body" --retry 3 --retry-delay 5 --fail); then
+        echo "Failed to set data stream lifecycle for $data_stream."
+        return 1
+    fi
+
+    echo "Set DLM lifecycle for $data_stream with ${retention_label}."
+}
+
+# Disable the data stream lifecycle on one data stream.
+#   $1 - data stream name
+# Consults the global $data_streams JSON and does nothing (returns 0) unless the
+# stream currently has a lifecycle that is not already disabled.
+# Returns 1 when the lifecycle PUT fails, 0 otherwise.
+disable_data_stream_lifecycle() {
+    local data_stream="$1"
+    local body='{"enabled":false}'
+    local output
+
+    # jq -e fails when select() emits nothing, i.e. there is no lifecycle to disable.
+    if ! jq -e --arg data_stream "$data_stream" '.data_streams[]? | select(.name == $data_stream and .lifecycle != null and .lifecycle.enabled != false)' >/dev/null 2>&1 <<< "$data_streams"; then
+        # No action needed
+        return 0
+    fi
+
+    # The response body is captured only to keep it off stdout.
+    if ! output=$(so-elasticsearch-query "_data_stream/${data_stream}/_lifecycle" -XPUT -d "$body" --retry 3 --retry-delay 5 --fail); then
+        echo "Failed to disable data stream lifecycle for $data_stream."
+        return 1
+    fi
+
+    echo "Disabled DLM lifecycle for $data_stream."
+}
+
+process_data_stream() {
+    local data_stream="$1"
+    local data_retention="$2"
+
+    if [[ "$DATA_RETENTION_METHOD" == "DLM" ]]; then
+        set_data_stream_lifecycle "$data_stream" "$data_retention"
+    else
+        disable_data_stream_lifecycle "$data_stream"
+    fi
+}
+
+# check_elasticsearch_responsive is not defined in this script; presumably it is
+# provided by /usr/sbin/so-common, which is sourced at the top -- TODO confirm.
+check_elasticsearch_responsive
+
+# Fetch every data stream once; the lifecycle helpers read this JSON to decide
+# whether a PUT is needed.
+if ! data_streams=$(so-elasticsearch-query "_data_stream?format=json" --retry 3 --retry-delay 5 --fail); then
+    echo "Failed to retrieve data streams."
+    exit 1
+fi
+
+# One compact JSON object per data stream is read per iteration.
+while read -r data_stream_config; do
+    data_stream=$(jq -r '.name' <<< "$data_stream_config")
+    template=$(jq -r '.template' <<< "$data_stream_config")
+
+    # Only data streams whose index template has a file under TEMPLATE_DIRS are handled.
+    if ! template_file=$(find_template_file "$template"); then
+        echo "Skipping $data_stream: index template file not found for $template."
+        continue
+    fi
+
+    # A malformed template file aborts the whole run rather than being counted as a failure.
+    validate_template_file "$template_file" || exit 1
+
+    if ! is_data_stream_template "$template_file"; then
+        echo "Skipping $data_stream: $template_file is not a data stream template."
+        continue
+    fi
+
+    # In DLM mode a template without a lifecycle object is left untouched; in ILM
+    # mode the stream is still processed so its lifecycle can be disabled.
+    if [[ "$DATA_RETENTION_METHOD" == "DLM" ]] && ! has_data_stream_lifecycle "$template_file"; then
+        echo "Skipping $data_stream: $template_file does not define data stream lifecycle."
+        continue
+    fi
+
+    data_retention=$(get_data_retention "$template_file")
+
+    # Record the failure and keep going so every data stream is attempted.
+    if ! process_data_stream "$data_stream" "$data_retention"; then
+        DLM_FAILURES=$((DLM_FAILURES + 1))
+        DLM_FAILURE_NAMES+=("$data_stream")
+    fi
+done < <(jq -c '.data_streams[]' <<< "$data_streams")
+
+# Exit non-zero when any data stream could not be updated, listing each by name.
+if [[ $DLM_FAILURES -eq 0 ]]; then
+    echo "Data stream lifecycle updates completed successfully."
+else
+    echo "Encountered $DLM_FAILURES failure(s) updating data stream lifecycle:"
+    for failed_data_stream in "${DLM_FAILURE_NAMES[@]}"; do
+        echo "  - $failed_data_stream"
+    done
+    exit 1
+fi
@@ -1,3 +1,10 @@
 global:
  pcapengine: SURICATA
-  pipeline: REDIS
+  pipeline: REDIS
+  push:
+    enabled: true
+    highstate_interval_hours: 2
+    debounce_seconds: 30
+    drain_interval: 15
+    batch: '25%'
+    batch_wait: 15
@@ -59,4 +59,41 @@ global:
    description: Allows use of Endgame with Security Onion. This feature requires a license from Endgame.
    global: True
    advanced: True
+  push:
+    enabled:
+      description: Master kill-switch for the active push feature. When disabled, rule and pillar changes are picked up at the next scheduled highstate instead of being pushed immediately.
+      forcedType: bool
+      helpLink: push
+      global: True
+    highstate_interval_hours:
+      description: How often every minion in the grid runs a scheduled state.highstate, in hours. Lower values keep minions closer in sync at the cost of more load; higher values reduce load but increase worst-case latency for non-pushed changes. The salt-minion health check restarts a minion if its last highstate is older than this value plus one hour.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True
+    debounce_seconds:
+      description: Trailing-edge debounce window in seconds. A push intent must be quiet for this long before the drainer dispatches. Rapid bursts of edits within this window coalesce into one dispatch.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True
+    drain_interval:
+      description: How often the push drainer checks for ready intents, in seconds. Small values lower dispatch latency at the cost of more background work on the manager.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True
+    batch:
+      description: "Host batch size for push orchestrations. A number (e.g. '10') or a percentage (e.g. '25%'). Limits how many minions run the push state at once so large fleets don't thundering-herd."
+      helpLink: push
+      global: True
+      advanced: True
+      regex: '^([0-9]+%?)$'
+      regexFailureMessage: Enter a whole number or a whole-number percentage (e.g. 10 or 25%).
+    batch_wait:
+      description: Seconds to wait between host batches in a push orchestration. Gives the fleet time to breathe between waves.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True

@@ -58,6 +58,7 @@ so-hydra:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
+    # Intentionally unless-stopped -- matches the fleet default.
    - restart_policy: unless-stopped
    - watch:
      - file: hydraconfig
@@ -15,6 +15,7 @@ include:
 so-idh:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-idh:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-idh
    - detach: True
    - network_mode: host
@@ -18,6 +18,7 @@ include:
 so-influxdb:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-influxdb:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: influxdb
    - networks:
      - sobridge:
@@ -27,6 +27,7 @@ include:
 so-kafka:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-kafka:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-kafka
    - name: so-kafka
    - networks:
@@ -17,6 +17,7 @@ include:
 so-kibana:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-kibana:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: kibana
    - user: "932:0"
    - networks:
@@ -51,6 +51,7 @@ so-kratos:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
+    # Intentionally unless-stopped -- matches the fleet default.
    - restart_policy: unless-stopped
    - watch:
      - file: kratosschema
@@ -28,6 +28,7 @@ include:
 so-logstash:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-logstash:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-logstash
    - name: so-logstash
    - networks:
@@ -0,0 +1,21 @@
+# Manages /etc/salt/minion.d/beacons_pushstate.conf.
+# The file is rendered only on a manager with global push enabled; in every
+# other case it is removed. Both branches use watch_in on salt_minion_service
+# so that service reacts when the file changes.
+{% from 'vars/globals.map.jinja' import GLOBALS %}
+{% from 'global/map.jinja' import GLOBALMERGED %}
+
+include:
+  - salt.minion
+
+{% if GLOBALS.is_manager and GLOBALMERGED.push.enabled %}
+salt_beacons_pushstate:
+  file.managed:
+    - name: /etc/salt/minion.d/beacons_pushstate.conf
+    - source: salt://manager/files/beacons_pushstate.conf.jinja
+    - template: jinja
+    - watch_in:
+      - service: salt_minion_service
+{% else %}
+salt_beacons_pushstate:
+  file.absent:
+    - name: /etc/salt/minion.d/beacons_pushstate.conf
+    - watch_in:
+      - service: salt_minion_service
+{% endif %}
@@ -0,0 +1,41 @@
+{% from 'global/map.jinja' import GLOBALMERGED %}
+beacons:
+  pillar_db:
+    - interval: {{ GLOBALMERGED.push.drain_interval }}
+    - disable_during_state_run: True
+  inotify:
+    - disable_during_state_run: True
+    - coalesce: True
+    - files:
+        /opt/so/saltstack/local/salt/suricata/rules:
+          mask:
+            - close_write
+            - moved_to
+            - delete
+          recurse: True
+          auto_add: True
+          exclude:
+            - '\.sw[a-z]$':
+                regex: True
+            - '~$':
+                regex: True
+            - '/4913$':
+                regex: True
+            - '/\.#':
+                regex: True
+        /opt/so/saltstack/local/salt/strelka/rules/compiled:
+          mask:
+            - close_write
+            - moved_to
+            - delete
+          recurse: True
+          auto_add: True
+          exclude:
+            - '\.sw[a-z]$':
+                regex: True
+            - '~$':
+                regex: True
+            - '/4913$':
+                regex: True
+            - '/\.#':
+                regex: True
@@ -15,6 +15,7 @@ include:
  - manager.elasticsearch
  - manager.kibana
  - manager.managed_soc_annotations
+  - manager.beacons

 repo_log_dir:
  file.directory:
@@ -231,6 +232,7 @@ surifiltersrules:
    - user: 939
    - group: 939

+
 {% else %}

 {{sls}}_state_not_allowed:
@@ -16,40 +16,35 @@
 {%       endif %}
 {%     endfor %}
 {%   endfor %}
+{%   set soc_annotation_lines = [] %}
+{%   set defaults_lines = [] %}
+{%   for k in matched_integration_names %}
+{%     do soc_annotation_lines.append('    ' ~ k ~ ': *dataStreamSettings') %}
+{%     do defaults_lines.append('    ' ~ k ~ ':') %}
+{%     set defaults_yaml = salt['slsutil.serialize']('yaml', ADDON_INTEGRATION_DEFAULTS[k], default_flow_style=False).strip() %}
+{%     for line in defaults_yaml.splitlines() %}
+{%       do defaults_lines.append('      ' ~ line) %}
+{%     endfor %}
+{%   endfor %}
 {%   set es_soc_annotations = '/opt/so/saltstack/default/salt/elasticsearch/soc_elasticsearch.yaml' %}
-{{   es_soc_annotations }}:
-     file.serialize:
-       - dataset:
-           {% set data = salt['file.read'](es_soc_annotations) | load_yaml %}
-           {% set es = data.get('elasticsearch', {}) %}
-           {% set index_settings = es.get('index_settings', {}) %}
-           {% set input = index_settings.get('so-logs', {}) %}
-           {% for k in matched_integration_names %}
-           {%   do index_settings.update({k: input}) %}
-           {% endfor %}
-           {% for k in addon_integration_keys %}
-           {%   if k not in matched_integration_names and k in index_settings %}
-           {%     do index_settings.pop(k) %}
-           {%   endif %}
-           {% endfor %}
-           {{ data }}
+manage_soc_annotations:
+  file.blockreplace:
+    - name: {{ es_soc_annotations }}
+    - marker_start: '    # START managed SOC integration annotations'
+    - marker_end: '    # END managed SOC integration annotations'
+    - content: {{ soc_annotation_lines | join('\n') | tojson }}
+    - insert_after_match: '^    # Managed SOC integration annotations are inserted below this line\.'
+    - append_if_not_found: False
+    - show_changes: True

 {#   Managed elasticsearch/defaults.yaml file for enabling 'Revert to default' via SOC UI for newly added config items #}
 {%   set es_defaults = '/opt/so/saltstack/default/salt/elasticsearch/defaults.yaml' %}
 {{   es_defaults }}:
-     file.serialize:
-       - dataset:
-           {% set data = salt['file.read'](es_defaults) | load_yaml %}
-           {% set es = data.get('elasticsearch', {}) %}
-           {% set index_settings = es.get('index_settings', {}) %}
-           {% for k in matched_integration_names %}
-           {%   set input = ADDON_INTEGRATION_DEFAULTS[k] %}
-           {%     do index_settings.update({k: input})%}
-           {% endfor %}
-           {% for k in addon_integration_keys %}
-           {%   if k not in matched_integration_names and k in index_settings %}
-           {%     do index_settings.pop(k) %}
-           {%   endif %}
-           {% endfor %}
-           {{ data }}
-{% endif %}
+  file.blockreplace:
+    - marker_start: '    # START managed SOC integration defaults'
+    - marker_end: '    # END managed SOC integration defaults'
+    - content: {{ defaults_lines | join('\n') | tojson }}
+    - insert_after_match: '^  index_settings:$'
+    - append_if_not_found: False
+    - show_changes: True
+{% endif %}
@@ -0,0 +1,232 @@
+#!/opt/saltstack/salt/bin/python3
+
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+"""
+so-push-drainer
+===============
+
+Scheduled drainer for the active-push feature. Runs on the manager every
+drain_interval seconds (default 15) via a salt schedule in salt/schedule.sls.
+
+For each intent file under /opt/so/state/push_pending/*.json whose last_touch
+is older than debounce_seconds, this script:
+  * concatenates the actions lists from every ready intent
+  * dedupes by (state or __highstate__, tgt, tgt_type)
+  * dispatches a single `salt-run state.orchestrate orch.push_batch --async`
+    with the deduped actions list passed as pillar kwargs
+  * deletes the contributed intent files on successful dispatch
+
+Reactor sls files (push_suricata, push_strelka, push_pillar) write intents
+but never dispatch directly; dispatching is left to this script.
+"""
+
+import fcntl
+import glob
+import json
+import logging
+import logging.handlers
+import os
+import subprocess
+import sys
+import time
+
+import salt.client
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+LOG_FILE = '/opt/so/log/salt/so-push-drainer.log'
+
+HIGHSTATE_SENTINEL = '__highstate__'
+
+
+def _make_logger():
+    logger = logging.getLogger('so-push-drainer')
+    logger.setLevel(logging.INFO)
+    if not logger.handlers:
+        os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
+        handler = logging.handlers.RotatingFileHandler(
+            LOG_FILE, maxBytes=5 * 1024 * 1024, backupCount=3,
+        )
+        handler.setFormatter(logging.Formatter(
+            '%(asctime)s | %(levelname)s | %(message)s',
+        ))
+        logger.addHandler(handler)
+    return logger
+
+
+def _load_push_cfg():
+    """Read the global:push pillar subtree via salt-call. Returns a dict."""
+    caller = salt.client.Caller()
+    cfg = caller.cmd('pillar.get', 'global:push', {})
+    return cfg if isinstance(cfg, dict) else {}
+
+
+def _read_intent(path, log):
+    """Load one intent file and return its parsed JSON content.
+
+    Returns None after logging when the file cannot be opened or parsed
+    (warning), or when any other exception is raised while reading it
+    (logged with traceback). The parsed value is returned as-is; callers
+    must check its type.
+    """
+    try:
+        with open(path, 'r') as f:
+            return json.load(f)
+    except (IOError, ValueError) as exc:
+        log.warning('cannot read intent %s: %s', path, exc)
+        return None
+    except Exception:
+        log.exception('unexpected error reading %s', path)
+        return None
+
+
+def _dedupe_actions(actions):
+    seen = set()
+    deduped = []
+    for action in actions:
+        if not isinstance(action, dict):
+            continue
+        state_key = HIGHSTATE_SENTINEL if action.get('highstate') else action.get('state')
+        tgt = action.get('tgt')
+        tgt_type = action.get('tgt_type', 'compound')
+        if not state_key or not tgt:
+            continue
+        key = (state_key, tgt, tgt_type)
+        if key in seen:
+            continue
+        seen.add(key)
+        deduped.append(action)
+    return deduped
+
+
+def _dispatch(actions, log):
+    """Start orch.push_batch asynchronously with the given actions as pillar data.
+
+    Runs `salt-run state.orchestrate orch.push_batch pillar=<json> --async`
+    with a 60 second timeout. Returns True when salt-run exits 0, and False
+    (after logging) when it exits non-zero, times out, or raises anything else.
+    """
+    pillar_arg = json.dumps({'actions': actions})
+    cmd = [
+        'salt-run',
+        'state.orchestrate',
+        'orch.push_batch',
+        'pillar={}'.format(pillar_arg),
+        '--async',
+    ]
+    # Log only the command prefix and the action count, not the full pillar payload.
+    log.info('dispatching: %s', ' '.join(cmd[:3]) + ' pillar=<{} actions>'.format(len(actions)))
+    try:
+        # check=True turns a non-zero exit status into CalledProcessError.
+        result = subprocess.run(
+            cmd, check=True, capture_output=True, text=True, timeout=60,
+        )
+    except subprocess.CalledProcessError as exc:
+        log.error('dispatch failed (rc=%s): stdout=%s stderr=%s',
+                  exc.returncode, exc.stdout, exc.stderr)
+        return False
+    except subprocess.TimeoutExpired:
+        log.error('dispatch timed out after 60s')
+        return False
+    except Exception:
+        log.exception('dispatch raised')
+        return False
+    log.info('dispatch accepted: %s', (result.stdout or '').strip())
+    return True
+
+
+def main():
+    log = _make_logger()
+
+    if not os.path.isdir(PENDING_DIR):
+        # Nothing to do; reactors create the dir on first use.
+        return 0
+
+    try:
+        push = _load_push_cfg()
+    except Exception:
+        log.exception('failed to read global:push pillar; aborting drain pass')
+        return 1
+
+    if not push.get('enabled', True):
+        log.debug('push disabled; exiting')
+        return 0
+
+    debounce_seconds = int(push.get('debounce_seconds', 30))
+
+    os.makedirs(PENDING_DIR, exist_ok=True)
+    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent_files = [
+            p for p in sorted(glob.glob(os.path.join(PENDING_DIR, '*.json')))
+            if os.path.basename(p) != '.lock'
+        ]
+        if not intent_files:
+            return 0
+
+        now = time.time()
+        ready = []
+        skipped = 0
+        broken = []
+        for path in intent_files:
+            intent = _read_intent(path, log)
+            if not isinstance(intent, dict):
+                broken.append(path)
+                continue
+            last_touch = intent.get('last_touch', 0)
+            if now - last_touch < debounce_seconds:
+                skipped += 1
+                continue
+            ready.append((path, intent))
+
+        for path in broken:
+            try:
+                os.unlink(path)
+            except OSError:
+                pass
+
+        if not ready:
+            if skipped:
+                log.debug('no ready intents (%d still in debounce window)', skipped)
+            return 0
+
+        combined_actions = []
+        oldest_first_touch = now
+        all_paths = []
+        for path, intent in ready:
+            combined_actions.extend(intent.get('actions', []) or [])
+            first = intent.get('first_touch', now)
+            if first < oldest_first_touch:
+                oldest_first_touch = first
+            all_paths.extend(intent.get('paths', []) or [])
+
+        deduped = _dedupe_actions(combined_actions)
+        if not deduped:
+            log.warning('%d intent(s) had no usable actions; clearing', len(ready))
+            for path, _ in ready:
+                try:
+                    os.unlink(path)
+                except OSError:
+                    pass
+            return 0
+
+        debounce_duration = now - oldest_first_touch
+        log.info(
+            'draining %d intent(s): %d action(s) after dedupe (raw=%d), '
+            'debounce_duration=%.1fs, paths=%s',
+            len(ready), len(deduped), len(combined_actions),
+            debounce_duration, all_paths[:20],
+        )
+
+        if not _dispatch(deduped, log):
+            log.warning('dispatch failed; leaving intent files in place for retry')
+            return 1
+
+        for path, _ in ready:
+            try:
+                os.unlink(path)
+            except OSError:
+                log.exception('failed to remove drained intent %s', path)
+
+        return 0
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
@@ -16,6 +16,7 @@ POSTVERSION=$INSTALLEDVERSION
 INSTALLEDSALTVERSION=$(salt --versions-report | grep Salt: | awk '{print $2}')
 BATCHSIZE=5
 SOUP_LOG=/root/soup.log
+SOUP_DEBUG_LOG=/root/soup-debug.log
 WHATWOULDYOUSAYYAHDOHERE=soup
 whiptail_title='Security Onion UPdater'
 NOTIFYCUSTOMELASTICCONFIG=false
@@ -34,6 +35,7 @@ if [[ -f /etc/salt/cloud.profiles.d/socloud.conf ]]; then
 fi
 # used to display messages to the user at the end of soup
 declare -a FINAL_MESSAGE_QUEUE=()
+SOUP_ERR_CONTEXT=


 check_err() {
@@ -114,11 +116,52 @@ check_err() {
      echo "$err_msg"
    fi

+    if [[ -n $SOUP_ERR_CONTEXT ]]; then
+      echo ""
+      printf '%s\n' "$SOUP_ERR_CONTEXT"
+    fi
+
+    echo "SOUP XTRACE debug log (if enabled) at $SOUP_DEBUG_LOG. Re-run soup with SOUP_DEBUG=1 to create $SOUP_DEBUG_LOG"
+
    exit $exit_code
  fi

 }

+# Collect bash error context before passing off to check_err()
+on_err() {
+  # Must be the first statement: any later command would overwrite $?.
+  local exit_code=$?
+  # Ignore failures in blocks that explicitly disabled errexit with `set +e`.
+  [[ $- == *e* ]] || return $exit_code
+  # turn off xtrace to prevent added noise in debug log
+  set +x 2>/dev/null || true
+
+  # Use first error context, multiple errors can happen with command substitutions or nested functions. We just need context from the initial error.
+  [[ -n $SOUP_ERR_CONTEXT ]] && return $exit_code
+
+  # Describe the failing command: its text, the line it was on, and the
+  # function and source file that were executing it.
+  local cmd=$BASH_COMMAND
+  local line=${BASH_LINENO[0]}
+  local function=${FUNCNAME[1]:-main}
+  local source=${BASH_SOURCE[1]##*/}
+  local -a err_lines=(
+    "ERROR on: ${cmd}"
+    "  source: ${source}:${line} in ${function}()"
+  )
+  local i caller_line caller_src caller_func
+
+  # Add one "called by" line per remaining call-stack frame, stopping before the
+  # last FUNCNAME entry. Frames without a positive line number are skipped.
+  for ((i=2; i<${#FUNCNAME[@]}-1; i++)); do
+    caller_line=${BASH_LINENO[$((i-1))]}
+    [[ -n $caller_line && $caller_line -gt 0 ]] || continue
+    caller_src=${BASH_SOURCE[$i]##*/}
+    caller_func=${FUNCNAME[$i]:-main}
+    err_lines+=("  called by: ${caller_src}:${caller_line} in ${caller_func}()")
+  done
+
+  # Stored globally; check_err prints it when it is non-empty.
+  SOUP_ERR_CONTEXT=$(printf '%s\n' "${err_lines[@]}")
+
+  # Hand the original failure status back unchanged.
+  return $exit_code
+}
+
 airgap_mounted() {
  # Let's see if the ISO is already mounted.
  if [[ -f /tmp/soagupdate/SecurityOnion/VERSION ]]; then
@@ -343,10 +386,11 @@ highstate() {
 masterlock() {
  echo "Locking Salt Master"
  mv -v $TOPFILE $BACKUPTOPFILE
-  echo "base:" > $TOPFILE
-  echo "  $MINIONID:" >> $TOPFILE
-  echo "    - ca" >> $TOPFILE
-  echo "    - elasticsearch" >> $TOPFILE
+  # Render the real top file only for the host running soup; every other
+  # minion gets an empty top (no states) while the master is upgrading.
+  echo "{% if grains['id'] == '$MINIONID' %}" > $TOPFILE
+  cat $BACKUPTOPFILE >> $TOPFILE
+  echo "{% endif %}" >> $TOPFILE
 }

 masterunlock() {
@@ -761,9 +805,56 @@ bootstrap_so_soc_database() {
  echo "so_soc bootstrap complete."
 }

+# Existing grids should keep ILM unless an admin explicitly opts in to DLM.
+pin_elasticsearch_data_retention_method() {
+  # Writes elasticsearch.data_retention_method: ILM into the local SOC
+  # elasticsearch pillar unless the key already has a value. The pillar file
+  # and its directory are created when missing.
+  local pillar_file=/opt/so/saltstack/local/pillar/elasticsearch/soc_elasticsearch.sls
+  local retention_key=elasticsearch.data_retention_method
+
+  mkdir -p "$(dirname "$pillar_file")"
+  if [[ ! -f "$pillar_file" ]]; then
+    touch "$pillar_file"
+  fi
+
+  if ! so-yaml.py get -r "$pillar_file" "$retention_key" >/dev/null 2>&1; then
+    echo "Pinning existing grid to ILM data retention."
+    so-yaml.py add "$pillar_file" "$retention_key" ILM
+    chown socore:socore "$pillar_file"
+  else
+    echo "elasticsearch.data_retention_method already set; leaving as-is."
+  fi
+}
+
+# Adds the auto_expand_replicas setting to the .kibana_streams index template
+#
+# In Kibana 9.3.3 the auto_expand_replicas setting was not added to the .kibana_streams index template, causing single node deployments to be stuck in yellow state (unable to assign replica). Here we update the template in place using the so_kibana system user (system managed index template) to include the auto_expand_replicas setting.
+#
+# Reference: https://github.com/elastic/kibana/issues/263048
+kibana_backport_streams_index_template() {
+    # Best-effort backport: every failed lookup or update prints a message and
+    # returns 0 so soup can continue.
+    # kibana_user_pass is local so the password does not stay in global scope
+    # after this function returns.
+    local current_template updated_template kibana_user_pass
+
+    # errexit is off only for the template lookup; presumably this is so on_err,
+    # which ignores failures while errexit is disabled, does not record a
+    # missing template as an error.
+    set +e
+    if ! current_template=$(so-elasticsearch-query "_index_template/.kibana_streams" --retry 3 --retry-delay 5 --fail); then
+        # Restore errexit before the early return; otherwise it would stay
+        # disabled for everything soup runs after this function.
+        set -e
+        echo "Index template .kibana_streams does not exist, skipping backport."
+        return 0
+    fi
+    set -e
+
+    # Add the setting and drop the timestamp fields before sending the template back.
+    updated_template=$(jq '.index_templates[0].index_template | .template.settings += {"index.auto_expand_replicas": "0-1"} | del(.created_date_millis, .modified_date_millis)' <<< "$current_template")
+
+    if ! kibana_user_pass=$(/usr/sbin/so-yaml.py get -r /opt/so/saltstack/local/pillar/elasticsearch/auth.sls elasticsearch.auth.users.so_kibana_user.pass); then
+        echo "Unable to retrieve so_kibana_user password, skipping .kibana_streams index template backport."
+        return 0
+    fi
+
+    if ! so-elasticsearch-query "_index_template/.kibana_streams" -XPUT -d "$updated_template" -u "so_kibana:$kibana_user_pass" --retry 3 --retry-delay 5 --fail; then
+        echo "Unable to automatically update .kibana_streams index template"
+        return 0
+    fi
+
+}
+
 up_to_3.2.0() {
  fix_logstash_0013_lumberjack_pipeline_name

+  pin_elasticsearch_data_retention_method
+
  INSTALLEDVERSION=3.2.0
 }

@@ -774,6 +865,8 @@ post_to_3.2.0() {
  echo "Regenerating Elastic Agent Installers"
  /sbin/so-elastic-agent-gen-installers

+  kibana_backport_streams_index_template
+
  POSTVERSION=3.2.0
 }

@@ -1757,6 +1850,9 @@ main() {

    enable_highstate

+    echo "salt-call state.show_top"
+    salt-call state.show_top
+
    echo ""
    echo "Running a highstate. This could take several minutes."
    set +e
@@ -1764,6 +1860,9 @@ main() {
    highstate
    set -e

+    echo "salt-call saltutil.running"
+    salt-call saltutil.running
+
    stop_salt_master

    masterunlock
@@ -1786,6 +1885,9 @@ main() {
    # ensure the mine is updated and populated before highstates run, following the salt-master restart
    update_salt_mine

+    echo "salt-call state.show_top"
+    salt-call state.show_top
+
    highstate
    check_saltmaster_status
    postupgrade_changes
@@ -1933,4 +2035,20 @@ EOF
  read -r input
 fi

-main "$@" | tee -a $SOUP_LOG
+set -o errtrace
+trap on_err ERR
+
+if [[ $SOUP_DEBUG == 1 ]]; then
+  if [ -f $SOUP_DEBUG_LOG ]; then
+    current_time=$(date +%Y%m%d.%H%M%S)
+    mv $SOUP_DEBUG_LOG $SOUP_DEBUG_LOG.$INSTALLEDVERSION.$current_time
+  fi
+  exec {SOUP_XTRACE_FD}>>"$SOUP_DEBUG_LOG"
+  export SOUP_XTRACE_FD
+  BASH_XTRACEFD=$SOUP_XTRACE_FD
+  PS4='+ [${BASH_SOURCE##*/}:${LINENO} ${FUNCNAME[0]:-main}()] | '
+  set -x
+  export SOUP_DEBUG
+fi
+
+main "$@" 2>&1 | tee -a $SOUP_LOG
@@ -34,6 +34,7 @@ make-rule-dir-nginx:
 so-nginx:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-nginx:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-nginx
    - networks:
      - sobridge:
@@ -0,0 +1,37 @@
+# Orchestration that runs one entry per item in the `actions` pillar list.
+#   - An action with a truthy `highstate` runs a highstate on its target.
+#   - Any other action refreshes pillar on its target, then applies the single
+#     state named in the action's `state` field.
+# Per-action `batch` / `batch_wait` take precedence; otherwise the values from
+# the global push settings are used. `tgt_type` defaults to compound.
+{% from 'global/map.jinja' import GLOBALMERGED %}
+{% set actions = salt['pillar.get']('actions', []) %}
+{% set BATCH = GLOBALMERGED.push.batch %}
+{% set BATCH_WAIT = GLOBALMERGED.push.batch_wait %}
+
+{% for action in actions %}
+{%   if action.get('highstate') %}
+apply_highstate_{{ loop.index }}:
+  salt.state:
+    - tgt: '{{ action.tgt }}'
+    - tgt_type: {{ action.get('tgt_type', 'compound') }}
+    - highstate: True
+    - batch: {{ action.get('batch', BATCH) }}
+    - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }}
+    - kwarg:
+        queue: 2
+{%   else %}
+# State IDs carry the loop index so repeated states or targets stay unique.
+refresh_pillar_{{ loop.index }}:
+  salt.function:
+    - name: saltutil.refresh_pillar
+    - tgt: '{{ action.tgt }}'
+    - tgt_type: {{ action.get('tgt_type', 'compound') }}
+
+apply_{{ action.state | replace('.', '_') }}_{{ loop.index }}:
+  salt.state:
+    - tgt: '{{ action.tgt }}'
+    - tgt_type: {{ action.get('tgt_type', 'compound') }}
+    - sls:
+      - {{ action.state }}
+    - batch: {{ action.get('batch', BATCH) }}
+    - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }}
+    - kwarg:
+        queue: 2
+    # The state is applied only after the matching pillar refresh has run.
+    - require:
+      - salt: refresh_pillar_{{ loop.index }}
+{%   endif %}
+{% endfor %}
@@ -0,0 +1,240 @@
+# One pillar directory can map to multiple (state, tgt) actions.
+# tgt is a raw salt compound expression. tgt_type is always "compound".
+# Per-action `batch` / `batch_wait` override the orch defaults (25% / 15s).
+# An action with `highstate: True` triggers state.highstate instead of
+# state.apply -- see salt/orch/push_batch.sls.
+#
+# Notes:
+#   - `bpf` is a pillar-only dir (no state of its own) consumed by both
+#     zeek and suricata via macros, so a bpf pillar change re-applies both.
+#   - suricata/strelka/zeek/elasticsearch/redis/kafka/logstash etc. have
+#     their own pillar dirs AND their own state, so they map 1:1 (strelka
+#     included: only the sensor-side state is mapped, not strelka.manager).
+#
+# Intentional omissions (these will log a "not in pillar_push_map.yaml"
+# warning in push_pillar.sls and wait for the next scheduled highstate):
+#   - `data` and `node_data`: pillar-only data consumed by many states;
+#     handling them generically would amount to a fleetwide highstate.
+#   - `host`: soc_host describes mainint/mainip; a change is a re-IP and
+#     needs a coordinated procedure, not an immediate state push.
+#   - `hypervisor`: state changes touch libvirt and are disruptive; leave
+#     to the next scheduled highstate.
+#   - `sensor`: every field in soc_sensor.yaml is `readonly: True` or
+#     per-minion (`node: True`). Per-minion edits are persisted under
+#     pillar/minions/<id>.sls; because push_pillar.sls looks the app up in
+#     this map before it branches on node_id, they also log the warning and
+#     wait for the next scheduled highstate.
+#
+# The role sets here were verified line-by-line against salt/top.sls. If
+# salt/top.sls changes how an app is targeted, update the corresponding
+# compound here.
+
+# firewall: the one pillar everyone touches. Applied everywhere intentionally
+# because every host's iptables needs to know about every other host in the
+# grid. Salt's firewall state is idempotent (file.managed + iptables-restore
+# onchanges in salt/firewall/init.sls), so hosts whose rendered firewall is
+# unchanged do a file comparison and no-op without touching iptables -- actual
+# reload happens only on the hosts whose rules actually changed. Fleetwide
+# blast radius is intentional and matches the pre-plan behavior via highstate.
+# Adding N sensors in a burst coalesces into one dispatch via the drainer.
+firewall:
+  - state: firewall
+    tgt: '*'
+
+# backup: backup.config_backup runs on eval, standalone, manager, managerhype,
+# managersearch (NOT import -- the backup pillar is included on import per
+# pillar/top.sls but the backup state is not run there per salt/top.sls).
+backup:
+  - state: backup.config_backup
+    tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# bpf is pillar-only (no state); consumed by both zeek and suricata as macros.
+# Both states run on sensor_roles + so-import per salt/top.sls.
+bpf:
+  - state: zeek
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+  - state: suricata
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+
+# ca is applied universally.
+ca:
+  - state: ca
+    tgt: '*'
+
+# docker: universal. The docker state is in both the all-non-managers and
+# all-managers branches of salt/top.sls.
+docker:
+  - state: docker
+    tgt: '*'
+
+# elastalert: eval, standalone, manager, managerhype, managersearch (NOT import).
+elastalert:
+  - state: elastalert
+    tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# elastic-fleet-package-registry: manager_roles exactly.
+elastic-fleet-package-registry:
+  - state: elastic-fleet-package-registry
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# elasticsearch: 8 roles.
+elasticsearch:
+  - state: elasticsearch
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-standalone'
+
+# elasticagent: so-heavynode only.
+elasticagent:
+  - state: elasticagent
+    tgt: 'G@role:so-heavynode'
+
+# elasticfleet: base state only on pillar change. elasticfleet.install_agent_grid
+# is a deploy/enrollment step, not a config reload; leave it to the next highstate.
+elasticfleet:
+  - state: elasticfleet
+    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# global: fanout to a fleetwide highstate. The global pillar (soc_global.sls)
+# carries cross-cutting settings (pipeline, url_base, imagerepo, mdengine, ...)
+# that are consumed by virtually every state, so a targeted re-apply isn't
+# meaningful. The drainer's batch/batch_wait throttling controls blast radius.
+global:
+  - highstate: True
+    tgt: '*'
+
+# healthcheck: eval, sensor, standalone only.
+healthcheck:
+  - state: healthcheck
+    tgt: 'G@role:so-eval or G@role:so-sensor or G@role:so-standalone'
+
+# hydra: manager_roles exactly.
+hydra:
+  - state: hydra
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# idh: so-idh only.
+idh:
+  - state: idh
+    tgt: 'G@role:so-idh'
+
+# influxdb: manager_roles exactly.
+influxdb:
+  - state: influxdb
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# kafka: standalone, manager, managerhype, managersearch, searchnode, receiver.
+kafka:
+  - state: kafka
+    tgt: 'G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
+
+# kibana: manager_roles exactly.
+kibana:
+  - state: kibana
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# kratos: manager_roles exactly.
+kratos:
+  - state: kratos
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# logrotate: universal (top-of-file '*' branch in salt/top.sls).
+logrotate:
+  - state: logrotate
+    tgt: '*'
+
+# logstash: 8 roles, no eval/import.
+logstash:
+  - state: logstash
+    tgt: 'G@role:so-fleet or G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
+
+# manager: manager_roles exactly. The manager state is also referenced under
+# *_sensor / *_heavynode top.sls blocks via `sensor`, but the standalone
+# `manager` state itself runs only on manager_roles.
+manager:
+  - state: manager
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# nginx: 10 specific roles. NOT receiver, idh, hypervisor, desktop.
+nginx:
+  - state: nginx
+    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
+
+# ntp: universal (top-of-file '*' branch in salt/top.sls).
+ntp:
+  - state: ntp
+    tgt: '*'
+
+# patch: universal. soc_patch carries the OS update schedule, applied via
+# patch.os.schedule on every node (it's in both the all-non-managers and
+# all-managers branches of salt/top.sls).
+patch:
+  - state: patch.os.schedule
+    tgt: '*'
+
+# postgres: manager_roles exactly.
+postgres:
+  - state: postgres
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# redis: 6 roles. standalone, manager, managerhype, managersearch, heavynode, receiver.
+# (NOT eval, NOT import, NOT searchnode.)
+redis:
+  - state: redis
+    tgt: 'G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-standalone'
+
+# registry: manager_roles exactly.
+registry:
+  - state: registry
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# sensoroni: universal.
+sensoroni:
+  - state: sensoroni
+    tgt: '*'
+
+# soc: manager_roles exactly.
+soc:
+  - state: soc
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# stig: broad. Runs on standalone, manager, managerhype, managersearch,
+# searchnode, sensor, receiver, fleet, hypervisor, desktop.
+# NOT eval, NOT import, NOT heavynode, NOT idh (the *_idh block in
+# salt/top.sls intentionally omits stig).
+stig:
+  - state: stig
+    tgt: 'G@role:so-desktop or G@role:so-fleet or G@role:so-hypervisor or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
+
+# strelka: sensor-side only on pillar change (sensor_roles). strelka.manager is
+# intentionally NOT fired on pillar changes -- YARA rule and strelka config
+# pillar changes are consumed by the sensor-side strelka backend, and re-running
+# strelka.manager on managers is both unnecessary and disruptive. strelka.manager
+# is left to the 2-hour highstate.
+strelka:
+  - state: strelka
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-sensor or G@role:so-standalone'
+
+# suricata: sensor_roles + so-import (5 roles).
+suricata:
+  - state: suricata
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+
+# telegraf: universal.
+telegraf:
+  - state: telegraf
+    tgt: '*'
+
+# versionlock: universal (top-of-file '*' branch in salt/top.sls).
+versionlock:
+  - state: versionlock
+    tgt: '*'
+
+# vm: libvirt-driver hypervisors only. Matched by the salt-cloud:driver:libvirt
+# grain (compound supports nested grain matching via G@<key>:<subkey>:<value>).
+# pillar/vm/soc_vm.sls write path is referenced at salt/_runners/setup_hypervisor.py:856.
+vm:
+  - state: vm
+    tgt: 'G@salt-cloud:driver:libvirt'
+
+# zeek: sensor_roles + so-import (5 roles).
+zeek:
+  - state: zeek
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
@@ -0,0 +1,176 @@
+#!py
+
+# Reactor invoked by the pillar_db beacon when SOC records settings changes in
+# the so_soc.audit_settings table (see salt/_beacons/pillar_db.py). The beacon
+# emits one event per new row carrying setting_id and node_id.
+#
+# Two branches, keyed on node_id:
+#   A) node_id populated -> the change is scoped to that one minion. Look up the
+#      app in pillar_push_map.yaml and write an intent that runs the app's mapped
+#      state(s) targeted to just that node.
+#   B) node_id empty -> grid-wide app change. Look up the app in
+#      pillar_push_map.yaml and write an intent with the entry's actions as-is.
+#
+# The app name is the first dotted segment of setting_id (e.g. "telegraf.output"
+# -> "telegraf"), which matches the pillar_push_map.yaml keys 1:1.
+#
+# Reactors never dispatch directly. The so-push-drainer schedule picks up
+# ready intents, dedupes across pending files, and dispatches orch.push_batch.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+from salt.client import Caller
+import yaml
+
+LOG = logging.getLogger(__name__)
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+MAX_PATHS = 20
+
+# The pillar_push_map.yaml is shipped via salt:// but the reactor runs on the
+# master, which mounts the default saltstack tree at this path.
+PUSH_MAP_PATH = '/opt/so/saltstack/default/salt/reactor/pillar_push_map.yaml'
+
+_PUSH_MAP_CACHE = {'mtime': 0, 'data': None}
+
+
+def _load_push_map():
+    try:
+        st = os.stat(PUSH_MAP_PATH)
+    except OSError:
+        LOG.warning('push_pillar: %s not found', PUSH_MAP_PATH)
+        return {}
+    if _PUSH_MAP_CACHE['mtime'] != st.st_mtime:
+        try:
+            with open(PUSH_MAP_PATH, 'r') as f:
+                _PUSH_MAP_CACHE['data'] = yaml.safe_load(f) or {}
+        except Exception:
+            LOG.exception('push_pillar: failed to load %s', PUSH_MAP_PATH)
+            _PUSH_MAP_CACHE['data'] = {}
+        _PUSH_MAP_CACHE['mtime'] = st.st_mtime
+    return _PUSH_MAP_CACHE['data'] or {}
+
+
+def _push_enabled():
+    try:
+        caller = Caller()
+        return bool(caller.cmd('pillar.get', 'global:push:enabled', True))
+    except Exception:
+        LOG.exception('push_pillar: pillar.get global:push:enabled failed, assuming enabled')
+        return True
+
+
+def _write_intent(key, actions, path):
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_pillar: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths', [])
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_pillar: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def _app_from_setting(setting_id):
+    # setting_id is e.g. 'telegraf.output' -> 'telegraf', 'ntp.config.servers' -> 'ntp'
+    if not setting_id:
+        return None
+    return setting_id.split('.', 1)[0] or None
+
+
+def _node_actions(entry, node_id):
+    # Copy the app's mapped actions but retarget each one to the single node.
+    # Preserves the state/highstate selection and any batch/batch_wait overrides.
+    actions = []
+    for action in entry:
+        if not isinstance(action, dict):
+            continue
+        node_action = dict(action)
+        node_action['tgt'] = node_id
+        node_action['tgt_type'] = 'glob'
+        actions.append(node_action)
+    return actions
+
+
+def run():
+    if not _push_enabled():
+        LOG.info('push_pillar: push disabled, skipping')
+        return {}
+
+    # The pillar_db beacon nests its payload under data['data']; fall back to the
+    # top level so the reactor is robust to either shape.
+    event = data.get('data', data)  # noqa: F821 -- data provided by reactor
+    setting_id = event.get('setting_id', '')
+    node_id = (event.get('node_id') or '').strip()
+
+    app = _app_from_setting(setting_id)
+    if not app:
+        LOG.debug('push_pillar: ignoring event with no app segment: setting_id=%s', setting_id)
+        return {}
+
+    push_map = _load_push_map()
+    entry = push_map.get(app)
+    if not entry:
+        LOG.warning(
+            'push_pillar: app "%s" is not in pillar_push_map.yaml; change will be '
+            'picked up at the next scheduled highstate (setting_id=%s)',
+            app, setting_id,
+        )
+        return {}
+
+    # Branch A: per-node change -> retarget the app's states to just that node.
+    if node_id:
+        actions = _node_actions(entry, node_id)
+        if not actions:
+            LOG.warning('push_pillar: no usable actions for app "%s" (setting_id=%s)', app, setting_id)
+            return {}
+        _write_intent(
+            'node_{}_{}'.format(node_id, app), actions,
+            'audit:{}@{}'.format(setting_id, node_id),
+        )
+        LOG.info('push_pillar: per-node intent updated for %s on %s (setting_id=%s)',
+                 app, node_id, setting_id)
+        return {}
+
+    # Branch B: grid-wide app change -> use the map entry's actions as-is.
+    actions = list(entry)  # copy to avoid mutating the cache
+    _write_intent('pillar_{}'.format(app), actions, 'audit:{}'.format(setting_id))
+    LOG.info('push_pillar: app intent updated for %s (setting_id=%s)', app, setting_id)
+    return {}
@@ -0,0 +1,96 @@
+#!py
+
+# Reactor invoked by the inotify beacon on rule file changes under
+# /opt/so/saltstack/local/salt/strelka/rules/compiled/.
+#
+# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_strelka.json
+# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
+# across pending files, and dispatches orch.push_batch. Reactors never dispatch
+# directly; they only record intents for the drainer to act on.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+from salt.client import Caller
+
+LOG = logging.getLogger(__name__)
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+MAX_PATHS = 20
+
+# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Sensor-side
+# strelka runs on exactly these four roles; so-import gets strelka.manager
+# instead, which is not fired on pillar changes.
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
+
+
+def _sensor_compound():
+    return ' or '.join('G@role:{}'.format(r) for r in SENSOR_ROLES)
+
+
+def _push_enabled():
+    try:
+        caller = Caller()
+        return bool(caller.cmd('pillar.get', 'global:push:enabled', True))
+    except Exception:
+        LOG.exception('push_strelka: pillar.get global:push:enabled failed, assuming enabled')
+        return True
+
+
+def _write_intent(key, actions, path):
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_strelka: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths', [])
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_strelka: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def run():
+    if not _push_enabled():
+        LOG.info('push_strelka: push disabled, skipping')
+        return {}
+
+    path = data.get('path', '')  # noqa: F821 -- data provided by reactor
+    actions = [{'state': 'strelka', 'tgt': _sensor_compound()}]
+    _write_intent('rules_strelka', actions, path)
+    LOG.info('push_strelka: intent updated for path=%s', path)
+    return {}
@@ -0,0 +1,95 @@
+#!py
+
+# Reactor invoked by the inotify beacon on rule file changes under
+# /opt/so/saltstack/local/salt/suricata/rules/.
+#
+# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_suricata.json
+# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
+# across pending files, and dispatches orch.push_batch. Reactors never dispatch
+# directly; they only record intents for the drainer to act on.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+from salt.client import Caller
+
+LOG = logging.getLogger(__name__)
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+MAX_PATHS = 20
+
+# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Suricata also
+# runs on so-import per salt/top.sls, so that role is appended below.
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
+
+
+def _sensor_compound_plus_import():
+    return ' or '.join('G@role:{}'.format(r) for r in SENSOR_ROLES) + ' or G@role:so-import'
+
+
+def _push_enabled():
+    try:
+        caller = Caller()
+        return bool(caller.cmd('pillar.get', 'global:push:enabled', True))
+    except Exception:
+        LOG.exception('push_suricata: pillar.get global:push:enabled failed, assuming enabled')
+        return True
+
+
+def _write_intent(key, actions, path):
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_suricata: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths', [])
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_suricata: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def run():
+    if not _push_enabled():
+        LOG.info('push_suricata: push disabled, skipping')
+        return {}
+
+    path = data.get('path', '')  # noqa: F821 -- data provided by reactor
+    actions = [{'state': 'suricata', 'tgt': _sensor_compound_plus_import()}]
+    _write_intent('rules_suricata', actions, path)
+    LOG.info('push_suricata: intent updated for path=%s', path)
+    return {}
@@ -17,6 +17,7 @@ include:
 so-redis:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-redis
    - user: socore
    - networks:
@@ -21,6 +21,9 @@ so-dockerregistry:
    - networks:
      - sobridge:
        - ipv4_address: {{ DOCKERMERGED.containers['so-dockerregistry'].ip }}
+    # Intentionally `always` (not unless-stopped) -- registry is critical infra
+    # and must come back up even if it was manually stopped. Do not homogenize
+    # to unless-stopped; see the container auto-restart section of the plan.
    - restart_policy: always
    - port_bindings:
      {% for BINDING in DOCKERMERGED.containers['so-dockerregistry'].port_bindings %}
@@ -3,7 +3,7 @@
 {% set SCHEDULE = salt['pillar.get']('healthcheck:schedule', 30) %}

 include:
-  - salt
+  - salt.minion

 {% if CHECKS and ENABLED %}
 salt_beacons:
@@ -14,12 +14,13 @@ salt_beacons:
    - defaults:
        CHECKS: {{ CHECKS }}
        SCHEDULE: {{ SCHEDULE }}
-    - watch_in: 
+    - watch_in:
      - service: salt_minion_service
 {% else %}
 salt_beacons:
  file.absent:
    - name: /etc/salt/minion.d/beacons.conf
-    - watch_in: 
+    - watch_in:
      - service: salt_minion_service
 {% endif %}
+
@@ -0,0 +1,11 @@
+# Maps beacon event tags to the reactor SLS that handles them. Each inotify
+# path is listed twice: once for the watched directory itself and once with a
+# trailing /* for tags that carry a path beneath it.
+reactor:
+  # suricata rule changes -> push_suricata
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/suricata/rules':
+    - salt://reactor/push_suricata.sls
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/suricata/rules/*':
+    - salt://reactor/push_suricata.sls
+  # compiled strelka rule changes -> push_strelka
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/strelka/rules/compiled':
+    - salt://reactor/push_strelka.sls
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/strelka/rules/compiled/*':
+    - salt://reactor/push_strelka.sls
+  # audit_settings events from the pillar_db beacon -> push_pillar
+  - 'salt/beacon/*/pillar_db/audit_settings':
+    - salt://reactor/push_pillar.sls
@@ -5,3 +5,11 @@ salt_bootstrap:
    - source: salt://salt/scripts/bootstrap-salt.sh
    - mode: 755
    - show_changes: False
+
+# Copy the scripts under salt://salt/tools/sbin into /usr/sbin, owned by
+# uid/gid 939 with file mode 755.
+salt_sbin:
+  file.recurse:
+    - name: /usr/sbin
+    - source: salt://salt/tools/sbin
+    - user: 939
+    - group: 939
+    - file_mode: 755
@@ -1,4 +1,4 @@
 lasthighstate:
  file.touch:
    - name: /opt/so/log/salt/lasthighstate
-    - order: last
+    - order: 9001
@@ -10,10 +10,12 @@
 #    software that is protected by the license key."

 {% from 'allowed_states.map.jinja' import allowed_states %}
+{% from 'global/map.jinja' import GLOBALMERGED %}
 {% if sls in allowed_states %}

 include:
  - salt.minion
+  - salt.master.pyinotify
  - salt.master.boot_mine_update
 {%   if 'vrt' in salt['pillar.get']('features', []) %}
  - salt.cloud
@@ -63,6 +65,21 @@ engines_config:
    - name: /etc/salt/master.d/engines.conf
    - source: salt://salt/files/engines.conf

+# Install the push reactor config only while push is enabled and remove it
+# otherwise. Both branches use the same state ID and notify
+# salt_master_service through watch_in when the file changes.
+{% if GLOBALMERGED.push.enabled %}
+reactor_pushstate_config:
+  file.managed:
+    - name: /etc/salt/master.d/reactor_pushstate.conf
+    - source: salt://salt/files/reactor_pushstate.conf
+    - watch_in:
+      - service: salt_master_service
+{% else %}
+reactor_pushstate_config:
+  file.absent:
+    - name: /etc/salt/master.d/reactor_pushstate.conf
+    - watch_in:
+      - service: salt_master_service
+{% endif %}
+
 # update the bootstrap script when used for salt-cloud
 salt_bootstrap_cloud:
  file.managed:
@@ -78,7 +95,7 @@ salt_master_service:
      - file: checkmine_engine
      - file: pillarWatch_engine
      - file: engines_config
-    - order: last
+    - order: 9002

 {% else %}

@@ -0,0 +1,20 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Mirror the bundled pyinotify package files from the fileserver to local disk.
+# clean: True removes files in the target directory that are not in the source.
+pyinotify_module_package:
+  file.recurse:
+    - name: /opt/so/conf/salt/module_packages/pyinotify
+    - source: salt://salt/module_packages/pyinotify
+    - clean: True
+    - makedirs: True
+
+# Install pyinotify into salt's bundled python from the local directory only
+# (--no-index), and only when the package files above changed. watch_in
+# notifies salt_minion_service when the install runs.
+pyinotify_python_module_install:
+  cmd.run:
+    - name: /opt/saltstack/salt/bin/python3.10 -m pip install pyinotify --no-index --find-links=/opt/so/conf/salt/module_packages/pyinotify/ --upgrade
+    - onchanges:
+      - file: pyinotify_module_package
+    - failhard: True
+    - watch_in:
+      - service: salt_minion_service
@@ -2,4 +2,3 @@
 salt:
  minion:
    version: '3006.19'
-    check_threshold: 3600 # in seconds, threshold used for so-salt-minion-check. any value less than 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default
@@ -111,13 +111,17 @@ mark_setup_complete_for_upgrades:

 {% endif %}

-# this has to be outside the if statement above since there are <requisite>_in calls to this state
+# this has to be outside the if statement above since there are <requisite>_in calls to this state.
+# uses watch (not listen) so the restart fires in-state and its result lands on this state's
+# running entry; that is what lets wait_for_salt_minion_ready below detect any restart
+# uniformly via onchanges, regardless of whether the trigger came from these files or from
+# external watch_in's (e.g. beacons, master/pyinotify).
 salt_minion_service:
  service.running:
    - name: salt-minion
    - enable: True
    - onlyif: test "{{INSTALLEDSALTVERSION}}" == "{{SALTVERSION}}"
-    - listen:
+    - watch:
      - file: mine_functions
 {% if INSTALLEDSALTVERSION|string == SALTVERSION|string %}
      - file: set_log_levels
@@ -126,3 +130,17 @@ salt_minion_service:
      - file: signing_policy
 {% endif %}
    - order: last
+
+# block until the just-restarted salt-minion is back and can execute modules locally, so
+# follow-on jobs and the next highstate iteration do not race the restart. onchanges +
+# require on salt_minion_service catches every restart trigger uniformly because watch
+# mod_watch results replace the service state's running entry. wait logic lives in
+# /usr/sbin/so-salt-minion-wait (deployed by common_sbin from common/tools/sbin/).
+wait_for_salt_minion_ready:
+  cmd.run:
+    - name: /usr/sbin/so-salt-minion-wait
+    - onchanges:
+      - service: salt_minion_service
+    - require:
+      - service: salt_minion_service
+    - order: last
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Block until the local salt-minion service is back up and can execute modules locally.
+# Invoked from the wait_for_salt_minion_ready state in salt/minion/init.sls after
+# salt_minion_service fires its watch-driven mod_watch (a non-blocking systemctl restart),
+# so follow-on jobs and the next highstate iteration do not race the in-flight restart.
+
+. /usr/sbin/so-common
+
+# Initial sleep gives the systemctl restart (--no-block by default for salt-minion on
+# >=3006.15) time to begin tearing down the old process before we probe for readiness.
+INITIAL_SLEEP=3
+TIMEOUT=120
+PING_TIMEOUT=5
+
+# SECONDS is bash's built-in elapsed-time counter. Resetting it here makes TIMEOUT a
+# wall-clock bound that includes the time each salt-call probe takes, instead of
+# counting only the one-second sleeps between probes.
+SECONDS=0
+sleep "$INITIAL_SLEEP"
+
+while [ "$SECONDS" -lt "$TIMEOUT" ]; do
+  # Ready means the unit is active AND a local test.ping succeeds.
+  if systemctl is-active --quiet salt-minion \
+     && salt-call --local --timeout="$PING_TIMEOUT" --out=quiet test.ping >/dev/null 2>&1; then
+    echo "salt-minion ready after ${SECONDS}s"
+    exit 0
+  fi
+  sleep 1
+done
+
+echo "salt-minion did not become ready within ${TIMEOUT}s" >&2
+exit 1
@@ -1,10 +1,26 @@
-{%   from 'vars/globals.map.jinja' import GLOBALS %}
+{% from 'vars/globals.map.jinja' import GLOBALS %}
+{% from 'global/map.jinja' import GLOBALMERGED %}

 highstate_schedule:
  schedule.present:
    - function: state.highstate
-    - minutes: 15
+    - hours: {{ GLOBALMERGED.push.highstate_interval_hours }}
    - maxrunning: 1
 {% if not GLOBALS.is_manager %}
-    - splay: 120
+    - splay: 1800
+{% endif %}
+
+# On managers with push enabled, schedule /usr/sbin/so-push-drainer via cmd.run
+# every drain_interval seconds, one instance at a time. On managers with push
+# disabled, remove the schedule entry. Non-managers get neither state.
+{% if GLOBALS.is_manager and GLOBALMERGED.push.enabled %}
+push_drain_schedule:
+  schedule.present:
+    - function: cmd.run
+    - job_args:
+      - /usr/sbin/so-push-drainer
+    - seconds: {{ GLOBALMERGED.push.drain_interval }}
+    - maxrunning: 1
+    - return_job: False
+{% elif GLOBALS.is_manager %}
+push_drain_schedule:
+  schedule.absent:
+    - name: push_drain_schedule
 {% endif %}
@@ -14,6 +14,7 @@ include:
 so-sensoroni:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-soc:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - network_mode: host
    - binds:
      - /nsm/import:/nsm/import:rw
@@ -1464,6 +1464,7 @@ soc:
          sigmaRulePackages:
            - core
            - emerging_threats_addon
+          useEsql: false
        elastic:
          hostUrl:
          remoteHostUrls: []
@@ -1508,8 +1509,6 @@ soc:
        assistant:
          systemPromptAddendum: ""
          systemPromptAddendumMaxLength: 50000
-          maxSubSessionTokens: 0
-          maxDelegationDepth: 0
          adapters:
            - name: SOAI
              protocol: securityonion_ai_cloud
@@ -18,6 +18,7 @@ include:
 so-soc:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-soc:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: soc
    - name: so-soc
    - networks:
@@ -383,6 +383,11 @@ soc:
            global: True
            advanced: False
            helpLink: sigma
+          useEsql:
+            description: "(Pre-release) Use Elasticsearch Query Language (ES|QL), a piped query language, instead of Event Query Language (EQL) for Elasticsearch queries. The Sigma converter will output ES|QL instead of EQL, allowing support for correlations."
+            global: True
+            advanced: True
+            forcedType: bool
        elastic:
          index:
            description: Comma-separated list of indices or index patterns (wildcard "*" supported) that SOC will search for records.
@@ -714,16 +719,6 @@ soc:
            description: Maximum length of the system prompt addendum. Longer prompts will be truncated.
            global: True
            advanced: True
-          maxSubSessionTokens:
-            description: Maximum number of output tokens a delegated sub-session may generate across all of its turns. When the budget is reached, the sub-agent is halted and its result is returned to the parent agent. Set to 0 to disable the limit.
-            global: True
-            advanced: True
-            forcedType: int
-          maxDelegationDepth:
-            description: Maximum delegation nesting depth for sub-agents. For example, a value of 2 lets the main agent delegate to a sub-agent that may itself delegate one level deeper. Any deeper delegation is refused and the requesting agent continues without it. Set to 0 to disable the limit.
-            global: True
-            advanced: True
-            forcedType: int
          adapters:
            description: Configuration for AI adapters used by the Onion AI assistant. Please see documentation for help on which fields are required for which protocols.
            global: True
@@ -47,6 +47,10 @@ strelka_backend:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
+    # Intentionally `on-failure` (not unless-stopped) -- strelka backend shuts
+    # down cleanly during rule reloads and we do not want those clean exits to
+    # trigger an auto-restart. Do not homogenize; see the container
+    # auto-restart section of the plan.
    - restart_policy: on-failure
    - watch:
      - file: strelkasensorcompiledrules
@@ -15,6 +15,7 @@ include:
 strelka_coordinator:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-strelka-coordinator
    - networks:
      - sobridge:
@@ -15,6 +15,7 @@ include:
 strelka_filestream:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - binds:
      - /opt/so/conf/strelka/filestream/:/etc/strelka/:ro
      - /nsm/strelka:/nsm/strelka
@@ -15,6 +15,7 @@ include:
 strelka_frontend:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - binds:
      - /opt/so/conf/strelka/frontend/:/etc/strelka/:ro
      - /nsm/strelka/log/:/var/log/strelka/:rw
@@ -15,6 +15,7 @@ include:
 strelka_gatekeeper:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-strelka-gatekeeper
    - networks:
      - sobridge:
@@ -15,6 +15,7 @@ include:
 strelka_manager:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - binds:
      - /opt/so/conf/strelka/manager/:/etc/strelka/:ro
      {% if DOCKERMERGED.containers['so-strelka-manager'].custom_bind_mounts %}
@@ -18,6 +18,7 @@ so-suricata:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-suricata:{{ GLOBALS.so_version }}
    - privileged: True
+    - restart_policy: unless-stopped
    - environment:
      - INTERFACE={{ GLOBALS.sensor.interface }}
      {% if DOCKERMERGED.containers['so-suricata'].extra_env %}
@@ -7,6 +7,7 @@ so-tcpreplay:
  docker_container.running:
    - network_mode: "host"
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-tcpreplay:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-tcpreplay
    - user: root
    - interactive: True
@@ -18,6 +18,7 @@ include:
 so-telegraf:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-telegraf:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - user: 939
    - group_add: 939,920
    - environment:
@@ -18,6 +18,7 @@ so-zeek:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-zeek:{{ GLOBALS.so_version }}
    - start: True
    - privileged: True
+    - restart_policy: unless-stopped
    {% if DOCKERMERGED.containers['so-zeek'].ulimits %}
    - ulimits:
    {%   for ULIMIT in DOCKERMERGED.containers['so-zeek'].ulimits %}
Author	SHA1	Message	Date
Josh Patterson	c950ac7370	Merge remote-tracking branch 'origin/3/dev' into soupmod	2026-06-22 09:41:16 -04:00
Jorge Reyes	63a2e20698	Merge pull request #15982 from Security-Onion-Solutions/reyesj2/wip don't create stack trace when set -e is disabled	2026-06-18 15:25:41 -05:00
reyesj2	22d5c96bd5	don't create stack trace when set -e is disabled	2026-06-18 14:56:29 -05:00
Mike Reeves	28fdd1eb6f	Merge pull request #15970 from Security-Onion-Solutions/udev Pin NIC names by MAC via udev (run-once) from the common state	2026-06-18 14:28:09 -04:00
Jorge Reyes	b143e1e577	Merge pull request #15979 from Security-Onion-Solutions/reyesj2/wip add context to soup errors	2026-06-17 16:47:49 -05:00
reyesj2	16149df71f	formatting	2026-06-16 18:21:28 -05:00
reyesj2	6a18f35020	add context to soup errors and optional soup debug log with xtrace output	2026-06-16 18:21:28 -05:00
Jason Ertel	aa58225e8f	Merge pull request #15974 from Security-Onion-Solutions/jertel/wip es\|ql defaults	2026-06-16 14:27:54 -04:00
Jorge Reyes	acf48db915	Merge pull request #15978 from Security-Onion-Solutions/reyesj2-patch-1 remove pillar merge	2026-06-16 11:17:56 -05:00
reyesj2	3daed551df	use --fail flag without set -x, since elasticsearch can return a 404 on the template lookup	2026-06-16 11:17:04 -05:00
reyesj2	4456bde1c8	check if template exists without --fail flag	2026-06-16 10:45:53 -05:00
Jorge Reyes	4a6c675223	skip kibana backport if the template doesn't exist	2026-06-16 10:33:11 -05:00
reyesj2	a769d4c680	another unneeded default	2026-06-16 09:32:37 -05:00
reyesj2	f68e3e47a1	remove pillar merge	2026-06-16 09:19:10 -05:00
Jorge Reyes	b81257bf45	Merge pull request #15973 from Security-Onion-Solutions/reyesj2/dlm-support Data stream lifecycle management support	2026-06-15 14:47:51 -05:00
reyesj2	1a423a2434	update message	2026-06-15 14:17:34 -05:00
reyesj2	95cae4c734	remove so-elasticsearch-indices-delete cron when using DLM	2026-06-15 13:32:45 -05:00
reyesj2	596471e140	using new annotation config	2026-06-15 13:31:53 -05:00
reyesj2	d10f21399c	remove comments	2026-06-15 13:31:23 -05:00
Jason Ertel	ae1ddf3817	es\|ql defaults	2026-06-15 12:33:08 -04:00
Mike Reeves	80c39d612c	Pin NIC names by MAC via udev (run-once) from the common state Add so-nic-pin, which writes by-MAC persistent-net udev rules pinning each physical NIC to its current name so a kernel upgrade can't renumber the interfaces Security Onion binds by name (host:mainint, sensor:mainint, bond0). Gated by the drop file /opt/so/state/nic_names_pinned: run-once on highstate, and an admin can pre-create the marker to opt out. Wired into common/init.sls as pin_nic_names, guarded by a matching unless.	2026-06-11 18:40:43 -04:00
reyesj2	c505160480	set default DLM retention 90d	2026-06-11 15:13:28 -05:00
reyesj2	d9f6cde4e1	remove global setting from data_retention annotation	2026-06-11 15:11:29 -05:00
Josh Patterson	0a69833669	Merge remote-tracking branch 'origin/3/dev' into soupmod	2026-06-10 16:19:17 -04:00
reyesj2	cf456dc58c	reuse existing index templates	2026-06-09 23:21:43 -05:00
reyesj2	9aa9ea3255	Iniitial DLM support	2026-06-09 23:19:26 -05:00
Josh Patterson	487e433589	allow full highstate on manager while master locked	2026-06-02 13:58:38 -04:00
Josh Patterson	3328ff362d	add some logging	2026-06-02 10:44:17 -04:00
Josh Patterson	8c17ae0f66	move so-salt-minion-wait	2026-06-01 14:48:54 -04:00
Josh Patterson	f54939b444	Replace inotify pillar watch with postgres audit_settings beacon The active-push feature detected pillar/settings changes via an inotify beacon on the manager watching /opt/so/saltstack/local/pillar. Replace that pillar watch with a custom salt beacon (pillar_db) that polls the SOC so_soc.audit_settings table on a monotonic id watermark, so changes made through SOC drive immediate pushes from the database instead of the files. The suricata/strelka rule inotify watches (and pyinotify) are kept unchanged, since rule-file edits are not recorded in audit_settings. - salt/_beacons/pillar_db.py: new beacon. Polls audit_settings via `docker exec so-postgres psql` (unix-socket trust auth), tracks the last processed id in /opt/so/state/pillar_db_watch.id, seeds to MAX(id) on first run (no history replay), and emits one event per new row. - salt/reactor/push_pillar.sls: consume setting_id/node_id from the beacon event instead of a file path. App = first dotted segment of setting_id, looked up in pillar_push_map.yaml. Empty node_id -> grid-wide actions as is; populated node_id -> the app's state(s) retargeted to that one node. - salt/manager/files/beacons_pushstate.conf.jinja: drop the pillar inotify block, add the pillar_db beacon (interval = push.drain_interval); keep the suricata/strelka inotify watches. - salt/salt/files/reactor_pushstate.conf: map salt/beacon/*/pillar_db/ audit_settings to push_pillar.sls; remove the pillar inotify reactor lines; keep suricata/strelka. The intent -> so-push-drainer -> orch.push_batch pipeline is unchanged. Verified end-to-end on a standalone: a grid-wide telegraf.output change re-applied telegraf fleetwide (container replaced), and a per-host ntp.config.servers change applied ntp to only that node.	2026-05-29 14:55:13 -04:00
Josh Patterson	d48a22e37e	Merge pull request #15944 from Security-Onion-Solutions/jertel/wip Jertel/wip	2026-05-28 14:01:42 -04:00
Josh Patterson	6393d08e86	merge	2026-05-27 08:59:28 -04:00
Josh Patterson	730c828bec	Merge remote-tracking branch 'origin/jertel/wip' into saltthangs	2026-05-19 10:23:45 -04:00
Josh Patterson	b4e5171415	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-14 08:03:45 -04:00
Josh Patterson	84decc1db6	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-13 14:09:15 -04:00
Josh Patterson	7d4d6a0756	prune images if so-docker-prune exists	2026-05-08 10:13:15 -04:00
Josh Patterson	66c0a662fc	convert wait to script	2026-05-08 09:26:42 -04:00
Josh Patterson	778cc055ea	wait for salt-minion service to be ready before finishing state run	2026-05-07 17:01:20 -04:00
Josh Patterson	932deab751	update the push map	2026-05-07 10:51:53 -04:00
Josh Patterson	1281f0ee37	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-06 09:46:12 -04:00
Josh Patterson	f774334b6c	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-06 08:16:41 -04:00
Josh Patterson	7fcace34c4	add sensoroni to push map	2026-04-30 16:09:08 -04:00
Josh Patterson	9541024eb7	fix broken things	2026-04-30 15:35:24 -04:00
Josh Patterson	0d166ef732	remove trailing slashes	2026-04-30 09:53:00 -04:00
Josh Patterson	f7d2994f8b	filter temp files	2026-04-30 09:16:22 -04:00
Josh Patterson	8f0757606d	include salt..minion	2026-04-29 16:42:19 -04:00
Josh Patterson	0a8f2e01a0	install pyinotify	2026-04-29 16:41:56 -04:00
Josh Patterson	4546d7bc52	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-29 14:28:19 -04:00
Josh Patterson	17849d8758	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-28 15:49:22 -04:00
Josh Patterson	d3d30a587c	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-28 15:30:31 -04:00
Josh Patterson	034711d148	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-28 10:47:29 -04:00
Mike Reeves	a0cf0489d6	reduce highstate frequency with active push for rules and pillars - schedule highstate every 2 hours (was 15 minutes); interval lives in global:push:highstate_interval_hours so the SOC admin UI can tune it and so-salt-minion-check derives its threshold as (interval + 1) * 3600 - add inotify beacon on the manager + master reactor + orch.push_batch that writes per-app intent files, with a so-push-drainer schedule on the manager that debounces, dedupes, and dispatches a single orchestration - pillar_push_map.yaml allowlists the apps whose pillar changes trigger an immediate targeted state.apply (targets verified against salt/top.sls); edits under pillar/minions/ trigger a state.highstate on that one minion - host-batch every push orchestration (batch: 25%, batch_wait: 15) so rule changes don't thundering-herd large fleets - new global:push:enabled kill-switch tears down the beacon, reactor config, and drainer schedule on the next highstate for operators who want to keep highstate-only behavior - set restart_policy: unless-stopped on 23 container states so docker recovers crashes without waiting for the next highstate; leave registry (always), strelka/backend (on-failure), kratos, and hydra alone with inline comments explaining why	2026-04-10 15:43:16 -04:00
Jason Ertel	613d31c8a6	merge	2026-03-05 11:52:09 -05:00