Merge remote-tracking branch 'origin/3/dev' into saltthangs

Merge pull request #15985 from Security-Onion-Solutions/soupmod2
allow manager two full highstates during soup, improve elastic script runtime
2026-06-23 18:58:12 +02:00 · 2026-06-23 10:32:32 -04:00 · 2026-06-22 17:02:02 -04:00 · 2026-06-22 16:56:18 -04:00 · 2026-06-22 16:52:29 -04:00 · 2026-06-22 16:52:10 -04:00
127 changed files with 5272 additions and 686 deletions
@@ -11,6 +11,7 @@ body:
        -
        - 3.0.0
        - 3.1.0
+        - 3.2.0
        - Other (please provide detail below)
    validations:
      required: true
@@ -1,17 +1,17 @@
-### 3.0.0-20260331 ISO image released on 2026/03/31
+### 3.1.0-20260528 ISO image released on 2026/05/28


 ### Download and Verify

-3.0.0-20260331 ISO image:  
-https://download.securityonion.net/file/securityonion/securityonion-3.0.0-20260331.iso
+3.1.0-20260528 ISO image:  
+https://download.securityonion.net/file/securityonion/securityonion-3.1.0-20260528.iso
 
-MD5: ECD318A1662A6FDE0EF213F5A9BD4B07  
-SHA1: E55BE314440CCF3392DC0B06BC5E270B43176D9C  
-SHA256: 7FC47405E335CBE5C2B6C51FE7AC60248F35CBE504907B8B5A33822B23F8F4D5  
+MD5: 9D6FF58DEEE24089D722C73169765B3E  
+SHA1: 2B8B816B6CEC3B7F96B3C5E040EBF502DD2C412F  
+SHA256: 62FAB57E247C843D6A04F0796D8162C732B65D82FC3E4A59D087135B9FD32912  

 Signature for ISO image:  
-https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.0.0-20260331.iso.sig
+https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.1.0-20260528.iso.sig

 Signing key:  
 https://raw.githubusercontent.com/Security-Onion-Solutions/securityonion/3/main/KEYS  
@@ -25,22 +25,22 @@ wget https://raw.githubusercontent.com/Security-Onion-Solutions/securityonion/3/

 Download the signature file for the ISO:  
 ```
-wget https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.0.0-20260331.iso.sig
+wget https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.1.0-20260528.iso.sig
 ```

 Download the ISO image:  
 ```
-wget https://download.securityonion.net/file/securityonion/securityonion-3.0.0-20260331.iso
+wget https://download.securityonion.net/file/securityonion/securityonion-3.1.0-20260528.iso
 ```

 Verify the downloaded ISO image using the signature file:  
 ```
-gpg --verify securityonion-3.0.0-20260331.iso.sig securityonion-3.0.0-20260331.iso
+gpg --verify securityonion-3.1.0-20260528.iso.sig securityonion-3.1.0-20260528.iso
 ```

 The output should show "Good signature" and the Primary key fingerprint should match what's shown below:
 ```
-gpg: Signature made Mon 30 Mar 2026 06:22:14 PM EDT using RSA key ID FE507013
+gpg: Signature made Wed 27 May 2026 03:03:59 PM EDT using RSA key ID FE507013
 gpg: Good signature from "Security Onion Solutions, LLC <info@securityonionsolutions.com>"
 gpg: WARNING: This key is not certified with a trusted signature!
 gpg:          There is no indication that the signature belongs to the owner.
@@ -0,0 +1 @@
+
@@ -1 +1 @@
-3.1.0
+3.2.0
@@ -0,0 +1,142 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Custom salt beacon that watches the SOC audit_settings table in postgres for
+# new settings changes and emits a beacon event per new row. This replaces the
+# inotify watch on /opt/so/saltstack/local/pillar -- instead of monitoring pillar
+# files on disk, we monitor the so_soc.audit_settings table that SOC writes to.
+#
+# Detection is poll-based with a monotonic `id` watermark persisted to
+# WATERMARK_FILE: each pass selects rows with id greater than the last id seen,
+# which makes it self-healing (a missed poll simply catches up on the next one).
+#
+# Each emitted event carries setting_id and node_id; the push_pillar reactor maps
+# setting_id -> app via pillar_push_map.yaml and writes a push intent, after which
+# the existing so-push-drainer / orch.push_batch pipeline takes over unchanged.
+
+import logging
+import os
+import subprocess
+
+log = logging.getLogger(__name__)
+
+WATERMARK_FILE = '/opt/so/state/pillar_db_watch.id'
+CONTAINER = 'so-postgres'
+DATABASE = 'so_soc'
+
+# Unaligned, tuples-only psql output with a field separator that cannot appear in
+# an id/setting_id/node_id, so we can split each row reliably.
+FIELD_SEP = '\x1f'
+
+
+def __virtual__():
+    # Salt loader hook. This beacon has no load-time prerequisites to check,
+    # so it always reports itself as available.
+    return True
+
+
+def validate(config):
+    # Beacon configuration check. Nothing in this module reads `config`, so
+    # any configuration is accepted. Returns an (ok, message) tuple.
+    return True, 'valid'
+
+
+def _read_watermark():
+    # Returns the last processed id, or None if the watermark has not been seeded.
+    try:
+        with open(WATERMARK_FILE, 'r') as f:
+            return int((f.read() or '').strip())
+    except (IOError, ValueError):
+        return None
+
+
+def _write_watermark(value):
+    try:
+        os.makedirs(os.path.dirname(WATERMARK_FILE), exist_ok=True)
+        tmp = WATERMARK_FILE + '.tmp'
+        with open(tmp, 'w') as f:
+            f.write(str(int(value)))
+        os.rename(tmp, WATERMARK_FILE)
+    except OSError:
+        log.exception('pillar_db beacon: failed to persist watermark to %s', WATERMARK_FILE)
+
+
+def _query(sql):
+    # Run a query against so_soc inside the so-postgres container over the unix
+    # socket (trust auth, no password). Returns stdout on success, or None on any
+    # failure so the caller can no-op and retry on the next interval.
+    cmd = [
+        'docker', 'exec', CONTAINER,
+        'psql', '-U', 'postgres', '-d', DATABASE,
+        '-tA', '-F', FIELD_SEP, '-c', sql,
+    ]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+    except subprocess.TimeoutExpired:
+        log.warning('pillar_db beacon: psql timed out')
+        return None
+    except Exception:
+        log.exception('pillar_db beacon: failed to exec psql')
+        return None
+    if result.returncode != 0:
+        log.warning('pillar_db beacon: psql failed (rc=%s): %s',
+                    result.returncode, (result.stderr or '').strip())
+        return None
+    return result.stdout
+
+
+def beacon(config):
+    retval = []
+
+    watermark = _read_watermark()
+
+    # First run / missing watermark: seed to the current MAX(id) and emit nothing
+    # so we never replay the entire settings history into a fleetwide push.
+    if watermark is None:
+        seed = _query('SELECT COALESCE(MAX(id), 0) FROM audit_settings;')
+        if seed is None:
+            return retval  # postgres not ready yet; retry next interval
+        try:
+            _write_watermark(int((seed or '0').strip() or 0))
+        except ValueError:
+            log.warning('pillar_db beacon: could not parse MAX(id) seed: %r', seed)
+        return retval
+
+    rows = _query(
+        "SELECT id, setting_id, COALESCE(node_id, '') FROM audit_settings "
+        "WHERE id > %d ORDER BY id;" % watermark
+    )
+    if rows is None:
+        return retval
+
+    max_id = watermark
+    for line in rows.splitlines():
+        # Do NOT str.strip() the whole line: Python treats the \x1f field
+        # separator (and \x1c-\x1e) as whitespace, so stripping would eat an
+        # empty trailing node_id field and make the row look malformed.
+        if not line.strip():
+            continue
+        parts = line.split(FIELD_SEP)
+        if len(parts) < 3:
+            log.warning('pillar_db beacon: skipping malformed row: %r', line)
+            continue
+        try:
+            row_id = int(parts[0])
+        except ValueError:
+            log.warning('pillar_db beacon: skipping row with non-int id: %r', line)
+            continue
+        setting_id = parts[1]
+        node_id = parts[2]
+        retval.append({
+            'tag': 'audit_settings',
+            'id': row_id,
+            'setting_id': setting_id,
+            'node_id': node_id,
+        })
+        if row_id > max_id:
+            max_id = row_id
+
+    if max_id > watermark:
+        _write_watermark(max_id)
+        log.info('pillar_db beacon: emitted %d change(s), watermark %d -> %d',
+                 len(retval), watermark, max_id)
+
+    return retval
@@ -25,9 +25,11 @@ if [ ! -f $BACKUPFILE ]; then
  # Create empty backup file
  tar -cf $BACKUPFILE -T /dev/null

-  # Loop through all paths defined in global.sls, and append them to backup file
+  # Loop through all paths defined in global.sls, and append them to backup file if they exist
  {%- for LOCATION in BACKUPLOCATIONS %}
-  tar -rf $BACKUPFILE "${EXCLUSIONS[@]}" {{ LOCATION }}
+  if [[ -d {{ LOCATION }} || -f {{ LOCATION }} ]]; then
+    tar -rf $BACKUPFILE "${EXCLUSIONS[@]}" {{ LOCATION }}
+  fi
  {%- endfor %}

 fi
@@ -130,6 +130,17 @@ common_sbin:
      - so-pcap-import
 {% endif %}

+# Pin physical NIC names by MAC (run-once) so a kernel upgrade can't renumber the
+# interfaces SO binds by name. The marker keeps it a one-time setup; an admin can
+# pre-create the marker to opt out.
+pin_nic_names:
+  cmd.run:
+    - name: /usr/sbin/so-nic-pin
+    - unless: 'test -e /opt/so/state/nic_names_pinned'
+    - require:
+      - file: common_sbin
+      - file: statedir
+
 common_sbin_jinja:
  file.recurse:
    - name: /usr/sbin
@@ -142,6 +142,11 @@ check_elastic_license() {
 	fi  
 }

+check_elasticsearch_responsive() {
+    retry 3 15 "so-elasticsearch-query / --output /dev/null --fail" ||
+        fail "Elasticsearch is not responding. Please review Elasticsearch logs /opt/so/log/elasticsearch/securityonion.log for more details. Additionally, consider running so-elasticsearch-troubleshoot."
+}
+
 check_salt_master_status() {
 	local count=0
    local attempts="${1:- 10}"
@@ -165,6 +165,8 @@ if [[ $EXCLUDE_FALSE_POSITIVE_ERRORS == 'Y' ]]; then
    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|upgrading component template"  # false positive (elasticsearch index or template names contain 'error')
    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|upgrading composable template" # false positive (elasticsearch composable template names contain 'error')
    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Error while parsing document for index \[.ds-logs-kratos-so-.*object mapping for \[file\]" # false positive (mapping error occuring BEFORE kratos index has rolled over in 2.4.210)
+    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|No such container"            # false positive (telegraf trying to run stats on an old container)
+    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|passwords do not match"       # false positive (automated hydra test)
 fi

 if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then
@@ -0,0 +1,76 @@
+#!/bin/bash
+#
+# so-nic-pin — pin physical NIC names by permanent MAC via classic by-MAC udev
+#              rules, so a kernel upgrade can't renumber them.
+#
+# Security Onion binds its management and monitor interfaces BY NAME in pillar
+# (host:mainint, sensor:mainint, and bond0 is built on a specific physical NIC).
+# A kernel upgrade can change the kernel/systemd-udevd predictable-naming output
+# and renumber those NICs (e.g. enp1s0 -> enp2s0), which breaks the grid: the
+# pillar references a name that no longer exists and bond/bridge bring-up fails.
+#
+# This writes /etc/udev/rules.d/70-persistent-net.rules pinning each PHYSICAL NIC
+# to its CURRENT name by its PERMANENT MAC, freezing the names across future kernel
+# changes. It only writes the rules file; it does NOT live-trigger a rename (the
+# rules apply on the next boot/kernel, and a live rename would be disruptive).
+#
+# Run-once: gated by the drop file /opt/so/state/nic_names_pinned. If the marker is
+# present the script does nothing, so an admin can pre-create it to opt out. Invoked
+# from the common state on every highstate; the marker keeps it a one-time setup.
+
+NET_RULES_FILE="/etc/udev/rules.d/70-persistent-net.rules"
+MARKER="/opt/so/state/nic_names_pinned"
+
+log() { echo -e "[so-nic-pin] $*"; }
+
+# Echo "<name> <permanent-mac>" for every PHYSICAL NIC. A physical NIC is backed by a
+# real device (has device/driver), which excludes bond0/sobridge/docker0/veth*/lo whose
+# MACs are dynamic and must never be pinned. The PERMANENT MAC is used (ethtool -P, with
+# fallbacks), not the current one: an enslaved bond member's current MAC is rewritten to
+# the bond's, so matching on it would be wrong/ambiguous.
+physical_nics() {
+    local path n mac
+    for path in /sys/class/net/*; do
+        n="${path##*/}"
+        # Skip loopback and anything not backed by a device with a bound driver.
+        if [ "$n" = "lo" ] || [ ! -e "${path}/device/driver" ]; then
+            continue
+        fi
+        # Preferred source: the permanent address reported by ethtool.
+        mac="$(ethtool -P "$n" 2>/dev/null | awk '/Permanent address/{print $NF}')"
+        # Fallback 1: the bonding_slave/perm_hwaddr attribute of the interface.
+        if [ -z "$mac" ] || [ "$mac" = "00:00:00:00:00:00" ]; then
+            mac="$(cat "${path}/bonding_slave/perm_hwaddr" 2>/dev/null)"
+        fi
+        # Fallback 2: the interface's current address.
+        if [ -z "$mac" ] || [ "$mac" = "00:00:00:00:00:00" ]; then
+            mac="$(cat "${path}/address" 2>/dev/null)"
+        fi
+        # Still empty or all-zero: leave this NIC out of the list.
+        if [ -z "$mac" ] || [ "$mac" = "00:00:00:00:00:00" ]; then
+            continue
+        fi
+        echo "$n $mac"
+    done
+}
+
+# Turn "<name> <mac>" lines on stdin into classic by-MAC persistent-net udev rules.
+render_net_rules() {
+    local n mac
+    # Two-line header first, then one rule per non-empty "<name> <mac>" input line.
+    printf '%s\n' \
+        "# Generated by so-nic-pin: pin NIC names by MAC so kernel upgrades can't renumber them." \
+        "# Security Onion binds its management/monitor interfaces by name; do not hand-edit."
+    while read -r n mac; do
+        if [ -z "$n" ]; then
+            continue
+        fi
+        printf 'SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="%s", NAME="%s"\n' "$mac" "$n"
+    done
+}
+
+# Main flow: bail out unless root and not yet pinned, collect the NIC list, write the
+# rules file, and only then drop the run-once marker.
+[ "$(id -u)" -eq 0 ] || exit 0                   # salt runs us as root; bail quietly otherwise
+[ -e "${MARKER}" ] && exit 0                      # run-once guard (mirrors the state's unless)
+
+nics="$(physical_nics)"
+if [ -z "${nics}" ]; then
+    log "no physical NICs detected — nothing to pin (will retry on next highstate)"
+    exit 0                                         # do NOT drop the marker; let it retry later
+fi
+
+log "pinning physical NICs by permanent MAC:"
+echo "${nics}" | sed 's/^/    /'
+
+# Keep a copy of any pre-existing rules file; the redirect below overwrites it in place.
+[ -f "${NET_RULES_FILE}" ] && cp -f "${NET_RULES_FILE}" "${NET_RULES_FILE}.bak"
+echo "${nics}" | render_net_rules > "${NET_RULES_FILE}" || {
+    log "ERROR: failed to write ${NET_RULES_FILE}"
+    exit 1
+}
+
+# The marker is created only after the rules file was written, so a failed write is
+# retried on the next run.
+# NOTE(review): a failed mkdir/touch is not checked here; the log line below reports the
+# marker as dropped either way.
+mkdir -p "$(dirname "${MARKER}")" && touch "${MARKER}"
+log "wrote ${NET_RULES_FILE} ($(grep -c '^SUBSYSTEM' "${NET_RULES_FILE}") NIC(s) pinned); dropped ${MARKER}"
@@ -1,5 +1,3 @@
-{% import_yaml 'salt/minion.defaults.yaml' as SALT_MINION_DEFAULTS -%}
-
 #!/bin/bash
 #
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
@@ -25,7 +23,8 @@ SYSTEM_START_TIME=$(date -d "$(</proc/uptime awk '{print $1}') seconds ago" +%s)
 LAST_HIGHSTATE_END=$([ -e "/opt/so/log/salt/lasthighstate" ] && date -r /opt/so/log/salt/lasthighstate +%s || echo 0)
 LAST_HEALTHCHECK_STATE_APPLY=$([ -e "/opt/so/log/salt/state-apply-test" ] && date -r /opt/so/log/salt/state-apply-test +%s || echo 0)
 # SETTING THRESHOLD TO ANYTHING UNDER 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default
-THRESHOLD={{SALT_MINION_DEFAULTS.salt.minion.check_threshold}} #within how many seconds the file /opt/so/log/salt/state-apply-test must have been touched/modified before the salt minion is restarted
+# THRESHOLD is derived from the global push highstate interval + 1 hour, so the minion-check grace period tracks the schedule automatically.
+THRESHOLD=$(( ({{ salt['pillar.get']('global:push:highstate_interval_hours', 2) }} + 1) * 3600 )) #within how many seconds the file /opt/so/log/salt/state-apply-test must have been touched/modified before the salt minion is restarted
 THRESHOLD_DATE=$((LAST_HEALTHCHECK_STATE_APPLY+THRESHOLD))

 logCmd() {
@@ -9,7 +9,8 @@
 prune_images:
  cmd.run:
    - name: so-docker-prune
-    - order: last
+    - onlyif: command -v /usr/sbin/so-docker-prune >/dev/null 2>&1
+    - order: 9000

 {% else %}

@@ -19,6 +19,7 @@ wait_for_elasticsearch:
 so-elastalert:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastalert:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: elastalert
    - name: so-elastalert
    - user: so-elastalert
@@ -15,6 +15,7 @@ include:
 so-elastic-fleet-package-registry:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-fleet-package-registry:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-elastic-fleet-package-registry
    - hostname: Fleet-package-reg-{{ GLOBALS.hostname }}
    - detach: True
@@ -16,6 +16,7 @@ include:
 so-elastic-agent:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-agent:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-elastic-agent
    - hostname: {{ GLOBALS.hostname }}
    - detach: True
@@ -9,7 +9,6 @@

 {% set CORE_ESFLEET_PACKAGES = ELASTICFLEETDEFAULTS.get('elasticfleet', {}).get('packages', {}) %}
 {% set ADDON_CONTENT_INTEGRATION_DEFAULTS = {} %}
-{% set DEBUG_STUFF = {} %}

 {% for pkg in ADDON_CONTENT_PACKAGE_COMPONENTS %}
 {%   if pkg.name in CORE_ESFLEET_PACKAGES %}
@@ -26,7 +26,9 @@ include:
 wait_for_elasticsearch_elasticfleet:
  cmd.run:
    - name: so-elasticsearch-wait
+{% endif %}

+{% if GLOBALS.role == "so-fleet" %}
 # Sync Elastic Agent artifacts to Fleet Node
 elasticagent_syncartifacts:
  file.recurse:
@@ -40,6 +42,7 @@ elasticagent_syncartifacts:
 so-elastic-fleet:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-agent:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-elastic-fleet
    - hostname: FleetServer-{{ GLOBALS.hostname }}
    - detach: True
@@ -99,6 +102,17 @@ so-elastic-fleet:
      - file: trusttheca
      - x509: etc_elasticfleet_key
      - x509: etc_elasticfleet_crt
+
+wait_for_so-elastic-fleet:
+  http.wait_for_successful_query:
+    - name: "https://localhost:8220/api/status"
+    - ssl: True
+    - verify_ssl: False
+    - status: 200
+    - wait_for: 300
+    - request_interval: 15
+    - require:
+      - docker_container: so-elastic-fleet
 {%   endif %}

 delete_so-elastic-fleet_so-status.disabled:
@@ -9,7 +9,6 @@

 {% set CORE_ESFLEET_PACKAGES = ELASTICFLEETDEFAULTS.get('elasticfleet', {}).get('packages', {}) %}
 {% set ADDON_INPUT_INTEGRATION_DEFAULTS = {} %}
-{% set DEBUG_STUFF = {} %}

 {% for pkg in ADDON_INPUT_PACKAGE_COMPONENTS %}
 {%   if pkg.name in CORE_ESFLEET_PACKAGES %}
@@ -116,7 +115,6 @@


 {%         do ADDON_INPUT_INTEGRATION_DEFAULTS.update({integration_key: integration_defaults}) %}
-{%         do DEBUG_STUFF.update({integration_key: "Generating defaults for "+ pkg.name })%}
 {%       endfor %}
 {%     endif %}
 {%   endif %}
@@ -9,16 +9,20 @@

 include:
  - elasticfleet.config
+  - kibana.enabled

 # If enabled, automatically update Fleet Logstash Outputs
-{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-import', 'so-eval'] %}
+{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration %}
+{%   if grains.role not in ['so-import', 'so-eval']%}
 so-elastic-fleet-auto-configure-logstash-outputs:
  cmd.run:
    - name: /usr/sbin/so-elastic-fleet-outputs-update
    - retry:
        attempts: 4
        interval: 30
-{% endif %}
+    - require:
+      - http: wait_for_so-kibana
+{%   endif %}

 # If enabled, automatically update Fleet Server URLs & ES Connection
 so-elastic-fleet-auto-configure-server-urls:
@@ -27,6 +31,9 @@ so-elastic-fleet-auto-configure-server-urls:
    - retry:
        attempts: 4
        interval: 30
+    - require:
+      - http: wait_for_so-kibana
+{% endif %}

 # Automatically update Fleet Server Elasticsearch URLs & Agent Artifact URLs
 so-elastic-fleet-auto-configure-elasticsearch-urls:
@@ -35,6 +42,8 @@ so-elastic-fleet-auto-configure-elasticsearch-urls:
    - retry:
        attempts: 4
        interval: 30
+    - require:
+      - http: wait_for_so-kibana

 so-elastic-fleet-auto-configure-artifact-urls:
  cmd.run:
@@ -42,6 +51,8 @@ so-elastic-fleet-auto-configure-artifact-urls:
    - retry:
        attempts: 4
        interval: 30
+    - require:
+      - http: wait_for_so-kibana

 so-elastic-fleet-package-statefile:
  file.managed:
@@ -53,7 +64,9 @@ so-elastic-fleet-package-upgrade:
    - name: /usr/sbin/so-elastic-fleet-package-upgrade
    - retry:
        attempts: 3
-        interval: 10
+        interval: 30
+    - require:
+      - http: wait_for_so-kibana
    - onchanges:
      - file: /opt/so/state/elastic_fleet_packages.txt

@@ -63,6 +76,8 @@ so-elastic-fleet-integrations:
    - retry:
        attempts: 3
        interval: 10
+    - require:
+      - http: wait_for_so-kibana

 so-elastic-agent-grid-upgrade:
  cmd.run:
@@ -70,6 +85,8 @@ so-elastic-agent-grid-upgrade:
    - retry:
        attempts: 12
        interval: 5
+    - require:
+      - http: wait_for_so-kibana

 so-elastic-fleet-integration-upgrade:
  cmd.run:
@@ -77,16 +94,22 @@ so-elastic-fleet-integration-upgrade:
    - retry:
        attempts: 3
        interval: 10
+    - require:
+      - http: wait_for_so-kibana

 {# Optional integrations script doesn't need the retries like so-elastic-fleet-integration-upgrade which loads the default integrations #}
 so-elastic-fleet-addon-integrations:
  cmd.run:
    - name: /usr/sbin/so-elastic-fleet-optional-integrations-load
+    - require:
+      - http: wait_for_so-kibana

 {% if ELASTICFLEETMERGED.config.defend_filters.enable_auto_configuration %}
 so-elastic-defend-manage-filters-file-watch:
  cmd.run:
    - name: python3 /sbin/so-elastic-defend-manage-filters.py -c /opt/so/conf/elasticsearch/curl.config -d /opt/so/conf/elastic-fleet/defend-exclusions/disabled-filters.yaml -i /nsm/securityonion-resources/event_filters/ -i /opt/so/conf/elastic-fleet/defend-exclusions/rulesets/custom-filters/ &>> /opt/so/log/elasticfleet/elastic-defend-manage-filters.log
+    - require:
+      - http: wait_for_so-kibana
    - onchanges:
      - file: elasticdefendcustom
      - file: elasticdefenddisabled
@@ -30,6 +30,94 @@ fleet_api() {
    curl -sK /opt/so/conf/elasticsearch/curl.config -L "localhost:5601/api/fleet/${QUERYPATH}" "$@" --retry 3 --retry-delay 10 --fail 2>/dev/null
 }

+# Max number of concurrent Fleet write jobs (create/update). Override via env if needed.
+MAX_FLEET_JOBS=${MAX_FLEET_JOBS:-10}
+
+# Block until fewer than MAX_FLEET_JOBS background jobs are running.
+elastic_fleet_throttle() {
+    # Re-count the running background jobs after each completed one and return
+    # as soon as the count is below MAX_FLEET_JOBS.
+    local running
+    running=$(jobs -rp | wc -l)
+    while (( running >= MAX_FLEET_JOBS )); do
+        wait -n || true
+        running=$(jobs -rp | wc -l)
+    done
+}
+
+# Load every integration JSON in a directory into a single agent policy.
+# The agent policy is fetched ONCE (not per file), and the create/update writes
+# are dispatched as throttled background jobs.
+#   $1 AGENT_POLICY     - the agent policy id/name to load integrations into
+#   $2 DIR              - directory of integration *.json files
+#   $3 LABEL           - human-readable label for log output
+#   $4 SKIP_CREATE_NAME - (optional) integration name to skip when creating (still updated if present)
+# Returns 1 if the policy cannot be fetched or if any integration failed to create/update.
+elastic_fleet_load_integrations_dir() {
+    local AGENT_POLICY=$1
+    local DIR=$2
+    local LABEL=$3
+    local SKIP_CREATE_NAME=$4
+    local POLICY_JSON FAIL_FILE OUT_DIR INTEGRATION NAME ID i
+
+    # FAIL_FILE collects one "<create|update> <file>" line per failed write; a non-empty
+    # FAIL_FILE at the end is what makes this function return 1.
+    FAIL_FILE=$(mktemp)
+    # Each job buffers its full output (header + API response) into its own file so the
+    # parent can print them grouped and in submission order after concurrent writes finish.
+    OUT_DIR=$(mktemp -d)
+    i=0
+
+    # Fetch the agent policy a single time; we look up integration ids locally below.
+    if ! POLICY_JSON=$(fleet_api "agent_policies/$AGENT_POLICY"); then
+        echo "Error: Failed to retrieve agent policy '$AGENT_POLICY'."
+        rm -f "$FAIL_FILE"
+        rm -rf "$OUT_DIR"
+        return 1
+    fi
+
+    # Reject a response that parsed but has no .item.package_policies to look ids up in.
+    if ! jq -e '.item.package_policies' <<<"$POLICY_JSON" >/dev/null 2>&1; then
+        echo "Error: Invalid agent policy response for '$AGENT_POLICY'."
+        rm -f "$FAIL_FILE"
+        rm -rf "$OUT_DIR"
+        return 1
+    fi
+
+    for INTEGRATION in "$DIR"/*.json; do
+        # With no matching files the loop variable holds the unexpanded pattern; skip it.
+        [ -e "$INTEGRATION" ] || continue
+        NAME=$(jq -r .name "$INTEGRATION")
+        # Empty ID means no package policy with this name exists in the fetched agent policy.
+        ID=$(jq -r --arg n "$NAME" '.item.package_policies[]? | select(.name==$n) | .id' <<<"$POLICY_JSON")
+
+        elastic_fleet_throttle
+        # The group below runs as a background job: stdout goes to this job's numbered
+        # file in OUT_DIR, and fd 9 is an append handle on FAIL_FILE (see the redirections
+        # after the closing brace).
+        {
+            local RESP
+            if [ -n "$ID" ]; then
+                printf "\n\n%s - Updating integration %s\n" "$LABEL" "$NAME"
+                if ! RESP=$(elastic_fleet_integration_update "$ID" "@$INTEGRATION"); then
+                    # Take a lock on fd 9 before appending the failure marker.
+                    flock 9; echo "update ${INTEGRATION##*/}" >&9
+                fi
+                printf '%s\n' "$RESP"
+            elif [ -n "$SKIP_CREATE_NAME" ] && [ "$NAME" == "$SKIP_CREATE_NAME" ]; then
+                printf "\n\n%s - Skipping creation of %s\n" "$LABEL" "$NAME"
+            else
+                printf "\n\n%s - Creating integration %s\n" "$LABEL" "$NAME"
+                if ! RESP=$(elastic_fleet_integration_create "@$INTEGRATION"); then
+                    # Take a lock on fd 9 before appending the failure marker.
+                    flock 9; echo "create ${INTEGRATION##*/}" >&9
+                fi
+                printf '%s\n' "$RESP"
+            fi
+        } >"$OUT_DIR/$(printf '%03d' "$i")" 9>>"$FAIL_FILE" &
+        i=$((i+1))
+    done
+    # Wait for every outstanding job; failures are reported through FAIL_FILE, not wait's status.
+    wait || true
+
+    # Emit per-integration output grouped and in submission order. The glob sorts the file
+    # names lexically; the zero-padded %03d names keep that equal to submission order for
+    # up to 1000 files per directory.
+    cat "$OUT_DIR"/* 2>/dev/null
+    rm -rf "$OUT_DIR"
+
+    local rc=0
+    if [ -s "$FAIL_FILE" ]; then
+        printf "\n%s: failed integrations:\n" "$LABEL"
+        cat "$FAIL_FILE"
+        rc=1
+    fi
+    rm -f "$FAIL_FILE"
+    return $rc
+}
+
 elastic_fleet_integration_check() {

    AGENT_POLICY=$1
@@ -46,7 +134,9 @@ elastic_fleet_integration_create() {

    JSON_STRING=$1

-    if ! fleet_api "package_policies" -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -XPOST -d "$JSON_STRING"; then
+    # --retry-all-errors so transient 409 conflicts (concurrent writes to the same agent
+    # policy) are retried; curl --retry alone does not retry 409.
+    if ! fleet_api "package_policies" --retry-all-errors -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -XPOST -d "$JSON_STRING"; then
        return 1
    fi
 }
@@ -77,7 +167,9 @@ elastic_fleet_integration_update() {

    JSON_STRING=$2

-    if ! fleet_api "package_policies/$UPDATE_ID" -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -XPUT -d "$JSON_STRING"; then
+    # --retry-all-errors so transient 409 conflicts (concurrent writes to the same agent
+    # policy) are retried; curl --retry alone does not retry 409.
+    if ! fleet_api "package_policies/$UPDATE_ID" --retry-all-errors -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -XPUT -d "$JSON_STRING"; then
        return 1
    fi
 }
@@ -18,99 +18,35 @@ if [ ! -f /opt/so/state/eaintegrations.txt ]; then
  # Third, configure Elastic Defend Integration separately
  /usr/sbin/so-elastic-fleet-integration-policy-elastic-defend

+  # Each group fetches its agent policy once and dispatches create/update writes concurrently.
+
  # Initial Endpoints
-  for INTEGRATION in /opt/so/conf/elastic-fleet/integrations/endpoints-initial/*.json; do
-    printf "\n\nInitial Endpoints Policy - Loading $INTEGRATION\n"
-    elastic_fleet_integration_check "endpoints-initial" "$INTEGRATION"
-    if [ -n "$INTEGRATION_ID" ]; then
-      printf "\n\nIntegration $NAME exists - Updating integration\n"
-      if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
-        echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
-        RETURN_CODE=1
-        continue
-      fi
-    else
-      printf "\n\nIntegration does not exist - Creating integration\n"
-      if ! elastic_fleet_integration_create "@$INTEGRATION"; then
-        echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
-        RETURN_CODE=1
-        continue
-      fi
-    fi
-  done
+  elastic_fleet_load_integrations_dir "endpoints-initial" \
+    /opt/so/conf/elastic-fleet/integrations/endpoints-initial "Initial Endpoints Policy" || RETURN_CODE=1

  # Grid Nodes - General
-  for INTEGRATION in /opt/so/conf/elastic-fleet/integrations/grid-nodes_general/*.json; do
-    printf "\n\nGrid Nodes Policy_General - Loading $INTEGRATION\n"
-    elastic_fleet_integration_check "so-grid-nodes_general" "$INTEGRATION"
-    if [ -n "$INTEGRATION_ID" ]; then
-      printf "\n\nIntegration $NAME exists - Updating integration\n"
-      if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
-        echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
-        RETURN_CODE=1
-        continue
-      fi
-    else
-      printf "\n\nIntegration does not exist - Creating integration\n"
-      if ! elastic_fleet_integration_create "@$INTEGRATION"; then
-        echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
-        RETURN_CODE=1
-        continue
-      fi
-    fi
-  done
+  elastic_fleet_load_integrations_dir "so-grid-nodes_general" \
+    /opt/so/conf/elastic-fleet/integrations/grid-nodes_general "Grid Nodes Policy_General" || RETURN_CODE=1

  # Grid Nodes - Heavy
-  for INTEGRATION in /opt/so/conf/elastic-fleet/integrations/grid-nodes_heavy/*.json; do
-    printf "\n\nGrid Nodes Policy_Heavy - Loading $INTEGRATION\n"
-    elastic_fleet_integration_check "so-grid-nodes_heavy" "$INTEGRATION"
-    if [ -n "$INTEGRATION_ID" ]; then
-      printf "\n\nIntegration $NAME exists - Updating integration\n"
-      if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
-        echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
-        RETURN_CODE=1
-        continue
-      fi
-    else
-      printf "\n\nIntegration does not exist - Creating integration\n"
-      if ! elastic_fleet_integration_create "@$INTEGRATION"; then
-        echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
-        RETURN_CODE=1
-        continue
-      fi
-    fi
-  done
+  elastic_fleet_load_integrations_dir "so-grid-nodes_heavy" \
+    /opt/so/conf/elastic-fleet/integrations/grid-nodes_heavy "Grid Nodes Policy_Heavy" || RETURN_CODE=1

-  # Fleet Server - Optional integrations
-  for INTEGRATION in /opt/so/conf/elastic-fleet/integrations-optional/FleetServer*/*.json; do
-    if ! [ "$INTEGRATION" == "/opt/so/conf/elastic-fleet/integrations-optional/FleetServer*/*.json" ]; then
-      FLEET_POLICY=`echo "$INTEGRATION"| cut -d'/' -f7`
-      printf "\n\nFleet Server Policy - Loading $INTEGRATION\n"
-      elastic_fleet_integration_check "$FLEET_POLICY" "$INTEGRATION"
-      if [ -n "$INTEGRATION_ID" ]; then
-        printf "\n\nIntegration $NAME exists - Updating integration\n"
-        if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
-          echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
-          RETURN_CODE=1
-          continue
-        fi
-      else
-        printf "\n\nIntegration does not exist - Creating integration\n"
-        if [ "$NAME" != "elasticsearch-logs" ]; then
-          if ! elastic_fleet_integration_create "@$INTEGRATION"; then
-            echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
-            RETURN_CODE=1
-            continue
-          fi
-        fi
-      fi
-    fi
+  # Fleet Server - Optional integrations (one agent policy per FleetServer_* directory)
+  for FLEET_DIR in /opt/so/conf/elastic-fleet/integrations-optional/FleetServer*/; do
+    [ -d "$FLEET_DIR" ] || continue
+    FLEET_POLICY=$(basename "$FLEET_DIR")
+    elastic_fleet_load_integrations_dir "$FLEET_POLICY" \
+      "${FLEET_DIR%/}" "Fleet Server Policy" "elasticsearch-logs" || RETURN_CODE=1
  done

  # Only create the state file if all policies were created/updated successfully
-  if [[ "$RETURN_CODE" != "1" ]]; then
+  if [[ $RETURN_CODE -eq 0 ]]; then
    touch /opt/so/state/eaintegrations.txt
+  else
+    exit 1
  fi
 else
-  exit $RETURN_CODE
+  echo "Fleet integration policies already loaded."
+  exit 0
 fi
@@ -23,73 +23,90 @@ if [ $? -ne 0 ]; then
 fi

 default_packages=({% for pkg in SUPPORTED_PACKAGES %}"{{ pkg }}"{% if not loop.last %} {% endif %}{% endfor %})
+# JSON array of the default packages, used by the jq filter below.
+default_packages_json=$(printf '%s\n' "${default_packages[@]}" | jq -R . | jq -s '.')
+
+# Output lock (serializes concurrent job output) and failure file (one marker line per
+# failed integration). Mirrors the pattern used by elastic_fleet_load_integrations_dir.
+# Abort if either temp file cannot be created: with an empty path the background jobs could
+# neither take the lock nor record a failure, and the run would be reported as successful.
+if ! OUTPUT_LOCK=$(mktemp); then
+    echo "Error: Failed to create output lock file"
+    exit 1
+fi
+if ! FAIL_FILE=$(mktemp); then
+    echo "Error: Failed to create failure marker file"
+    rm -f "$OUTPUT_LOCK"
+    exit 1
+fi
+# Wait for in-flight background jobs before removing the files, so an early `exit 1`
+# (failed policy fetch or version lookup) does not delete them out from under running jobs
+# or leave orphaned upgrade jobs behind when salt retries the script.
+trap 'wait; rm -f "$OUTPUT_LOCK" "$FAIL_FILE"' EXIT
+
+# Cache of package name -> latest available version, so the same package is only looked up
+# once instead of once per (policy, integration).
+declare -A LATEST_VERSION_CACHE

-ERROR=false
 for AGENT_POLICY in $agent_policies; do
-    if ! integrations=$(elastic_fleet_integration_policy_names "$AGENT_POLICY"); then
+    # Fetch the agent policy a single time; package name/version and integration id are all
+    # extracted locally below instead of re-fetching the same policy per integration.
+    if ! POLICY_JSON=$(fleet_api "agent_policies/$AGENT_POLICY"); then
        # this script upgrades default integration packages, exit 1 and let salt handle retrying
        exit 1
    fi
-    for INTEGRATION in $integrations; do
-        if ! [[ "$INTEGRATION" == "elastic-defend-endpoints" ]] && ! [[ "$INTEGRATION" == "fleet_server-"* ]]; then
-            # Get package name so we know what package to look for when checking the current and latest available version
-            if ! PACKAGE_NAME=$(elastic_fleet_integration_policy_package_name "$AGENT_POLICY" "$INTEGRATION"); then
+
+    # One jq pass emits name/package.name/package.version/id for every eligible integration.
+    # The endpoint/fleet_server skips and the default-package gate are applied here in jq.
+    # $defaults (not $def, a jq reserved keyword) holds the default package list.
+    while IFS=$'\t' read -r INTEGRATION PACKAGE_NAME PACKAGE_VERSION INTEGRATION_ID; do
+        [ -n "$INTEGRATION" ] || continue
+
+        # Look up the latest available version once per package, then memoize it.
+        if [[ -z "${LATEST_VERSION_CACHE[$PACKAGE_NAME]+set}" ]]; then
+            if ! AVAILABLE_VERSION=$(elastic_fleet_package_latest_version_check "$PACKAGE_NAME"); then
+                echo "Error: Failed getting latest version for $PACKAGE_NAME"
                exit 1
            fi
-            {%- if not AUTO_UPGRADE_INTEGRATIONS %}
-            if [[ " ${default_packages[@]} " =~ " $PACKAGE_NAME " ]]; then
-            {%- endif %}
-                # Get currently installed version of package
-                attempt=0
-                max_attempts=3
-                while [ $attempt -lt $max_attempts ]; do
-                    if PACKAGE_VERSION=$(elastic_fleet_integration_policy_package_version "$AGENT_POLICY" "$INTEGRATION") && AVAILABLE_VERSION=$(elastic_fleet_package_latest_version_check "$PACKAGE_NAME"); then
-                        break
-                    fi
-                    attempt=$((attempt + 1))
-                done
-                if [ $attempt -eq $max_attempts ]; then
-                    echo "Error: Failed getting $PACKAGE_VERSION or $AVAILABLE_VERSION"
-                    exit 1
-                fi
-
-                # Get integration ID
-                if ! INTEGRATION_ID=$(elastic_fleet_integration_id "$AGENT_POLICY" "$INTEGRATION"); then
-                    exit 1
-                fi
-
-                if [[ "$PACKAGE_VERSION" != "$AVAILABLE_VERSION" ]]; then
-                    # Dry run of the upgrade
-                    echo ""
-                    echo "Current $PACKAGE_NAME package version ($PACKAGE_VERSION) is not the same as the latest available package ($AVAILABLE_VERSION)..."
-                    echo "Upgrading $INTEGRATION..."
-                    echo "Starting dry run..."
-                    if ! DRYRUN_OUTPUT=$(elastic_fleet_integration_policy_dryrun_upgrade "$INTEGRATION_ID"); then
-                        exit 1
-                    fi
-                    DRYRUN_ERRORS=$(echo "$DRYRUN_OUTPUT" | jq .[].hasErrors)
-
-                    # If no errors with dry run, proceed with actual upgrade
-                    if [[ "$DRYRUN_ERRORS" == "false" ]]; then
-                        echo "No errors detected. Proceeding with upgrade..."
-                        if ! elastic_fleet_integration_policy_upgrade "$INTEGRATION_ID"; then
-                            echo "Error: Upgrade failed for $PACKAGE_NAME with integration ID '$INTEGRATION_ID'."
-                            ERROR=true
-                            continue
-                        fi
-                    else
-                        echo "Errors detected during dry run for $PACKAGE_NAME policy upgrade..."
-                        ERROR=true
-                        continue
-                    fi
-                fi
-            {%- if not AUTO_UPGRADE_INTEGRATIONS %}
-            fi
-            {%- endif %}
+            LATEST_VERSION_CACHE[$PACKAGE_NAME]=$AVAILABLE_VERSION
        fi
-    done
+        AVAILABLE_VERSION=${LATEST_VERSION_CACHE[$PACKAGE_NAME]}
+
+        if [[ "$PACKAGE_VERSION" != "$AVAILABLE_VERSION" ]]; then
+            # Dry run, then (if clean) the actual upgrade, dispatched as a throttled background
+            # job. Each job builds its full log into one block, then flushes it under a single
+            # shared lock (OUTPUT_LOCK) so concurrent jobs never interleave on stdout; a failed
+            # job also appends a marker line to FAIL_FILE while holding that same lock.
+            elastic_fleet_throttle
+            {
+                block=$'\n'"Current $PACKAGE_NAME package version ($PACKAGE_VERSION) is not the same as the latest available package ($AVAILABLE_VERSION)..."$'\n'
+                block+="Upgrading $INTEGRATION..."$'\n'"Starting dry run..."$'\n'
+                fail=""
+                if ! DRYRUN_OUTPUT=$(elastic_fleet_integration_policy_dryrun_upgrade "$INTEGRATION_ID"); then
+                    block+="Error: Failed to complete dry run for '$INTEGRATION_ID'."$'\n'
+                    fail="dryrun $INTEGRATION"
+                elif [[ "$(jq .[].hasErrors <<<"$DRYRUN_OUTPUT")" == "false" ]]; then
+                    block+="No errors detected. Proceeding with upgrade..."$'\n'
+                    if ! elastic_fleet_integration_policy_upgrade "$INTEGRATION_ID"; then
+                        block+="Error: Upgrade failed for $PACKAGE_NAME with integration ID '$INTEGRATION_ID'."$'\n'
+                        fail="upgrade $INTEGRATION"
+                    fi
+                else
+                    block+="Errors detected during dry run for $PACKAGE_NAME policy upgrade..."$'\n'
+                    fail="dryrun-errors $INTEGRATION"
+                fi
+                {
+                    flock 9
+                    printf '%s' "$block"
+                    [ -n "$fail" ] && printf '%s\n' "$fail" >>"$FAIL_FILE"
+                } 9>>"$OUTPUT_LOCK"
+            } &
+        fi
+    # Row producer for the loop above. The loop reads with IFS=$'\t', and tab is IFS
+    # whitespace, so consecutive tabs collapse into one delimiter: a single null/empty field
+    # would shift every later field left (e.g. the integration id would land in PACKAGE_NAME).
+    # Rows are therefore only emitted when all four fields are non-empty strings; an entry
+    # missing any of them cannot be version-compared or upgraded anyway.
+    # The name filter tolerates a missing .name so one malformed entry cannot abort the whole
+    # jq program (startswith errors on non-string input).
+    done < <(jq -r --argjson defaults "$default_packages_json" '
+        .item.package_policies[]
+        | select(.name != "elastic-defend-endpoints")
+        | select((.name // "") | startswith("fleet_server-") | not)
+        {%- if not AUTO_UPGRADE_INTEGRATIONS %}
+        | select(.package.name | IN($defaults[]))
+        {%- endif %}
+        | [.name, .package.name, .package.version, .id]
+        | select(all(.[]; type == "string" and length > 0))
+        | @tsv
+    ' <<<"$POLICY_JSON")
 done
-if [[ "$ERROR" == "true" ]]; then
+
+# Barrier: wait for every dispatched dry-run/upgrade job to finish.
+wait
+
+if [ -s "$FAIL_FILE" ]; then
+    printf '\nFailed integration upgrades:\n'
+    cat "$FAIL_FILE"
    exit 1
 fi
 echo
@@ -16,7 +16,6 @@
 STATE_FILE_SUCCESS=/opt/so/state/estemplates.txt
 INSTALLED_PACKAGE_LIST=/tmp/esfleet_installed_packages.json
 BULK_INSTALL_PACKAGE_LIST=/tmp/esfleet_bulk_install.json
-BULK_INSTALL_PACKAGE_TMP=/tmp/esfleet_bulk_install_tmp.json
 BULK_INSTALL_OUTPUT=/opt/so/state/esfleet_bulk_install_results.json
 INTEGRATION_PACKAGE_COMPONENTS=/opt/so/state/esfleet_package_components.json
 INPUT_PACKAGE_COMPONENTS=/opt/so/state/esfleet_input_package_components.json
@@ -29,29 +28,6 @@ PENDING_UPDATE=false
 #   Requiring some level of manual Elastic Stack configuration before installation
 EXCLUDED_INTEGRATIONS=('apm')

-version_conversion(){
-    version=$1
-    echo "$version" | awk -F '.' '{ printf("%d%03d%03d\n", $1, $2, $3); }'
-}
-
-compare_versions() {
-    version1=$1
-    version2=$2
-
-    # Convert versions to numbers
-    num1=$(version_conversion "$version1")
-    num2=$(version_conversion "$version2")
-
-    # Compare using bc
-    if (( $(echo "$num1 < $num2" | bc -l) )); then
-        echo "less"
-    elif (( $(echo "$num1 > $num2" | bc -l) )); then
-        echo "greater"
-    else
-        echo "equal"
-    fi
-}
-
 IFS=$'\n'
 agent_policies=$(elastic_fleet_agent_policy_ids)
 if [ $? -ne 0 ]; then
@@ -63,23 +39,23 @@ default_packages=({% for pkg in SUPPORTED_PACKAGES %}"{{ pkg }}"{% if not loop.l

 in_use_integrations=()

+# Fetch each agent policy once; its package_policies[] already contain both the integration name
+#  and the .package.name, so extract all non-default package names locally in a single jq instead
+#  of re-fetching the same policy per integration.
+default_packages_json=$(printf '%s\n' "${default_packages[@]}" | jq -R . | jq -s '.')
 for AGENT_POLICY in $agent_policies; do

-    if ! integrations=$(elastic_fleet_integration_policy_names "$AGENT_POLICY"); then
+    if ! policy_json=$(fleet_api "agent_policies/$AGENT_POLICY"); then
        # skip the agent policy if we can't get required info, let salt retry. Integrations loaded by this script are non-default integrations.
        echo "Skipping $AGENT_POLICY.. "
        continue
    fi
-    for INTEGRATION in $integrations; do
-        if ! PACKAGE_NAME=$(elastic_fleet_integration_policy_package_name "$AGENT_POLICY" "$INTEGRATION"); then
-            echo  "Not adding $INTEGRATION, couldn't get package name"
-            continue
-        fi
-        # non-default integrations that are in-use in any policy
-        if ! [[ " ${default_packages[@]} " =~ " $PACKAGE_NAME " ]]; then
-            in_use_integrations+=("$PACKAGE_NAME")
-        fi
-    done
+    # non-default integrations that are in-use in any policy
+    #  jq prints one package name per line for every package policy in this agent policy whose
+    #  package is not in $defaults: index($n) yields null when the name is absent from the
+    #  array, so `not` keeps exactly those names.
+    #  Names repeated across policies are appended as-is here; they are de-duplicated later
+    #  when INUSE_JSON is built with `unique`.
+    while IFS= read -r PACKAGE_NAME; do
+        [ -n "$PACKAGE_NAME" ] && in_use_integrations+=("$PACKAGE_NAME")
+    done < <(jq -r --argjson defaults "$default_packages_json" \
+        '.item.package_policies[].package.name | select(. as $n | ($defaults | index($n)) | not)' \
+        <<<"$policy_json")
 done

 if [[ -f $STATE_FILE_SUCCESS  ]]; then
@@ -90,72 +66,55 @@ if [[ -f $STATE_FILE_SUCCESS  ]]; then
        rm -f $INSTALLED_PACKAGE_LIST
        echo $latest_package_list | jq '{packages: [.items[] | {name: .name, latest_version: .version, installed_version: .installationInfo.version, subscription: .conditions.elastic.subscription }]}' >> $INSTALLED_PACKAGE_LIST

-        while read -r package; do
-            # get package details
-            package_name=$(echo "$package" | jq -r '.name')
-            latest_version=$(echo "$package" | jq -r '.latest_version')
-            installed_version=$(echo "$package" | jq -r '.installed_version')
-            subscription=$(echo "$package" | jq -r '.subscription')
-            bulk_package=$(echo "$package" | jq '{name: .name, version: .latest_version}' )
+        # Build the bulk install list and the per-package status messages with two jq passes
+        #  instead of a per-package bash loop. The old loop forked ~10 processes per package
+        #  (5 jq + awk/bc for the version compare) and re-parsed/rewrote a growing JSON file on
+        #  every add (O(n^2)). Selection and messages below are identical to that logic.
+        SUB={% if SUB %}true{% else %}false{% endif %}
+        AUTOUP={% if AUTO_UPGRADE_INTEGRATIONS %}true{% else %}false{% endif %}
+        EXCLUDED_JSON=$(printf '%s\n' "${EXCLUDED_INTEGRATIONS[@]}" | jq -R 'select(length>0)' | jq -s '.')
+        INUSE_JSON=$(printf '%s\n' "${in_use_integrations[@]}" | jq -R 'select(length>0)' | jq -s 'unique')

-            if [[ ! "${EXCLUDED_INTEGRATIONS[@]}" =~ "$package_name" ]]; then
-            {% if not SUB %}
-                if [[ "$subscription" != "basic" && "$subscription" != "null" && -n "$subscription" ]]; then
-                    # pass over integrations that require non-basic elastic license
-                    echo "$package_name integration requires an Elastic license of $subscription or greater... skipping"
-                    continue
-                else
-                    if [[ "$installed_version" == "null" || -z "$installed_version" ]]; then
-                        echo "$package_name is not installed... Adding to next update."
-                        jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
+        # vnum replicates the previous version_conversion (%d%03d%03d of the first three dotted
+        #  fields); needs() replicates the excluded/subscription/installed/upgrade/in-use logic.
+        JQ_DECISION='
+def vnum:
+  [ (split(".")|.[0:3][] | gsub("[^0-9].*";"") | (if .=="" then "0" else . end) | tonumber) ]
+  | (.[0]//0)*1000000 + (.[1]//0)*1000 + (.[2]//0);
+def needs($sub;$autoup;$excluded;$inuse):
+  .name as $n
+  | ($n | IN($excluded[]) | not)
+  and ( $sub or (.subscription==null or .subscription=="basic" or .subscription=="") )
+  and ( (.installed_version==null or .installed_version=="")
+        or ( ((.latest_version|vnum) > (.installed_version|vnum))
+             and ( $autoup or ($n | IN($inuse[]) | not) ) ) );'

-                        PENDING_UPDATE=true
-                    else
-                        results=$(compare_versions "$latest_version" "$installed_version")
-                        if [ $results == "greater" ]; then
-                            {#- When auto_upgrade_integrations is false, skip upgrading in_use_integrations  #}
-                            {%- if not AUTO_UPGRADE_INTEGRATIONS %}
-                            if ! [[ " ${in_use_integrations[@]} " =~ " $package_name " ]]; then
-                            {%- endif %}
-                                echo "$package_name is at version $installed_version latest version is $latest_version... Adding to next update."
-                                jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
+        JQ_ARGS=(--argjson sub "$SUB" --argjson autoup "$AUTOUP" --argjson excluded "$EXCLUDED_JSON" --argjson inuse "$INUSE_JSON")

-                                PENDING_UPDATE=true
-                            {%- if not AUTO_UPGRADE_INTEGRATIONS %}
-                            else
-                                echo "skipping available upgrade for in use integration - $package_name."
-                            fi
-                            {%- endif %}
-                        fi
-                    fi
-                fi
-            {% else %}
-                if [[ "$installed_version" == "null" || -z "$installed_version" ]]; then
-                    echo "$package_name is not installed... Adding to next update."
-                    jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
-                    PENDING_UPDATE=true
-                else
-                    results=$(compare_versions "$latest_version" "$installed_version")
-                    if [ $results == "greater" ]; then
-                        {#- When auto_upgrade_integrations is false, skip upgrading in_use_integrations  #}
-                        {%- if not AUTO_UPGRADE_INTEGRATIONS %}
-                        if ! [[ " ${in_use_integrations[@]} " =~ " $package_name " ]]; then
-                        {%- endif %}
-                            echo "$package_name is at version $installed_version latest version is $latest_version... Adding to next update."
-                            jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
-                            PENDING_UPDATE=true
-                        {%- if not AUTO_UPGRADE_INTEGRATIONS %}
-                        else
-                            echo "skipping available upgrade for in use integration - $package_name."
-                        fi
-                        {%- endif %}
-                    fi
-                fi
-            {% endif %}
-            else
-                echo "Skipping $package_name..."
-            fi
-        done <<< "$(jq -c '.packages[]' "$INSTALLED_PACKAGE_LIST")"
+        # (a) Per-package status messages (parity with the previous echo output).
+        jq -r "${JQ_ARGS[@]}" "$JQ_DECISION"'
+          .packages[]
+          | .name as $n
+          | if ($n|IN($excluded[])) then "Skipping \($n)..."
+            elif (($sub|not) and (.subscription!=null and .subscription!="basic" and .subscription!="")) then
+                 "\($n) integration requires an Elastic license of \(.subscription) or greater... skipping"
+            elif (.installed_version==null or .installed_version=="") then
+                 "\($n) is not installed... Adding to next update."
+            elif ((.latest_version|vnum) > (.installed_version|vnum)) then
+                 (if ($autoup or ($n|IN($inuse[])|not))
+                  then "\($n) is at version \(.installed_version) latest version is \(.latest_version)... Adding to next update."
+                  else "skipping available upgrade for in use integration - \($n)." end)
+            else empty end
+        ' "$INSTALLED_PACKAGE_LIST"
+
+        # (b) The bulk install list, built in a single pass.
+        jq "${JQ_ARGS[@]}" "$JQ_DECISION"'
+          {packages: [ .packages[] | select(needs($sub;$autoup;$excluded;$inuse)) | {name, version: .latest_version} ]}
+        ' "$INSTALLED_PACKAGE_LIST" > "$BULK_INSTALL_PACKAGE_LIST"
+
+        if jq -e '.packages | length > 0' "$BULK_INSTALL_PACKAGE_LIST" >/dev/null; then
+            PENDING_UPDATE=true
+        fi

        if [ "$PENDING_UPDATE" = true ]; then
            # Run chunked install of packages
@@ -8,18 +8,33 @@

 . /usr/sbin/so-elastic-fleet-common

+PKG_LOAD_FAILURES=0
+PKG_LOAD_FAILURES_NAMES=()
+
 {%- for PACKAGE in SUPPORTED_PACKAGES %}
 echo "Upgrading {{ PACKAGE }} package..."
 if VERSION=$(elastic_fleet_package_latest_version_check "{{ PACKAGE }}"); then
    if ! elastic_fleet_package_install "{{ PACKAGE }}" "$VERSION"; then
-        # exit 1 on failure to upgrade a default package, allow salt to handle retries
-        echo -e "\nERROR: Failed to upgrade $PACKAGE to version: $VERSION"
-        exit 1
+        PKG_LOAD_FAILURES=$((PKG_LOAD_FAILURES + 1))
+        PKG_LOAD_FAILURES_NAMES+=("{{ PACKAGE }}")
    fi
 else
-    echo -e "\nERROR: Failed to get version information for integration $PACKAGE"
+    PKG_LOAD_FAILURES=$((PKG_LOAD_FAILURES + 1))
+    PKG_LOAD_FAILURES_NAMES+=("{{ PACKAGE }}")
 fi
 echo
 {%- endfor %}
+
+if [ $PKG_LOAD_FAILURES -gt 0 ]; then
+    echo "ERROR: Failed to upgrade $PKG_LOAD_FAILURES package(s):"
+    for PKG in "${PKG_LOAD_FAILURES_NAMES[@]}"; do
+        echo " - $PKG"
+    done
+    # exit 1 on failure to upgrade a default package, allow salt to handle retries
+    exit 1
+else
+    echo "Successfully upgraded all packages."
+fi
+
 echo
 /usr/sbin/so-elasticsearch-templates-load
@@ -9,9 +9,12 @@
 {%   from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
 {%   from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS, SO_MANAGED_INDICES %}
 {%   if GLOBALS.role != 'so-heavynode' %}
-{%     from 'elasticsearch/template.map.jinja' import ALL_ADDON_SETTINGS %}
+{%     from 'elasticsearch/template.map.jinja' import ALL_ADDON_SETTINGS, ADDON_INDICES %}
 {%   endif %}

+include:
+  - elasticsearch.enabled
+
 escomponenttemplates:
  file.recurse:
    - name: /opt/so/conf/elasticsearch/templates/component
@@ -35,6 +38,20 @@ so_index_template_dir:
      {%- endfor %}
    {%- endif %}

+{%  if GLOBALS.role != "so-heavynode" %}
+# Clean up legacy and non-SO managed templates from the elasticsearch/templates/addon-index/ directory
+addon_index_template_dir:
+  file.directory:
+    - name: /opt/so/conf/elasticsearch/templates/addon-index
+    - clean: True
+    {%- if ADDON_INDICES %}
+    - require:
+      {%- for index in ADDON_INDICES %}
+      - file: addon_index_template_{{index}}
+      {%- endfor %}
+    {%- endif %}
+{%  endif %}
+
 # Auto-generate index templates for SO managed indices (directly defined in elasticsearch/defaults.yaml)
 #   These index templates are for the core SO datasets and are always required
 {%  for index, settings in ES_INDEX_SETTINGS.items() %}
@@ -116,6 +133,18 @@ so-elasticsearch-templates:
      - docker_container: so-elasticsearch
      - file: elasticsearch_sbin_jinja

+so-elasticsearch-dlm-apply:
+  cmd.run:
+    - name: /usr/sbin/so-elasticsearch-dlm-apply
+    - cwd: /opt/so
+    - require:
+      - docker_container: so-elasticsearch
+      - file: elasticsearch_sbin_jinja
+      - cmd: so-elasticsearch-templates
+    - retry:
+        attempts: 3
+        interval: 10
+
 so-elasticsearch-pipelines:
  cmd.run:
    - name: /usr/sbin/so-elasticsearch-pipelines {{ GLOBALS.hostname }}
@@ -136,7 +165,8 @@ so-elasticsearch-roles-load:
 {%    set ap = "absent" %}
 {%  endif %}
 {%  if grains.role in ['so-eval', 'so-standalone', 'so-heavynode'] %}
-{%    if ELASTICSEARCHMERGED.index_clean %}
+{#    Remove so-elasticsearch-indices-delete script when using DLM #}
+{%    if ELASTICSEARCHMERGED.index_clean and ELASTICSEARCHMERGED.data_retention_method == "ILM" %}
 {%      set ap = "present" %}
 {%    else %}
 {%      set ap = "absent" %}
@@ -2,6 +2,7 @@ elasticsearch:
  enabled: false
  version: 9.3.3
  index_clean: true
+  data_retention_method: DLM
  vm:
    max_map_count: 1048576
  config:
@@ -63,6 +64,8 @@ elasticsearch:
            verification_mode: none
  index_settings:
    global_overrides:
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        template:
          settings:
@@ -143,6 +146,8 @@ elasticsearch:
                order: desc
    so-common:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -304,6 +309,8 @@ elasticsearch:
              number_of_shards: 1
    so-assistant-chat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: ""
      index_template:
        composed_of:
        - assistant-chat-mappings
@@ -344,6 +351,8 @@ elasticsearch:
            min_age: 0ms
    so-assistant-session:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: ""
      index_template:
        composed_of:
        - assistant-session-mappings
@@ -497,6 +506,8 @@ elasticsearch:
            min_age: 30d
    so-idh:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -605,6 +616,8 @@ elasticsearch:
            min_age: 30d
    so-import:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -787,6 +800,8 @@ elasticsearch:
            min_age: 0ms
    so-kismet:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - kismet-mappings
@@ -836,6 +851,8 @@ elasticsearch:
            min_age: 30d
    so-kratos:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -904,6 +921,8 @@ elasticsearch:
            min_age: 30d
    so-hydra:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -1049,6 +1068,8 @@ elasticsearch:
            min_age: 0ms
    so-logs:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - so-data-streams-mappings
@@ -1129,6 +1150,8 @@ elasticsearch:
            min_age: 30d
    so-logs-detections_x_alerts:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - so-data-streams-mappings
@@ -1192,6 +1215,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1307,6 +1332,8 @@ elasticsearch:
            min_age: 30d
    so-elastic-agent-monitor:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1369,6 +1396,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_apm_server:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.apm_server@package
@@ -1433,6 +1462,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_auditbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.auditbeat@package
@@ -1497,6 +1528,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_cloudbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.cloudbeat@package
@@ -1561,6 +1594,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_endpoint_security:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1620,6 +1655,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_filebeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1679,6 +1716,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_fleet_server:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1735,6 +1774,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_heartbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.heartbeat@package
@@ -1799,6 +1840,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_metricbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1858,6 +1901,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_osquerybeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -1917,6 +1962,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elastic_agent_x_packetbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elastic_agent.packetbeat@package
@@ -1981,6 +2028,8 @@ elasticsearch:
            min_age: 30d
    so-logs-elasticsearch_x_server:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-elasticsearch.server@package
@@ -2045,10 +2094,13 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_actions:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - .logs-endpoint.actions@package
        - .logs-endpoint.actions@custom
+        - endpoint@custom
        - event-mappings
        - so-fleet_integrations.ip_mappings-1
        - so-fleet_globals-1
@@ -2058,8 +2110,9 @@ elasticsearch:
          hidden: false
        ignore_missing_component_templates:
        - .logs-endpoint.actions@custom
+        - endpoint@custom
        index_patterns:
-        - logs-endpoint.actions-*
+        - .logs-endpoint.actions-*
        priority: 501
        template:
          settings:
@@ -2104,10 +2157,13 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_action_x_responses:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - .logs-endpoint.action.responses@package
        - .logs-endpoint.action.responses@custom
+        - endpoint@custom
        - event-mappings
        - so-fleet_integrations.ip_mappings-1
        - so-fleet_globals-1
@@ -2117,14 +2173,15 @@ elasticsearch:
          hidden: false
        ignore_missing_component_templates:
        - .logs-endpoint.action.responses@custom
+        - endpoint@custom
        index_patterns:
-        - logs-endpoint.action.responses-*
+        - .logs-endpoint.action.responses-*
        priority: 501
        template:
          settings:
            index:
              lifecycle:
-                name: so-logs-endpoint.actions-logs
+                name: so-logs-endpoint.action.responses-logs
              mapping:
                total_fields:
                  limit: 5000
@@ -2163,6 +2220,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_alerts:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.alerts@package
@@ -2222,6 +2281,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_diagnostic_x_collection:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - .logs-endpoint.diagnostic.collection@package
@@ -2297,6 +2358,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_api:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.api@package
@@ -2356,6 +2419,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_file:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.file@package
@@ -2415,6 +2480,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_library:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.library@package
@@ -2474,6 +2541,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_network:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.network@package
@@ -2533,6 +2602,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_process:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.process@package
@@ -2592,6 +2663,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_registry:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.registry@package
@@ -2651,6 +2724,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_events_x_security:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-endpoint.events.security@package
@@ -2710,6 +2785,8 @@ elasticsearch:
            min_age: 30d
    so-logs-endpoint_x_heartbeat:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - .logs-endpoint.heartbeat@package
@@ -2769,6 +2846,8 @@ elasticsearch:
            min_age: 30d
    so-logs-http_endpoint_x_generic:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-http_endpoint.generic@package
@@ -2817,6 +2896,8 @@ elasticsearch:
            min_age: 30d
    so-logs-httpjson_x_generic:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-httpjson.generic@package
@@ -2882,6 +2963,8 @@ elasticsearch:
              number_of_replicas: 0
    so-logs-osquery-manager_x_action_x_responses:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        _meta:
          managed: true
@@ -2953,6 +3036,8 @@ elasticsearch:
              number_of_replicas: 0
    so-logs-osquery-manager_x_result:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        _meta:
          managed: true
@@ -3005,6 +3090,8 @@ elasticsearch:
            min_age: 30d
    so-logs-soc:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -3113,6 +3200,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_application:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3162,6 +3251,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_auth:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3211,6 +3302,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_security:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3260,6 +3353,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_syslog:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3309,6 +3404,8 @@ elasticsearch:
            min_age: 30d
    so-logs-system_x_system:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - event-mappings
@@ -3358,6 +3455,8 @@ elasticsearch:
            min_age: 30d
    so-logs-windows_x_forwarded:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-windows.forwarded@package
@@ -3405,6 +3504,8 @@ elasticsearch:
            min_age: 30d
    so-logs-windows_x_powershell:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-windows.powershell@package
@@ -3452,6 +3553,8 @@ elasticsearch:
            min_age: 30d
    so-logs-windows_x_powershell_operational:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-windows.powershell_operational@package
@@ -3499,6 +3602,8 @@ elasticsearch:
            min_age: 30d
    so-logs-windows_x_sysmon_operational:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-windows.sysmon_operational@package
@@ -3546,6 +3651,8 @@ elasticsearch:
            min_age: 30d
    so-logs-winlog_x_winlog:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - logs-winlog.winlog@package
@@ -3594,6 +3701,8 @@ elasticsearch:
            min_age: 30d
    so-logstash:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -3709,6 +3818,8 @@ elasticsearch:
            min_age: 30d
    so-metrics-endpoint_x_metadata:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics-endpoint.metadata@package
@@ -3756,6 +3867,8 @@ elasticsearch:
            min_age: 30d
    so-metrics-endpoint_x_metrics:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics-endpoint.metrics@package
@@ -3803,6 +3916,8 @@ elasticsearch:
            min_age: 30d
    so-metrics-endpoint_x_policy:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics-endpoint.policy@package
@@ -3850,6 +3965,8 @@ elasticsearch:
            min_age: 30d
    so-metrics-fleet_server_x_agent_status:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics@tsdb-settings
@@ -3874,6 +3991,8 @@ elasticsearch:
              number_of_replicas: 0
    so-metrics-fleet_server_x_agent_versions:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - metrics@tsdb-settings
@@ -3898,6 +4017,8 @@ elasticsearch:
              number_of_replicas: 0
    so-redis:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -4013,6 +4134,8 @@ elasticsearch:
            min_age: 30d
    so-strelka:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -4130,6 +4253,8 @@ elasticsearch:
            min_age: 30d
    so-suricata:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -4246,6 +4371,8 @@ elasticsearch:
            min_age: 30d
    so-suricata_x_alerts:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -4362,6 +4489,8 @@ elasticsearch:
            min_age: 30d
    so-syslog:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -4478,6 +4607,8 @@ elasticsearch:
            min_age: 30d
    so-zeek:
      index_sorting: false
+      data_stream_lifecycle:
+        data_retention: 90d
      index_template:
        composed_of:
        - agent-mappings
@@ -24,6 +24,7 @@ include:
 so-elasticsearch:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elasticsearch:{{ ELASTICSEARCHMERGED.version }}
+    - restart_policy: unless-stopped
    - hostname: elasticsearch
    - name: so-elasticsearch
    - user: elasticsearch
@@ -63,7 +63,8 @@
    { "set":             { "if": "ctx.event?.dataset != null && !ctx.event.dataset.contains('.')", "field": "event.dataset", "value": "{{event.module}}.{{event.dataset}}" } },
    { "split":           { "if": "ctx.event?.dataset != null && ctx.event.dataset.contains('.')", "field": "event.dataset", "separator": "\\.", "target_field": "dataset_tag_temp" } },
    { "append":          { "if": "ctx.dataset_tag_temp != null", "field": "tags", "value": "{{dataset_tag_temp.1}}"  } },
-    { "grok":            { "if": "ctx.http?.response?.status_code != null", "field": "http.response.status_code", "patterns": ["%{NUMBER:http.response.status_code:long} %{GREEDYDATA}"]} },
+    { "grok":            { "if": "ctx.http?.response?.status_code instanceof String", "field": "http.response.status_code", "patterns": ["%{NUMBER:http.response.status_code:long}(?:\\s+%{GREEDYDATA})?"], "ignore_failure": true } },
+    { "convert":         { "if": "ctx.http?.response?.status_code != null && !(ctx.http.response.status_code instanceof Number)", "field": "http.response.status_code", "type": "long", "ignore_failure": true } },
    { "set":             { "if": "ctx?.metadata?.kafka != null" , "field": "kafka.id", "value": "{{metadata.kafka.partition}}{{metadata.kafka.offset}}{{metadata.kafka.timestamp}}", "ignore_failure": true } },
    { "remove":          { "field": [ "message2", "type", "fields", "category", "module", "dataset", "dataset_tag_temp", "event.dataset_temp" ], "ignore_missing": true, "ignore_failure": true } },
    { "pipeline": { "name": "global@custom", "ignore_missing_pipeline": true, "description": "[Fleet] Global pipeline for all data streams" } }
@@ -177,12 +177,84 @@
                "description": "Extract IPs from Elastic Agent events (host.ip) and adds them to related.ip"
            }
        },
+        {
+            "script": {
+                "description": "Snapshot event.ingested into _tmp.event_ingested_pre_fleet before .fleet_final_pipeline-1 overwrites it with ES ingest time",
+                "lang": "painless",
+                "if": "ctx.event?.ingested != null && ctx.event?.created == null",
+                "ignore_failure": true,
+                "source": "ctx.putIfAbsent('_tmp', [:]); ctx._tmp.event_ingested_pre_fleet = ctx.event.ingested;"
+            }
+        },
        {
            "pipeline": {
                "name": ".fleet_final_pipeline-1",
                "ignore_missing_pipeline": true
            }
        },
+        {
+            "script": {
+                "description": "Calculate time from Elastic Agent to Logstash.",
+                "lang": "painless",
+                "if": "ctx._tmp?.logstash_from_agent != null",
+                "ignore_failure": true,
+                "source": "ZonedDateTime start = ctx._tmp.event_ingested_pre_fleet != null ? ZonedDateTime.parse(ctx._tmp.event_ingested_pre_fleet) : ZonedDateTime.parse(ctx['@timestamp']); ctx.event.putIfAbsent('ingestion', [:]); ctx.event.ingestion.latency_elasticagent_to_logstash = ChronoUnit.SECONDS.between(start, ZonedDateTime.parse(ctx._tmp.logstash_from_agent));"
+            }
+        },
+        {
+            "script": {
+                "description": "Calculate time from Logstash to Redis",
+                "lang": "painless",
+                "if": "ctx._tmp?.logstash_from_agent != null && ctx._tmp?.logstash_to_redis != null",
+                "ignore_failure": true,
+                "source": "ctx.event.putIfAbsent('ingestion', [:]); ctx.event.ingestion.latency_logstash_to_redis = ChronoUnit.SECONDS.between(ZonedDateTime.parse(ctx._tmp.logstash_from_agent), ZonedDateTime.parse(ctx._tmp.logstash_to_redis));"
+            }
+        },
+        {
+            "script": {
+                "description": "Calculate time message spends in redis queue (logstash delay in pulling event).",
+                "lang": "painless",
+                "if": "ctx._tmp?.logstash_to_redis != null && ctx._tmp?.logstash_from_redis != null",
+                "ignore_failure": true,
+                "source": "ctx.event.putIfAbsent('ingestion', [:]); ctx.event.ingestion.latency_redis_to_logstash = ChronoUnit.SECONDS.between(ZonedDateTime.parse(ctx._tmp.logstash_to_redis), ZonedDateTime.parse(ctx._tmp.logstash_from_redis));"
+            }
+        },
+        {
+            "script": {
+                "description": "Calculate time from Logstash to Elasticsearch (after read from Redis).",
+                "lang": "painless",
+                "if": "ctx._tmp?.logstash_from_redis != null",
+                "ignore_failure": true,
+                "source": "ctx.event.putIfAbsent('ingestion', [:]); ctx.event.ingestion.latency_logstash_to_elasticsearch = ChronoUnit.SECONDS.between(ZonedDateTime.parse(ctx._tmp.logstash_from_redis), metadata().now);"
+            }
+        },
+        {
+            "script": {
+                "description": "Calculate time from Elastic Agent to Kafka.",
+                "lang": "painless",
+                "if": "ctx._tmp?.logstash_from_kafka != null && ctx._tmp?.logstash_from_agent == null",
+                "ignore_failure": true,
+                "source": "ZonedDateTime start = ctx._tmp.event_ingested_pre_fleet != null ? ZonedDateTime.parse(ctx._tmp.event_ingested_pre_fleet) : ZonedDateTime.parse(ctx['@timestamp']); ctx.event.putIfAbsent('ingestion', [:]); ctx.event.ingestion.latency_elasticagent_to_kafka = ChronoUnit.SECONDS.between(start, ZonedDateTime.parse(ctx._tmp.logstash_from_kafka));"
+            }
+        },
+        {
+            "script": {
+                "description": "Calculate time message spends in Kafka queue (logstash delay in pulling event).",
+                "lang": "painless",
+                "if": "ctx._tmp?.logstash_from_kafka != null && ctx.metadata?.kafka?.timestamp != null && ctx._tmp?.logstash_from_agent == null",
+                "ignore_failure": true,
+                "source": "ctx.event.putIfAbsent('ingestion', [:]); ctx.event.ingestion.latency_kafka_queue = ChronoUnit.SECONDS.between(ZonedDateTime.ofInstant(Instant.ofEpochMilli(Long.parseLong(ctx.metadata.kafka.timestamp.toString())), ZoneId.of('UTC')), ZonedDateTime.parse(ctx._tmp.logstash_from_kafka));"
+            }
+        },
+        {
+            "script": {
+                "description": "Calculate time from Logstash to Elasticsearch (after read from Kafka).",
+                "lang": "painless",
+                "if": "ctx._tmp?.logstash_from_kafka != null && ctx._tmp?.logstash_from_agent == null",
+                "ignore_failure": true,
+                "source": "ctx.event.putIfAbsent('ingestion', [:]); ctx.event.ingestion.latency_kafka_to_elasticsearch = ChronoUnit.SECONDS.between(ZonedDateTime.parse(ctx._tmp.logstash_from_kafka), metadata().now);"
+            }
+        },
        {
            "remove": {
                "field": "event.agent_id_status",
@@ -202,11 +274,12 @@
                    "event.dataset_temp",
                    "dataset_tag_temp",
                    "module_temp",
-                    "datastream_dataset_temp"
+                    "datastream_dataset_temp",
+                    "_tmp"
                ],
                "ignore_missing": true,
                "ignore_failure": true
            }
        }
    ]
-}
+}
@@ -0,0 +1,71 @@
+{
+    "description": "zeek.ja4d",
+    "processors": [
+        {
+            "set": {
+                "field": "event.dataset",
+                "value": "ja4d"
+            }
+        },
+        {
+            "remove": {
+                "field": [
+                    "host"
+                ],
+                "ignore_failure": true
+            }
+        },
+        {
+            "json": {
+                "field": "message",
+                "target_field": "message2",
+                "ignore_failure": true
+            }
+        },
+        {
+            "rename": {
+                "field": "message2.ja4d",
+                "target_field": "hash.ja4d",
+                "ignore_missing": true,
+                "if": "ctx?.message2?.ja4d != null && ctx.message2.ja4d.length() > 0"
+            }
+        },
+        {
+            "rename": {
+                "field": "message2.client_mac",
+                "target_field": "host.mac",
+                "ignore_missing": true,
+                "if": "ctx?.message2?.client_mac != null && ctx.message2.client_mac.length() > 0"
+            }
+        },
+        {
+            "rename": {
+                "field": "message2.hostname",
+                "target_field": "host.hostname",
+                "ignore_missing": true,
+                "if": "ctx?.message2?.hostname != null && ctx.message2.hostname.length() > 0"
+            }
+        },
+        {
+            "rename": {
+                "field": "message2.requested_ip",
+                "target_field": "dhcp.requested_address",
+                "ignore_missing": true,
+                "if": "ctx?.message2?.requested_ip != null && ctx.message2.requested_ip.length() > 0"
+            }
+        },
+        {
+            "rename": {
+                "field": "message2.vendor_class_id",
+                "target_field": "zeek.ja4d.vendor_class_id",
+                "ignore_missing": true,
+                "if": "ctx?.message2?.vendor_class_id != null && ctx.message2.vendor_class_id.length() > 0"
+            }
+        },
+        {
+            "pipeline": {
+                "name": "zeek.common"
+            }
+        }
+    ]
+}
@@ -4,6 +4,13 @@ elasticsearch:
    forcedType: bool
    advanced: True
    helpLink: elasticsearch
+  data_retention_method:
+    description: Method for data retention. Options are ILM (index lifecycle management) or DLM (data stream lifecycle management). For single-node deployments and most distributed grids, DLM is the recommended option because it simplifies management. Deployments with more complex requirements may prefer ILM, which allows more granular control but requires more management overhead.
+    options:
+    - ILM
+    - DLM
+    forcedType: string
+    global: True
  version:
    description: "This specifies the version of the following containers: so-elastic-fleet-package-registry, so-elastic-agent, so-elastic-fleet, so-kibana, so-logstash and so-elasticsearch. Modifying this value in the Elasticsearch defaults.yaml will result in catastrophic grid failure."
    readonly: True
@@ -13,7 +20,7 @@ elasticsearch:
    description: Specify the memory heap size in (m)egabytes for Elasticsearch.
    helpLink: elasticsearch
  index_clean:
-    description: Determines if indices should be considered for deletion by available disk space in the cluster. Otherwise, indices will only be deleted by the age defined in the ILM settings. This setting only applies to EVAL, STANDALONE, and HEAVY NODE installations. Other installations can only use ILM settings.
+    description: Determines if indices should be considered for deletion by available disk space in the cluster. Otherwise, data is retained by the configured lifecycle settings. This setting only applies to EVAL, STANDALONE, and HEAVY NODE installations. Other installations use lifecycle settings only.
    forcedType: bool
    helpLink: elasticsearch
  vm:
@@ -139,6 +146,23 @@ elasticsearch:
    custom010: *pipelines
  index_settings:
    global_overrides:
+      data_stream_lifecycle:
+        data_retention:
+          description: |
+            The retention period for all data streams. Retention does not define when data will be removed; it defines the minimum period for which data will be kept.
+
+            Use a number followed by a time unit, such as 7d. Leave blank for indefinite retention where supported.
+
+            Configured retention period also affects the frequency of rolling over data streams.
+              - If retention is less than or equal to 1 day, max_age will be 1 hour
+              - If retention is less than or equal to 14 days, max_age will be 1 day
+              - If retention is less than or equal to 90 days, max_age will be 7 days
+              - If retention is greater than 90 days, max_age will be 30 days
+          forcedType: string
+          allowedNodeTypes:
+            - heavynode
+          regex: ^$|^[0-9]{1,5}(?:d|h|m|s)$
+          regexFailureMessage: Must be blank or a number followed by d, h, m, or s, such as 7d.
      index_template:
        template:
          settings:
@@ -311,13 +335,30 @@ elasticsearch:
              forcedType: string
              global: True
              helpLink: elasticsearch
-    so-logs: &indexSettings
+    so-logs: &dataStreamSettings
      index_sorting:
        description: Sorts the index by event time, at the cost of additional processing resource consumption.
        forcedType: bool
        global: True
        advanced: True
        helpLink: elasticsearch
+      data_stream_lifecycle:
+        data_retention:
+          description: |
+            The retention period for this data stream. Retention does not define when data will be removed; it defines the minimum period for which data will be kept.
+
+            Use a number followed by a time unit, such as 7d. Leave blank for indefinite retention where supported.
+
+            Configured retention period also affects the frequency of rolling over this data stream.
+              - If retention is less than or equal to 1 day, max_age will be 1 hour
+              - If retention is less than or equal to 14 days, max_age will be 1 day
+              - If retention is less than or equal to 90 days, max_age will be 7 days
+              - If retention is greater than 90 days, max_age will be 30 days
+          forcedType: string
+          allowedNodeTypes:
+            - heavynode
+          regex: ^$|^[0-9]{1,5}(?:d|h|m|s)$
+          regexFailureMessage: Must be blank or a number followed by d, h, m, or s, such as 7d.
      index_template:
        index_patterns:
          description: Patterns for matching multiple indices or tables.
@@ -335,6 +376,14 @@ elasticsearch:
                global: True
                advanced: True
                helpLink: elasticsearch
+              auto_expand_replicas:
+                description: Automatically expand the number of replicas based on the number of data nodes in the cluster. This can help ensure high availability as the cluster scales up or down.
+                forcedType: string
+                regex: "^(0-[1-9]|1-[2-9]|2-[3-9]|3-[4-9]|4-[5-9]|5-[6-9]|6-[7-9]|7-[89]|8-9|[0-9]-all|false)$"
+                regexFailureMessage: Must be in the format "x-y", where x is the minimum and y is the maximum number of replicas (single digits, with y greater than x), or "x-all" to specify a minimum of x and no maximum, or "false" to disable automatic replica expansion.
+                global: True
+                advanced: True
+                helpLink: elasticsearch
              mapping:
                total_fields:
                  limit:
@@ -596,65 +645,349 @@ elasticsearch:
            global: True
            advanced: True
            helpLink: elasticsearch
-    so-logs-system_x_auth: *indexSettings
-    so-logs-system_x_syslog: *indexSettings
-    so-logs-system_x_system: *indexSettings
-    so-logs-system_x_application: *indexSettings
-    so-logs-system_x_security: *indexSettings
-    so-logs-windows_x_forwarded: *indexSettings
-    so-logs-windows_x_powershell: *indexSettings
-    so-logs-windows_x_powershell_operational: *indexSettings
-    so-logs-windows_x_sysmon_operational: *indexSettings
-    so-logs-winlog_x_winlog: *indexSettings
-    so-logs-detections_x_alerts: *indexSettings
-    so-logs-http_endpoint_x_generic: *indexSettings
-    so-logs-httpjson_x_generic: *indexSettings
-    so-logs-osquery-manager-actions: *indexSettings
-    so-logs-osquery-manager-action_x_responses: *indexSettings
-    so-logs-osquery-manager_x_action_x_responses: *indexSettings
-    so-logs-osquery-manager_x_result: *indexSettings
-    so-logs-elastic_agent_x_apm_server: *indexSettings
-    so-logs-elastic_agent_x_auditbeat: *indexSettings
-    so-logs-elastic_agent_x_cloudbeat: *indexSettings
-    so-logs-elastic_agent_x_endpoint_security: *indexSettings
-    so-logs-endpoint_x_alerts: *indexSettings
-    so-logs-endpoint_x_events_x_api: *indexSettings
-    so-logs-endpoint_x_events_x_file: *indexSettings
-    so-logs-endpoint_x_events_x_library: *indexSettings
-    so-logs-endpoint_x_events_x_network: *indexSettings
-    so-logs-endpoint_x_events_x_process: *indexSettings
-    so-logs-endpoint_x_events_x_registry: *indexSettings
-    so-logs-endpoint_x_events_x_security: *indexSettings
-    so-logs-elastic_agent_x_filebeat: *indexSettings
-    so-logs-elastic_agent_x_fleet_server: *indexSettings
-    so-logs-elastic_agent_x_heartbeat: *indexSettings
-    so-logs-elastic_agent: *indexSettings
-    so-logs-elastic_agent_x_metricbeat: *indexSettings
-    so-logs-elastic_agent_x_osquerybeat: *indexSettings
-    so-logs-elastic_agent_x_packetbeat: *indexSettings
-    so-logs-elasticsearch_x_server: *indexSettings
-    so-metrics-endpoint_x_metadata: *indexSettings
-    so-metrics-endpoint_x_metrics: *indexSettings
-    so-metrics-endpoint_x_policy: *indexSettings
-    so-metrics-nginx_x_stubstatus: *indexSettings
-    so-metrics-vsphere_x_datastore: *indexSettings
-    so-metrics-vsphere_x_host: *indexSettings
-    so-metrics-vsphere_x_virtualmachine: *indexSettings
-    so-case: *indexSettings
-    so-common: *indexSettings
-    so-endgame: *indexSettings
-    so-idh: *indexSettings
-    so-suricata: *indexSettings
-    so-suricata_x_alerts: *indexSettings
-    so-import: *indexSettings
-    so-kratos: *indexSettings
-    so-hydra: *indexSettings
-    so-kismet: *indexSettings
-    so-logstash: *indexSettings
-    so-redis: *indexSettings
-    so-strelka: *indexSettings
-    so-syslog: *indexSettings
-    so-zeek: *indexSettings
+    so-logs-system_x_auth: *dataStreamSettings
+    so-logs-system_x_syslog: *dataStreamSettings
+    so-logs-system_x_system: *dataStreamSettings
+    so-logs-system_x_application: *dataStreamSettings
+    so-logs-system_x_security: *dataStreamSettings
+    so-logs-windows_x_forwarded: *dataStreamSettings
+    so-logs-windows_x_powershell: *dataStreamSettings
+    so-logs-windows_x_powershell_operational: *dataStreamSettings
+    so-logs-windows_x_sysmon_operational: *dataStreamSettings
+    so-logs-winlog_x_winlog: *dataStreamSettings
+    so-logs-detections_x_alerts: *dataStreamSettings
+    so-logs-http_endpoint_x_generic: *dataStreamSettings
+    so-logs-httpjson_x_generic: *dataStreamSettings
+    so-logs-osquery-manager-actions: *dataStreamSettings
+    so-logs-osquery-manager-action_x_responses: *dataStreamSettings
+    so-logs-osquery-manager_x_action_x_responses: *dataStreamSettings
+    so-logs-osquery-manager_x_result: *dataStreamSettings
+    so-logs-elastic_agent_x_apm_server: *dataStreamSettings
+    so-logs-elastic_agent_x_auditbeat: *dataStreamSettings
+    so-logs-elastic_agent_x_cloudbeat: *dataStreamSettings
+    so-logs-elastic_agent_x_endpoint_security: *dataStreamSettings
+    so-logs-endpoint_x_alerts: *dataStreamSettings
+    so-logs-endpoint_x_events_x_api: *dataStreamSettings
+    so-logs-endpoint_x_events_x_file: *dataStreamSettings
+    so-logs-endpoint_x_events_x_library: *dataStreamSettings
+    so-logs-endpoint_x_events_x_network: *dataStreamSettings
+    so-logs-endpoint_x_events_x_process: *dataStreamSettings
+    so-logs-endpoint_x_events_x_registry: *dataStreamSettings
+    so-logs-endpoint_x_events_x_security: *dataStreamSettings
+    so-logs-elastic_agent_x_filebeat: *dataStreamSettings
+    so-logs-elastic_agent_x_fleet_server: *dataStreamSettings
+    so-logs-elastic_agent_x_heartbeat: *dataStreamSettings
+    so-logs-elastic_agent: *dataStreamSettings
+    so-logs-elastic_agent_x_metricbeat: *dataStreamSettings
+    so-logs-elastic_agent_x_osquerybeat: *dataStreamSettings
+    so-logs-elastic_agent_x_packetbeat: *dataStreamSettings
+    so-logs-elasticsearch_x_server: *dataStreamSettings
+    so-metrics-endpoint_x_metadata: *dataStreamSettings
+    so-metrics-endpoint_x_metrics: *dataStreamSettings
+    so-metrics-endpoint_x_policy: *dataStreamSettings
+    so-metrics-nginx_x_stubstatus: *dataStreamSettings
+    so-metrics-vsphere_x_datastore: *dataStreamSettings
+    so-metrics-vsphere_x_host: *dataStreamSettings
+    so-metrics-vsphere_x_virtualmachine: *dataStreamSettings
+    so-common: *dataStreamSettings
+    so-endgame: *dataStreamSettings
+    so-idh: *dataStreamSettings
+    so-suricata: *dataStreamSettings
+    so-suricata_x_alerts: *dataStreamSettings
+    so-import: *dataStreamSettings
+    so-kratos: *dataStreamSettings
+    so-hydra: *dataStreamSettings
+    so-kismet: *dataStreamSettings
+    so-logstash: *dataStreamSettings
+    so-redis: *dataStreamSettings
+    so-strelka: *dataStreamSettings
+    so-syslog: *dataStreamSettings
+    so-zeek: *dataStreamSettings
+    # Managed SOC integration annotations are inserted below this line. Referencing '*dataStreamSettings'
+    so-case: &indexSettings
+      index_sorting:
+        description: Sorts the index by event time, at the cost of additional processing resource consumption.
+        forcedType: bool
+        global: True
+        advanced: True
+        helpLink: elasticsearch
+      index_template:
+        index_patterns:
+          description: Patterns for matching multiple indices or tables.
+          forcedType: "[]string"
+          multiline: True
+          global: True
+          advanced: True
+          helpLink: elasticsearch
+        template:
+          settings:
+            index:
+              number_of_replicas:
+                description: Number of replicas required for this index. Multiple replicas protects against data loss, but also increases storage costs.
+                forcedType: int
+                global: True
+                advanced: True
+                helpLink: elasticsearch
+              auto_expand_replicas:
+                description: Automatically expand the number of replicas based on the number of data nodes in the cluster. This can help ensure high availability as the cluster scales up or down.
+                forcedType: string
+                regex: "^(0-[1-9]|1-[2-9]|2-[3-9]|3-[4-9]|4-[5-9]|5-[6-9]|6-[7-9]|7-[89]|8-9|[0-9]-all|false)$"
+                regexFailureMessage: Must be in the format "x-y", where x is the minimum and y is the maximum number of replicas (single digits, with y greater than x), or "x-all" to specify a minimum of x and no maximum, or "false" to disable automatic replica expansion.
+                global: True
+                advanced: True
+                helpLink: elasticsearch
+              mapping:
+                total_fields:
+                  limit:
+                    description: Max number of fields that can exist on a single index. Larger values will consume more resources.
+                    global: True
+                    advanced: True
+                    helpLink: elasticsearch
+              refresh_interval:
+                description: Seconds between index refreshes. Shorter intervals can cause query performance to suffer since this is a synchronous and resource-intensive operation.
+                global: True
+                advanced: True
+                helpLink: elasticsearch
+              number_of_shards:
+                description: Number of shards required for this index. Using multiple shards increases fault tolerance, but also increases storage and network costs.
+                global: True
+                advanced: True
+                helpLink: elasticsearch
+              sort:
+                field:
+                  description: The field to sort by. Must set index_sorting to True.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+                order:
+                  description: The order to sort by. Must set index_sorting to True.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+          mappings:
+            _meta:
+              package:
+                name:
+                  description: Meta settings for the mapping.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              managed_by:
+                  description: Meta settings for the mapping.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              managed:
+                  description: Meta settings for the mapping.
+                  forcedType: bool
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+        composed_of:
+          description: The index template is composed of these component templates.
+          forcedType: "[]string"
+          global: True
+          advanced: True
+          helpLink: elasticsearch
+        priority:
+          description: The priority of the index template.
+          forcedType: int
+          global: True
+          advanced: True
+          helpLink: elasticsearch
+      policy:
+        phases:
+          hot:
+            min_age:
+              description: Minimum age of index. This determines when the index should be moved to the hot tier.
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+            actions:
+              set_priority:
+                priority:
+                  description: Priority of index. This is used for recovery after a node restart. Indices with higher priorities are recovered before indices with lower priorities.
+                  forcedType: int
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              rollover:
+                max_age:
+                  description: Maximum age of index.  Once an index reaches this limit, it will be rolled over into a new index.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+                max_primary_shard_size:
+                  description: Maximum primary shard size. Once an index reaches this limit, it will be rolled over into a new index.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              shrink:
+                method:
+                  description: Shrink the index to a new index with fewer primary shards. Shrink operation is by count or size.
+                  options:
+                  - COUNT
+                  - SIZE
+                  global: True
+                  advanced: True
+                  forcedType: string
+                number_of_shards:
+                  title: shard count
+                  description: Desired shard count. Note that this value is only used when the shrink method selected is 'COUNT'.
+                  global: True
+                  forcedType: int
+                  advanced: True
+                max_primary_shard_size:
+                  title: max shard size
+                  description: Desired shard size in gb/tb/pb eg. 100gb. Note that this value is only used when the shrink method selected is 'SIZE'.
+                  regex: ^[0-9]+(?:gb|tb|pb)$
+                  global: True
+                  forcedType: string
+                  advanced: True
+                allow_write_after_shrink:
+                  description: Allow writes after shrink.
+                  global: True
+                  forcedType: bool
+                  default: False
+                  advanced: True
+              forcemerge:
+                max_num_segments:
+                  description: Reduce the number of segments in each index shard and clean up deleted documents.
+                  global: True
+                  forcedType: int
+                  advanced: True
+                index_codec:
+                  title: compression
+                  description: Use higher compression for stored fields at the cost of slower performance.
+                  forcedType: bool
+                  global: True
+                  default: False
+                  advanced: True
+          warm:
+            min_age:
+              description: Minimum age of index. ex. 30d - This determines when the index should be moved to the warm tier. Nodes in the warm tier generally don’t need to be as fast as those in the hot tier. It’s important to note that this is calculated relative to the rollover date (NOT the original creation date of the index). For example, if you have an index that is set to rollover after 30 days and warm min_age set to 30 then there will be 30 days from index creation to rollover and then an additional 30 days before moving to warm tier.
+              regex: ^[0-9]{1,5}d$
+              forcedType: string
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+            actions:
+              set_priority:
+                priority:
+                  description: Priority of index. This is used for recovery after a node restart. Indices with higher priorities are recovered before indices with lower priorities.
+                  forcedType: int
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              rollover:
+                max_age:
+                  description: Maximum age of index.  Once an index reaches this limit, it will be rolled over into a new index.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+                max_primary_shard_size:
+                  description: Maximum primary shard size. Once an index reaches this limit, it will be rolled over into a new index.
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              shrink:
+                method:
+                  description: Shrink the index to a new index with fewer primary shards. Shrink operation is by count or size.
+                  options:
+                  - COUNT
+                  - SIZE
+                  global: True
+                  advanced: True
+                number_of_shards:
+                  title: shard count
+                  description: Desired shard count. Note that this value is only used when the shrink method selected is 'COUNT'.
+                  global: True
+                  forcedType: int
+                  advanced: True
+                max_primary_shard_size:
+                  title: max shard size
+                  description: Desired shard size in gb/tb/pb eg. 100gb. Note that this value is only used when the shrink method selected is 'SIZE'.
+                  regex: ^[0-9]+(?:gb|tb|pb)$
+                  global: True
+                  forcedType: string
+                  advanced: True
+                allow_write_after_shrink:
+                  description: Allow writes after shrink.
+                  global: True
+                  forcedType: bool
+                  default: False
+                  advanced: True
+              forcemerge:
+                max_num_segments:
+                  description: Reduce the number of segments in each index shard and clean up deleted documents.
+                  global: True
+                  forcedType: int
+                  advanced: True
+                index_codec:
+                  title: compression
+                  description: Use higher compression for stored fields at the cost of slower performance.
+                  forcedType: bool
+                  global: True
+                  default: False
+                  advanced: True
+              allocate:
+                number_of_replicas:
+                  description: Set the number of replicas. Remains the same as the previous phase by default.
+                  forcedType: int
+                  global: True
+                  advanced: True
+          cold:
+            min_age:
+              description: Minimum age of index. ex. 60d - This determines when the index should be moved to the cold tier.  While still searchable, this tier is typically optimized for lower storage costs rather than search speed. It’s important to note that this is calculated relative to the rollover date (NOT the original creation date of the index). For example, if you have an index that is set to rollover after 30 days and cold min_age set to 60 then there will be 30 days from index creation to rollover and then an additional 60 days before moving to cold tier.
+              regex: ^[0-9]{1,5}d$
+              forcedType: string
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+            actions:
+              set_priority:
+                priority:
+                  description: Used for index recovery after a node restart. Indices with higher priorities are recovered before indices with lower priorities.
+                  forcedType: int
+                  global: True
+                  advanced: True
+                  helpLink: elasticsearch
+              allocate:
+                number_of_replicas:
+                  description: Set the number of replicas. Remains the same as the previous phase by default.
+                  forcedType: int
+                  global: True
+                  advanced: True
+          delete:
+            min_age:
+              description: Minimum age of index. ex. 90d - This determines when the index should be deleted. It’s important to note that this is calculated relative to the rollover date (NOT the original creation date of the index). For example, if you have an index that is set to rollover after 30 days and delete min_age set to 90 then there will be 30 days from index creation to rollover and then an additional 90 days before deletion.
+              regex: ^[0-9]{1,5}d$
+              forcedType: string
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+        _meta:
+          package:
+            name:
+              description: Meta settings for the policy.
+              global: True
+              advanced: True
+              helpLink: elasticsearch
+          managed_by:
+            description: Meta settings for the policy.
+            global: True
+            advanced: True
+            helpLink: elasticsearch
+          managed:
+            description: Meta settings for the policy.
+            forcedType: bool
+            global: True
+            advanced: True
+            helpLink: elasticsearch
+    sos-backup: *indexSettings
+    so-detection: *indexSettings
+    so-assistant-chat: *indexSettings
+    so-assistant-session: *indexSettings
    so-metrics-fleet_server_x_agent_status: &fleetMetricsSettings
      index_sorting:
        description: Sorts the index by event time, at the cost of additional processing resource consumption.
@@ -4,7 +4,11 @@
   Elastic License 2.0. #}

 {% import_yaml 'elasticsearch/defaults.yaml' as ELASTICSEARCHDEFAULTS %}
+{# ELASTICSEARCHMERGED only used here to collect data_retention_method. This file intentionally works with ELASTICSEARCHDEFAULTS #}
+{% from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
+
 {% set DEFAULT_GLOBAL_OVERRIDES = ELASTICSEARCHDEFAULTS.elasticsearch.index_settings.pop('global_overrides') %}
+{% set DATA_RETENTION_METHOD = ELASTICSEARCHMERGED.data_retention_method %}

 {% set PILLAR_GLOBAL_OVERRIDES = {} %}
 {% set ES_INDEX_PILLAR = salt['pillar.get']('elasticsearch:index_settings', {}) %}
@@ -61,15 +65,25 @@
 {% if ALL_ADDON_SETTINGS_ORIG.keys() | length > 0 %}
 {%   for index in ALL_ADDON_SETTINGS_ORIG.keys() %}
 {%     do ALL_ADDON_SETTINGS_GLOBAL_OVERRIDES.update({index: salt['defaults.merge'](ALL_ADDON_SETTINGS_ORIG[index], PILLAR_GLOBAL_OVERRIDES, in_place=False)}) %}
+{#     Explicitly excluding addon indices from ES_INDEX_SETTINGS_ORIG
+         When manager.soc_managed_annotations runs, new entries are added to the salt/elasticsearch/defaults.yaml file to support 'revert to default' functionality.
+         Subsequent map renders will then incorrectly include 'integration X' in 'ES_INDEX_SETTINGS_ORIG' due to being in the defaults.yaml file. #}
+{%     if index in ES_INDEX_SETTINGS_ORIG.keys() %}
+{%       do ES_INDEX_SETTINGS_ORIG.pop(index) %}
+{%     endif %}
 {%   endfor %}
 {% endif %}

 {% set ES_INDEX_SETTINGS = {} %}
-{% macro create_final_index_template(DEFINED_SETTINGS, GLOBAL_OVERRIDES, FINAL_INDEX_SETTINGS) %}
+{% macro create_final_index_template(DEFINED_SETTINGS, GLOBAL_OVERRIDES, FINAL_INDEX_SETTINGS, EXCLUDE_INDICES=[]) %}

 {% do GLOBAL_OVERRIDES.update(salt['defaults.merge'](GLOBAL_OVERRIDES, ES_INDEX_PILLAR, in_place=False)) %}
 {% for index, settings in GLOBAL_OVERRIDES.items() %}

+{%   if index in EXCLUDE_INDICES %}
+{%     continue %}
+{%   endif %}
+
 {#   prevent this action from being performed on custom defined indices. #}
 {#   the custom defined index is not present in either of the dictionaries and fails to render. #}
 {%   if index in DEFINED_SETTINGS and index in GLOBAL_OVERRIDES %}
@@ -95,6 +109,17 @@
 {%     if not settings.get('index_sorting', False) | to_bool and settings.index_template.template.settings.index.sort is defined %}
 {%       do settings.index_template.template.settings.index.pop('sort') %}
 {%     endif %}
+{%     if DATA_RETENTION_METHOD == 'DLM' and settings.index_template.data_stream is defined and settings.data_stream_lifecycle is defined %}
+{%       if settings.data_stream_lifecycle.data_retention is defined and settings.data_stream_lifecycle.data_retention %}
+{%         do settings.index_template.template.update({'lifecycle': {'data_retention': settings.data_stream_lifecycle.data_retention}}) %}
+{%       else %}
+{%         do settings.index_template.template.update({'lifecycle': {}}) %}
+{%       endif %}
+{%       if settings.index_template.template.settings.index.lifecycle is not defined %}
+{%         do settings.index_template.template.settings.index.update({'lifecycle': {}}) %}
+{%       endif %}
+{%       do settings.index_template.template.settings.index.lifecycle.update({'prefer_ilm': false}) %}
+{%     endif %}
 {%   endif %}

 {# advanced ilm actions #}
@@ -150,10 +175,19 @@
 {% endfor %}
 {% endmacro %}

-{{ create_final_index_template(ES_INDEX_SETTINGS_ORIG, ES_INDEX_SETTINGS_GLOBAL_OVERRIDES, ES_INDEX_SETTINGS) }}
-{{ create_final_index_template(ALL_ADDON_SETTINGS_ORIG, ALL_ADDON_SETTINGS_GLOBAL_OVERRIDES, ALL_ADDON_SETTINGS) }}
+{# Exclude addon integrations from final ES_INDEX_SETTINGS #}
+{{ create_final_index_template(ES_INDEX_SETTINGS_ORIG, ES_INDEX_SETTINGS_GLOBAL_OVERRIDES, ES_INDEX_SETTINGS, ALL_ADDON_SETTINGS_ORIG.keys() | list ) }}
+
+{# Exclude SO managed indices, otherwise ALL_ADDON_SETTINGS will include pillar values
+  of core integrations without merging defaults, resulting in an overlapping, but bad index template being generated. #}
+{{ create_final_index_template(ALL_ADDON_SETTINGS_ORIG, ALL_ADDON_SETTINGS_GLOBAL_OVERRIDES, ALL_ADDON_SETTINGS, ES_INDEX_SETTINGS_ORIG.keys() | list ) }}

 {% set SO_MANAGED_INDICES = [] %}
 {% for index, settings in ES_INDEX_SETTINGS.items() %}
 {%   do SO_MANAGED_INDICES.append(index) %}
-{% endfor %}
+{% endfor %}
+
+{# Names of every addon integration index present in ALL_ADDON_SETTINGS, in dictionary order. #}
+{% set ADDON_INDICES = ALL_ADDON_SETTINGS.keys() | list %}
@@ -11,10 +11,8 @@ ADDON_STATEFILE_SUCCESS=/opt/so/state/addon_estemplates.txt
 ELASTICSEARCH_TEMPLATES_DIR="/opt/so/conf/elasticsearch/templates"
 SO_TEMPLATES_DIR="${ELASTICSEARCH_TEMPLATES_DIR}/index"
 ADDON_TEMPLATES_DIR="${ELASTICSEARCH_TEMPLATES_DIR}/addon-index"
-SO_LOAD_FAILURES=0
-ADDON_LOAD_FAILURES=0
-SO_LOAD_FAILURES_NAMES=()
-ADDON_LOAD_FAILURES_NAMES=()
+FAILED_NAMES=()
+FAILED_COUNT=0
 IS_HEAVYNODE="false"
 FORCE="false"
 VERBOSE="false"
@@ -46,20 +44,86 @@ while [[ $# -gt 0 ]]; do
    shift
 done

+# Max number of concurrent template PUT jobs. Override via env if needed.
+MAX_TEMPLATE_JOBS=${MAX_TEMPLATE_JOBS:-10}
+
+# Block until fewer than MAX_TEMPLATE_JOBS background jobs are running.
+# A limit that is not a positive integer (0, negative, empty, non-numeric) is
+# treated as 1: with a limit of 0 the loop condition can never become false, and
+# "wait -n" returns immediately once no child jobs remain, so the loop would
+# spin forever instead of throttling.
+template_throttle() {
+    local limit="${MAX_TEMPLATE_JOBS:-10}"
+    [[ "$limit" =~ ^[1-9][0-9]*$ ]] || limit=1
+
+    while (( $(jobs -rp | wc -l) >= limit )); do
+        # The finished job's exit status is not needed here; load failures are
+        # reported through the marker files written by record_failure.
+        wait -n || true
+    done
+}
+
+# Per-job failure markers and an output lock for serializing parallel job output.
+# Each failed load drops one file (named after the template) into FAIL_DIR; the
+# output of each job is flushed as a single block under flock so concurrent jobs
+# never interleave their (chatty) retry output.
+FAIL_DIR=$(mktemp -d)
+OUTPUT_LOCK="${FAIL_DIR}/.output.lock"
+: > "$OUTPUT_LOCK"
+trap 'rm -rf "$FAIL_DIR"' EXIT
+
+# Record a failure: $1 = the template name/path to report later. Slashes are
+# encoded so the path becomes a safe single filename.
+record_failure() {
+    local marker="${1//\//__}"
+    : > "${FAIL_DIR}/fail.${marker}"
+}
+
+# Populate FAILED_NAMES and FAILED_COUNT from the current phase's markers.
+# Must run in the current shell (not a command substitution) so the array sticks.
+collect_failures() {
+    FAILED_NAMES=()
+    FAILED_COUNT=0
+    local f name
+    shopt -s nullglob
+    for f in "${FAIL_DIR}"/fail.*; do
+        name="${f##*/fail.}"
+        name="${name//__//}"
+        FAILED_NAMES+=("$name")
+        FAILED_COUNT=$((FAILED_COUNT + 1))
+    done
+    shopt -u nullglob
+}
+
+# Clear markers and names between phases so SO and addon counts stay independent.
+reset_failures() {
+    shopt -s nullglob
+    rm -f "${FAIL_DIR}"/fail.*
+    shopt -u nullglob
+    FAILED_NAMES=()
+    FAILED_COUNT=0
+}
+
+# Emit one block of text while holding the shared output lock, so text printed
+# by concurrently running background jobs cannot be interleaved with it.
+#   $1 text - the text to print (a trailing newline is appended)
+locked_echo() {
+    local text="$1"
+
+    {
+        flock 9
+        printf '%s\n' "$text"
+    } 9>>"$OUTPUT_LOCK"
+}
+
+# Loads one template file via PUT. Intended to be dispatched as a background job.
+#   $1 uri          - e.g. _component_template/foo or _index_template/foo
+#   $2 file         - path to the template JSON
+#   $3 report_name  - name/path to record if this load fails
+# Returns 0 when the PUT was acknowledged and 1 otherwise. A failure is also
+# recorded through record_failure so the parent shell can tally it after `wait`.
 load_template() {
    local uri="$1"
    local file="$2"
+    local report_name="$3"
+    local out rc=0 block

-    echo "Loading template file $file"
-    if ! output=$(retry 3 3 "so-elasticsearch-query $uri -d@$file -XPUT" "{\"acknowledged\":true}"); then
-        echo "$output"
-
-        return 1
-
+    # Capture everything (including retry's diagnostic chatter) into one block so
+    # concurrent jobs never interleave; the whole block is flushed under one flock.
+    block="Loading template file $file"$'\n'
+    if ! out=$(retry 3 3 "so-elasticsearch-query $uri -d@$file -XPUT" "{\"acknowledged\":true}" 2>&1); then
+        block+="$out"$'\n'
+        rc=1
    elif [[ "$VERBOSE" == "true" ]]; then
-        echo "$output"
+        block+="$out"$'\n'
    fi

+    { flock 9; printf '%s' "$block"; } 9>>"$OUTPUT_LOCK"
+
+    # Explicit if/return: ending the function with `(( rc != 0 )) && ...` would
+    # make it exit non-zero on success and zero on failure.
+    if (( rc != 0 )); then
+        record_failure "$report_name"
+    fi
+
+    return "$rc"
 }

 check_required_component_template_exists() {
@@ -110,6 +174,9 @@ load_component_templates() {
        return
    fi

+    # Dispatch loads as throttled background jobs. The barrier (wait) happens in
+    # the caller after all component groups have been dispatched, since index
+    # templates must not load until every component template is in place.
    for component in "$pattern"/*.json; do
        tmpl_name=$(basename "${component%.json}")

@@ -118,21 +185,11 @@ load_component_templates() {
            tmpl_name="${tmpl_name%-mappings}-mappings"
        fi

-        if ! load_template "_component_template/${tmpl_name}" "$component"; then
-            SO_LOAD_FAILURES=$((SO_LOAD_FAILURES + 1))
-            SO_LOAD_FAILURES_NAMES+=("$component")
-        fi
+        template_throttle
+        load_template "_component_template/${tmpl_name}" "$component" "$component" &
    done
 }

-check_elasticsearch_responsive() {
-    # Cannot load templates if Elasticsearch is not responding.
-    #  NOTE: Slightly faster exit w/ failure than previous "retry 240 1" if there is a problem with Elasticsearch the
-    #    script should exit sooner rather than hang at the 'so-elasticsearch-templates' salt state.
-    retry 3 15 "so-elasticsearch-query / --output /dev/null --fail" ||
-        fail "Elasticsearch is not responding. Please review Elasticsearch logs /opt/so/log/elasticsearch/securityonion.log for more details. Additionally, consider running so-elasticsearch-troubleshoot."
-}
-
 index_templates_exist() {
    local templates_dir="$1"

@@ -180,6 +237,9 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
    load_component_templates "Elastic Agent" "elastic-agent"
    load_component_templates "Security Onion" "so"

+    # Barrier: every component template PUT must complete before we snapshot the
+    # component template list and start loading index templates that depend on them.
+    wait
    component_templates=$(so-elasticsearch-component-templates-list)
    echo -e "Loading Security Onion index templates...\n"
    for so_idx_tmpl in "${SO_TEMPLATES_DIR}"/*.json; do
@@ -189,7 +249,7 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
            # TODO: Better way to load only heavynode specific templates
            if ! check_heavynode_compatiable_index_template "$tmpl_name"; then
                if [[ "$VERBOSE" == "true" ]]; then
-                    echo "Skipping over $so_idx_tmpl, template is not a heavynode specific index template."
+                    locked_echo "Skipping over $so_idx_tmpl, template is not a heavynode specific index template."
                fi

                continue
@@ -197,32 +257,34 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
        fi

        if check_required_component_template_exists "$so_idx_tmpl"; then
-            if ! load_template "_index_template/$tmpl_name" "$so_idx_tmpl"; then
-                SO_LOAD_FAILURES=$((SO_LOAD_FAILURES + 1))
-                SO_LOAD_FAILURES_NAMES+=("$so_idx_tmpl")
-            fi
+            template_throttle
+            load_template "_index_template/$tmpl_name" "$so_idx_tmpl" "$so_idx_tmpl" &
        else
-            echo "Skipping over $so_idx_tmpl due to missing required component template(s)."
-            SO_LOAD_FAILURES=$((SO_LOAD_FAILURES + 1))
-            SO_LOAD_FAILURES_NAMES+=("$so_idx_tmpl")
+            locked_echo "Skipping over $so_idx_tmpl due to missing required component template(s)."
+            record_failure "$so_idx_tmpl"

            continue
        fi
    done

-    if [[ $SO_LOAD_FAILURES -eq 0 ]]; then
+    # Barrier: all SO index template PUTs must finish before tallying failures.
+    wait
+
+    collect_failures
+    if [[ $FAILED_COUNT -eq 0 ]]; then
        echo "All Security Onion core templates loaded successfully."

        touch "$SO_STATEFILE_SUCCESS"
    else
-        echo "Encountered $SO_LOAD_FAILURES failure(s) loading templates:"
-        for failed_template in "${SO_LOAD_FAILURES_NAMES[@]}"; do
+        echo "Encountered $FAILED_COUNT failure(s) loading templates:"
+        for failed_template in "${FAILED_NAMES[@]}"; do
            echo "  - $failed_template"
        done
        if [[ "$SHOULD_EXIT_ON_FAILURE" == "true" ]]; then
            fail "Failed to load all Security Onion core templates successfully."
        fi
    fi
+    reset_failures
 elif ! index_templates_exist "$SO_TEMPLATES_DIR"; then
    echo "No Security Onion core index templates found in ${SO_TEMPLATES_DIR}, skipping."
 elif [[ -f "$SO_STATEFILE_SUCCESS" ]]; then
@@ -241,26 +303,27 @@ if should_load_addon_templates; then
        tmpl_name=$(basename "${addon_idx_tmpl%-template.json}")

        if check_required_component_template_exists "$addon_idx_tmpl"; then
-            if ! load_template "_index_template/${tmpl_name}" "$addon_idx_tmpl"; then
-                ADDON_LOAD_FAILURES=$((ADDON_LOAD_FAILURES + 1))
-                ADDON_LOAD_FAILURES_NAMES+=("$addon_idx_tmpl")
-            fi
+            template_throttle
+            load_template "_index_template/${tmpl_name}" "$addon_idx_tmpl" "$addon_idx_tmpl" &
        else
-            echo "Skipping over $addon_idx_tmpl due to missing required component template(s)."
-            ADDON_LOAD_FAILURES=$((ADDON_LOAD_FAILURES + 1))
-            ADDON_LOAD_FAILURES_NAMES+=("$addon_idx_tmpl")
+            locked_echo "Skipping over $addon_idx_tmpl due to missing required component template(s)."
+            record_failure "$addon_idx_tmpl"

            continue
        fi
    done

-    if [[ $ADDON_LOAD_FAILURES -eq 0 ]]; then
+    # Barrier: all addon index template PUTs must finish before tallying failures.
+    wait
+
+    collect_failures
+    if [[ $FAILED_COUNT -eq 0 ]]; then
        echo "All addon integration templates loaded successfully."

        touch "$ADDON_STATEFILE_SUCCESS"
    else
-        echo "Encountered $ADDON_LOAD_FAILURES failure(s) loading addon integration templates:"
-        for failed_template in "${ADDON_LOAD_FAILURES_NAMES[@]}"; do
+        echo "Encountered $FAILED_COUNT failure(s) loading addon integration templates:"
+        for failed_template in "${FAILED_NAMES[@]}"; do
            echo "  - $failed_template"
        done
        if [[ "$SHOULD_EXIT_ON_FAILURE" == "true" ]]; then
@@ -0,0 +1,175 @@
+#!/bin/bash
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+. /usr/sbin/so-common
+
+{%   from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
+{%- set DATA_RETENTION_METHOD = ELASTICSEARCHMERGED.data_retention_method %}
+
+# Root directory of the rendered templates; may be overridden through the environment.
+ELASTICSEARCH_TEMPLATES_DIR="${ELASTICSEARCH_TEMPLATES_DIR:-/opt/so/conf/elasticsearch/templates}"
+# Directories searched, in this order, by find_template_file.
+TEMPLATE_DIRS=(
+    "${ELASTICSEARCH_TEMPLATES_DIR}/index"
+    "${ELASTICSEARCH_TEMPLATES_DIR}/addon-index"
+)
+# Retention method rendered into this script from the merged Elasticsearch
+# settings. The quoted heredoc delimiter stops the shell from expanding anything
+# in the rendered value; the command substitution strips the trailing newline.
+DATA_RETENTION_METHOD=$(cat <<'EOF'
+{{ DATA_RETENTION_METHOD }}
+EOF
+)
+# Tally of data streams whose lifecycle update failed, reported at the end of the run.
+DLM_FAILURES=0
+DLM_FAILURE_NAMES=()
+
+# Refuse to continue with any retention method other than DLM or ILM.
+if [[ "$DATA_RETENTION_METHOD" != "DLM" && "$DATA_RETENTION_METHOD" != "ILM" ]]; then
+    echo "Unsupported data retention method $DATA_RETENTION_METHOD. Expected DLM or ILM."
+    exit 1
+fi
+
+# Sanity-check the shape of an index template file before it is used.
+#   $1 template_file - path to the template JSON
+# Succeeds when the document is an object and, where present, data_stream and
+# template.lifecycle are objects and template.lifecycle.data_retention is a
+# string. Otherwise prints an error naming the file and returns 1.
+validate_template_file() {
+    local template_file="$1"
+
+    if jq -e 'type == "object" and (.data_stream == null or (.data_stream | type == "object")) and (.template.lifecycle == null or (.template.lifecycle | type == "object")) and (.template.lifecycle.data_retention == null or (.template.lifecycle.data_retention | type == "string"))' >/dev/null 2>&1 "$template_file"; then
+        return 0
+    fi
+
+    echo "Invalid index template JSON: $template_file"
+    return 1
+}
+
+is_data_stream_template() {
+    jq -e '.data_stream | type == "object"' >/dev/null 2>&1 "$1"
+}
+
+has_data_stream_lifecycle() {
+    jq -e '.template.lifecycle | type == "object"' >/dev/null 2>&1 "$1"
+}
+
+get_data_retention() {
+    jq -r '.template.lifecycle.data_retention // ""' "$1"
+}
+
+find_template_file() {
+    local template="$1"
+    local template_dir
+    local template_file
+
+    for template_dir in "${TEMPLATE_DIRS[@]}"; do
+        template_file="${template_dir}/${template}-template.json"
+
+        if [[ -f "$template_file" ]]; then
+            echo "$template_file"
+            return 0
+        fi
+    done
+
+    return 1
+}
+
+# Apply a data stream lifecycle to one data stream.
+#   $1 data_stream    - data stream name
+#   $2 data_retention - retention to set; empty means no data_retention is sent
+# Reads the global "data_streams" (the _data_stream listing fetched by the main
+# script) to skip the PUT when the stream already reports the wanted lifecycle.
+# Returns 1 when the PUT fails, 0 otherwise.
+set_data_stream_lifecycle() {
+    local data_stream="$1"
+    local data_retention="$2"
+    local body
+    local output
+
+    # Idempotency check: nothing to do when the stream already has an enabled
+    # lifecycle with the same retention (or, for an empty retention, with none).
+    if [[ -n "$data_retention" ]]; then
+        if jq -e --arg data_stream "$data_stream" --arg data_retention "$data_retention" '.data_streams[]? | select(.name == $data_stream and .lifecycle.enabled == true and .lifecycle.data_retention == $data_retention)' >/dev/null 2>&1 <<< "$data_streams"; then
+            echo "DLM lifecycle already set for $data_stream with data_retention $data_retention, skipping."
+            return 0
+        fi
+    elif jq -e --arg data_stream "$data_stream" '.data_streams[]? | select(.name == $data_stream and .lifecycle.enabled == true and (.lifecycle.data_retention == null))' >/dev/null 2>&1 <<< "$data_streams"; then
+        echo "DLM lifecycle already set for $data_stream with indefinite retention, skipping."
+        return 0
+    fi
+
+    # Build the request body with jq so the retention value is JSON-escaped.
+    if [[ -n "$data_retention" ]]; then
+        body=$(jq -cn --arg data_retention "$data_retention" '{data_retention: $data_retention}')
+    else
+        # Setting indefinite retention
+        body='{}'
+    fi
+
+    # The response is captured only to keep it off stdout; it is not used afterwards.
+    if ! output=$(so-elasticsearch-query "_data_stream/${data_stream}/_lifecycle" -XPUT -d "$body" --retry 3 --retry-delay 5 --fail); then
+        echo "Failed to set data stream lifecycle for $data_stream."
+        return 1
+    fi
+
+    if [[ -n "$data_retention" ]]; then
+        echo "Set DLM lifecycle for $data_stream with data_retention $data_retention."
+    else
+        echo "Set DLM lifecycle for $data_stream with indefinite retention."
+    fi
+}
+
+# Disable the data stream lifecycle on one data stream.
+#   $1 data_stream - data stream name
+# Reads the global "data_streams" listing and sends the PUT only when the stream
+# reports a lifecycle that is not already disabled.
+# Returns 1 when the PUT fails, 0 otherwise (including when nothing had to be done).
+disable_data_stream_lifecycle() {
+    local data_stream="$1"
+    local body='{"enabled":false}'
+    local output
+
+    # The select matches only a stream with a lifecycle object whose "enabled" is not false.
+    if ! jq -e --arg data_stream "$data_stream" '.data_streams[]? | select(.name == $data_stream and .lifecycle != null and .lifecycle.enabled != false)' >/dev/null 2>&1 <<< "$data_streams"; then
+        # No action needed
+        return 0
+    fi
+
+    # The response is captured only to keep it off stdout; it is not used afterwards.
+    if ! output=$(so-elasticsearch-query "_data_stream/${data_stream}/_lifecycle" -XPUT -d "$body" --retry 3 --retry-delay 5 --fail); then
+        echo "Failed to disable data stream lifecycle for $data_stream."
+        return 1
+    fi
+
+    echo "Disabled DLM lifecycle for $data_stream."
+}
+
+process_data_stream() {
+    local data_stream="$1"
+    local data_retention="$2"
+
+    if [[ "$DATA_RETENTION_METHOD" == "DLM" ]]; then
+        set_data_stream_lifecycle "$data_stream" "$data_retention"
+    else
+        disable_data_stream_lifecycle "$data_stream"
+    fi
+}
+
+# NOTE(review): check_elasticsearch_responsive is not defined in this script;
+# presumably it is provided by /usr/sbin/so-common (sourced above) - confirm.
+check_elasticsearch_responsive
+
+# Snapshot of every data stream. It drives the loop below and is also read, as a
+# global, by set_data_stream_lifecycle and disable_data_stream_lifecycle.
+if ! data_streams=$(so-elasticsearch-query "_data_stream?format=json" --retry 3 --retry-delay 5 --fail); then
+    echo "Failed to retrieve data streams."
+    exit 1
+fi
+
+# One compact JSON object per data stream is read per iteration. The loop is fed
+# by process substitution, so it runs in the current shell and the failure
+# tally survives past "done".
+while read -r data_stream_config; do
+    data_stream=$(jq -r '.name' <<< "$data_stream_config")
+    template=$(jq -r '.template' <<< "$data_stream_config")
+
+    # Streams whose index template has no file in TEMPLATE_DIRS are skipped.
+    if ! template_file=$(find_template_file "$template"); then
+        echo "Skipping $data_stream: index template file not found for $template."
+        continue
+    fi
+
+    # NOTE(review): a single invalid template file aborts the whole run here,
+    # before the failure summary below is printed - confirm this is intended.
+    validate_template_file "$template_file" || exit 1
+
+    if ! is_data_stream_template "$template_file"; then
+        echo "Skipping $data_stream: $template_file is not a data stream template."
+        continue
+    fi
+
+    # Under DLM only templates that define template.lifecycle are managed; under
+    # ILM every data stream template continues on to have its lifecycle disabled.
+    if [[ "$DATA_RETENTION_METHOD" == "DLM" ]] && ! has_data_stream_lifecycle "$template_file"; then
+        echo "Skipping $data_stream: $template_file does not define data stream lifecycle."
+        continue
+    fi
+
+    data_retention=$(get_data_retention "$template_file")
+
+    # A failed update is recorded and the loop moves on to the next stream.
+    if ! process_data_stream "$data_stream" "$data_retention"; then
+        DLM_FAILURES=$((DLM_FAILURES + 1))
+        DLM_FAILURE_NAMES+=("$data_stream")
+    fi
+done < <(jq -c '.data_streams[]' <<< "$data_streams")
+
+# Summary: exit non-zero when any data stream could not be updated.
+if [[ $DLM_FAILURES -eq 0 ]]; then
+    echo "Data stream lifecycle updates completed successfully."
+else
+    echo "Encountered $DLM_FAILURES failure(s) updating data stream lifecycle:"
+    for failed_data_stream in "${DLM_FAILURE_NAMES[@]}"; do
+        echo "  - $failed_data_stream"
+    done
+    exit 1
+fi
@@ -6,6 +6,48 @@

 . /usr/sbin/so-common

+MAX_JOBS=${MAX_ILM_JOBS:-10}
+
+# Lock used to serialize block writes so concurrent jobs never interleave their output.
+ILM_OUTPUT_LOCK=$(mktemp)
+ILM_FAIL_FILE=$(mktemp)
+trap 'rm -f "$ILM_OUTPUT_LOCK" "$ILM_FAIL_FILE"' EXIT
+
+# Policies are loaded concurrently (up to MAX_JOBS at a time) for speed. Each policy's block is
+# printed the moment its curl returns, so output appears in COMPLETION ORDER, not the order
+# policies are defined in configuration.
+echo "Loading ILM policies concurrently; output below appears in completion order, not configuration order."
+echo
+
+put_policy() {
+  local desc="$1" policyname="$2" data="$3" result rc=0
+  if ! result=$(curl -K /opt/so/conf/elasticsearch/curl.config -s -k -L --fail \
+    -X PUT "https://localhost:9200/_ilm/policy/${policyname}" \
+    -H 'Content-Type: application/json' -d"${data}" 2>&1); then
+    rc=1
+  elif ! jq -e '.acknowledged == true' <<<"$result" >/dev/null 2>&1; then
+    rc=1
+  fi
+
+  # curl above ran in parallel; serialize just this block write so concurrent jobs never interleave.
+  {
+    flock 200
+    printf 'Setting up %s policy...\n%s\n\n' "${desc}" "${result}"
+    if (( rc != 0 )); then
+      printf '%s\n' "${policyname}" >>"$ILM_FAIL_FILE"
+    fi
+  } 200>>"${ILM_OUTPUT_LOCK}"
+
+  return "$rc"
+}
+
+# Block until fewer than MAX_JOBS background curls are running.
+# A limit below 1 (e.g. MAX_ILM_JOBS=0 or a value that evaluates to 0) is treated as 1: otherwise
+# the condition could never become false and `wait -n`, which returns immediately when there are
+# no children, would spin forever.
+throttle() {
+  while (( $(jobs -rp | wc -l) >= (MAX_JOBS > 0 ? MAX_JOBS : 1) )); do
+    # wait -n returns the finished job's status; a non-zero status here is not fatal.
+    wait -n || true
+  done
+}
+
 {%- from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS %}
 {%- if GLOBALS.role != "so-heavynode" %}
 {%-   from 'elasticsearch/template.map.jinja' import ALL_ADDON_SETTINGS %}
@@ -14,35 +56,36 @@
 {%- for index, settings in ES_INDEX_SETTINGS.items() %}
 {%-   if settings.policy is defined %}
 {%-     if index == 'so-logs-detections.alerts' %}
-  echo
-  echo "Setting up so-logs-detections.alerts-so policy..."
-  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-so" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
-  echo
+  throttle
+  put_policy "so-logs-detections.alerts-so" "{{ index }}-so" '{ "policy": {{ settings.policy | tojson(true) }} }' &
 {%-     elif index == 'so-logs-soc' %}
-  echo
-  echo "Setting up so-soc-logs policy..."
-  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/so-soc-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
-  echo
-  echo
-  echo "Setting up {{ index }}-logs policy..."
-  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
-  echo
+  throttle
+  put_policy "so-soc-logs" "so-soc-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
+  throttle
+  put_policy "{{ index }}-logs" "{{ index }}-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
 {%-     else %}
-  echo
-  echo "Setting up {{ index }}-logs policy..."
-  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
-  echo
+  throttle
+  put_policy "{{ index }}-logs" "{{ index }}-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
 {%-     endif %}
 {%-   endif %}
 {%- endfor %}
-echo
 {%- if GLOBALS.role != "so-heavynode" %}
 {%-   for index, settings in ALL_ADDON_SETTINGS.items() %}
 {%-     if settings.policy is defined %}
-  echo
-  echo "Setting up {{ index }}-logs policy..."
-  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
-  echo
+  throttle
+  put_policy "{{ index }}-logs" "{{ index }}-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
 {%-     endif %}
 {%-   endfor %}
 {%- endif %}
+
+wait || true
+
+if [[ -s "$ILM_FAIL_FILE" ]]; then
+  echo "ERROR: Failed to load ILM policy(s):"
+  while read -r POLICY; do
+    echo " - $POLICY"
+  done < "$ILM_FAIL_FILE"
+  exit 1
+else
+  echo "Successfully loaded all ILM policies."
+fi
@@ -1,3 +1,10 @@
 global:
  pcapengine: SURICATA
-  pipeline: REDIS
+  pipeline: REDIS
+  push:
+    enabled: true
+    highstate_interval_hours: 2
+    debounce_seconds: 30
+    drain_interval: 15
+    batch: '25%'
+    batch_wait: 15
@@ -59,4 +59,41 @@ global:
    description: Allows use of Endgame with Security Onion. This feature requires a license from Endgame.
    global: True
    advanced: True
+  push:
+    enabled:
+      description: Master kill-switch for the active push feature. When disabled, rule and pillar changes are picked up at the next scheduled highstate instead of being pushed immediately.
+      forcedType: bool
+      helpLink: push
+      global: True
+    highstate_interval_hours:
+      description: How often every minion in the grid runs a scheduled state.highstate, in hours. Lower values keep minions closer in sync at the cost of more load; higher values reduce load but increase worst-case latency for non-pushed changes. The salt-minion health check restarts a minion if its last highstate is older than this value plus one hour.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True
+    debounce_seconds:
+      description: Trailing-edge debounce window in seconds. A push intent must be quiet for this long before the drainer dispatches. Rapid bursts of edits within this window coalesce into one dispatch.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True
+    drain_interval:
+      description: How often the push drainer checks for ready intents, in seconds. Small values lower dispatch latency at the cost of more background work on the manager.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True
+    batch:
+      description: "Host batch size for push orchestrations. A number (e.g. '10') or a percentage (e.g. '25%'). Limits how many minions run the push state at once so large fleets don't thundering-herd."
+      helpLink: push
+      global: True
+      advanced: True
+      regex: '^([0-9]+%?)$'
+      regexFailureMessage: Enter a whole number or a whole-number percentage (e.g. 10 or 25%).
+    batch_wait:
+      description: Seconds to wait between host batches in a push orchestration. Gives the fleet time to breathe between waves.
+      forcedType: int
+      helpLink: push
+      global: True
+      advanced: True

@@ -58,6 +58,7 @@ so-hydra:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
+    # Intentionally unless-stopped -- matches the fleet default.
    - restart_policy: unless-stopped
    - watch:
      - file: hydraconfig
@@ -15,6 +15,7 @@ include:
 so-idh:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-idh:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - name: so-idh
    - detach: True
    - network_mode: host
@@ -18,6 +18,7 @@ include:
 so-influxdb:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-influxdb:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: influxdb
    - networks:
      - sobridge:
@@ -27,12 +27,13 @@ include:
 so-kafka:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-kafka:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-kafka
    - name: so-kafka
    - networks:
      - sobridge:
        - ipv4_address: {{ DOCKERMERGED.containers['so-kafka'].ip }}
-    - user: kafka
+    - user: "960"
    - environment:
        KAFKA_HEAP_OPTS: -Xmx2G -Xms1G
        KAFKA_OPTS: "-javaagent:/opt/jolokia/agents/jolokia-agent-jvm-javaagent.jar=port=8778,host={{ DOCKERMERGED.containers['so-kafka'].ip }},policyLocation=file:/opt/jolokia/jolokia.xml {%- if KAFKA_EXTERNAL_ACCESS %} -Djava.security.auth.login.config=/opt/kafka/config/kafka_server_jaas.conf {% endif -%}"
@@ -6,6 +6,7 @@
 {% from 'allowed_states.map.jinja' import allowed_states %}
 {% if sls.split('.')[0] in allowed_states %}
 {%   from 'docker/docker.map.jinja' import DOCKERMERGED %}
+{%   from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
 {%   from 'vars/globals.map.jinja' import GLOBALS %}

 include:
@@ -16,8 +17,9 @@ include:
 so-kibana:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-kibana:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: kibana
-    - user: kibana
+    - user: "932:0"
    - networks:
      - sobridge:
        - ipv4_address: {{ DOCKERMERGED.containers['so-kibana'].ip }}
@@ -60,6 +62,19 @@ so-kibana:
    - watch:
      - file: kibanaconfig

+wait_for_so-kibana:
+  http.wait_for_successful_query:
+    - name: "http://localhost:5601/api/status"
+    - username: 'so_elastic'
+    - password: '{{ ELASTICSEARCHMERGED.auth.users.so_elastic_user.pass }}'
+    - ssl: True
+    - verify_ssl: False
+    - status: 200
+    - wait_for: 300
+    - request_interval: 15
+    - require:
+      - docker_container: so-kibana
+
 delete_so-kibana_so-status.disabled:
  file.uncomment:
    - name: /opt/so/conf/so-status/so-status.conf
@@ -51,6 +51,7 @@ so-kratos:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
+    # Intentionally unless-stopped -- matches the fleet default.
    - restart_policy: unless-stopped
    - watch:
      - file: kratosschema
@@ -103,7 +103,7 @@ kratos:
  config:
    session:
      lifespan: 
-        description: Defines the length of a login session.
+        description: Defines the length of a login session before it times out and requires a new login.
        global: True
        helpLink: kratos
      whoami:
@@ -26,12 +26,12 @@ logstash:
    manager:
      - so/0011_input_endgame.conf
      - so/0012_input_elastic_agent.conf.jinja
-      - so/0013_input_lumberjack_fleet.conf
+      - so/0013_input_lumberjack_fleet.conf.jinja
      - so/9999_output_redis.conf.jinja
    receiver:
      - so/0011_input_endgame.conf
      - so/0012_input_elastic_agent.conf.jinja
-      - so/0013_input_lumberjack_fleet.conf
+      - so/0013_input_lumberjack_fleet.conf.jinja
      - so/9999_output_redis.conf.jinja
    search:
      - so/0900_input_redis.conf.jinja
@@ -69,4 +69,5 @@ logstash:
    pipeline_x_batch_x_size: 125
    pipeline_x_ecs_compatibility: disabled
  dmz_nodes: []
+  latency_metrics: False

@@ -28,12 +28,13 @@ include:
 so-logstash:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-logstash:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-logstash
    - name: so-logstash
    - networks:
      - sobridge:
        - ipv4_address: {{ DOCKERMERGED.containers['so-logstash'].ip }}
-    - user: logstash
+    - user: "931:0"
    - extra_hosts:
    {% for node in LOGSTASH_NODES %}
    {%   for hostname, ip in node.items() %}
@@ -1,3 +1,4 @@
+{%- from 'logstash/map.jinja' import LOGSTASH_MERGED %}
 input {
  elastic_agent {
    port => 5055
@@ -11,10 +12,15 @@ input {
  }
 }
 filter {
-if ![metadata] {
-  mutate {
-    rename => {"@metadata" => "metadata"}
+  {% if LOGSTASH_MERGED.get('latency_metrics', False) %}
+  ruby {
+    code => "event.set('[_tmp][logstash_from_agent]', Time.now().utc.iso8601(3));"
+  }
+  {% endif %}
+  if ![metadata] {
+    mutate {
+      rename => {"@metadata" => "metadata"}
+    }
  }
 }
-}

@@ -1,23 +0,0 @@
-input {
-  elastic_agent {
-    port => 5056
-    tags => [ "elastic-agent", "fleet-lumberjack-input" ]
-    ssl_enabled => true
-    ssl_certificate => "/usr/share/logstash/elasticfleet-lumberjack.crt"
-    ssl_key => "/usr/share/logstash/elasticfleet-lumberjack.key"
-    ecs_compatibility => v8
-    id => "fleet-lumberjack-in"  
-    codec => "json"
-  }
-}
-
-
-filter {
-if ![metadata] {
-  mutate {
-    rename => {"@metadata" => "metadata"}
-  }
-}
-}
-
-
@@ -0,0 +1,26 @@
+{%- from 'logstash/map.jinja' import LOGSTASH_MERGED %}
+input {
+  elastic_agent {
+    port => 5056
+    tags => [ "elastic-agent", "fleet-lumberjack-input" ]
+    ssl_enabled => true
+    ssl_certificate => "/usr/share/logstash/elasticfleet-lumberjack.crt"
+    ssl_key => "/usr/share/logstash/elasticfleet-lumberjack.key"
+    ecs_compatibility => v8
+    id => "fleet-lumberjack-in"
+    codec => "json"
+  }
+}
+
+filter {
+  {% if LOGSTASH_MERGED.get('latency_metrics', False) %}
+  ruby {
+    code => "event.set('[_tmp][logstash_from_fleet]', Time.now().utc.iso8601(3));"
+  }
+  {% endif %}
+  if ![metadata] {
+    mutate {
+      rename => {"@metadata" => "metadata"}
+    }
+  }
+}
@@ -1,3 +1,4 @@
+{%- from 'logstash/map.jinja' import LOGSTASH_MERGED %}
 {%- set kafka_password = salt['pillar.get']('kafka:config:password') %}
 {%- set kafka_trustpass = salt['pillar.get']('kafka:config:trustpass') %}
 {%- set kafka_brokers = salt['pillar.get']('kafka:nodes', {}) %}
@@ -30,6 +31,11 @@ input {
    }
 }
 filter {
+  {% if LOGSTASH_MERGED.get('latency_metrics', False) %}
+  ruby {
+    code => "event.set('[_tmp][logstash_from_kafka]', Time.now().utc.iso8601(3));"
+  }
+  {% endif %}
  if ![metadata] {
    mutate {
      rename => { "@metadata" => "metadata" }
@@ -1,4 +1,4 @@
-{%- from 'logstash/map.jinja' import LOGSTASH_REDIS_NODES with context %}
+{%- from 'logstash/map.jinja' import LOGSTASH_REDIS_NODES, LOGSTASH_MERGED %}
 {%- set REDIS_PASS = salt['pillar.get']('redis:config:requirepass') %}

 {%- for index in range(LOGSTASH_REDIS_NODES|length) %}
@@ -18,3 +18,10 @@ input {
 }
 {%   endfor %}
 {% endfor -%}
+filter {
+  {% if LOGSTASH_MERGED.get('latency_metrics', False) %}
+  ruby {
+    code => "event.set('[_tmp][logstash_from_redis]', Time.now().utc.iso8601(3));"
+  }
+  {% endif %}
+}
@@ -1,3 +1,11 @@
+{%- from 'logstash/map.jinja' import LOGSTASH_MERGED %}
+{% if LOGSTASH_MERGED.get('latency_metrics', False) %}
+filter {
+  ruby {
+    code => "event.set('[_tmp][logstash_to_elasticsearch]', Time.now().utc.iso8601(3));"
+  }
+}
+{% endif %}
 output {
  if "elastic-agent" in [tags] and "so-ip-mappings" in [tags] {
    elasticsearch {
@@ -13,13 +13,20 @@ filter {
                    add_tag => "fleet-lumberjack-{{ GLOBALS.hostname }}"
          }
  }
-
-output { 
-    lumberjack { 
-        codec => json 
+{%- from 'logstash/map.jinja' import LOGSTASH_MERGED %}
+{% if LOGSTASH_MERGED.get('latency_metrics', False) %}
+filter {
+  ruby {
+    code => "event.set('[_tmp][fleet_to_logstash]', Time.now().utc.iso8601(3));"
+  }
+}
+{% endif %}
+output {
+    lumberjack {
+        codec => json
        hosts => {{  FAILOVER_LOGSTASH_NODES }}
        ssl_certificate => "/usr/share/filebeat/ca.crt"
-        port => 5056 
+        port => 5056
        id => "fleet-lumberjack-{{ GLOBALS.hostname }}"
-        } 
+        }
    }
@@ -1,10 +1,17 @@
+{%- from 'logstash/map.jinja' import LOGSTASH_MERGED %}
 {%- if grains.role in ['so-heavynode', 'so-receiver'] %}
  {%- set HOST = GLOBALS.hostname %}
 {%- else %}
  {%- set HOST = GLOBALS.manager %}
 {%- endif %}
 {%- set REDIS_PASS = salt['pillar.get']('redis:config:requirepass') %}
-
+{% if LOGSTASH_MERGED.get('latency_metrics', False) %}
+filter {
+  ruby {
+    code => "event.set('[_tmp][logstash_to_redis]', Time.now().utc.iso8601(3));"
+  }
+}
+{% endif %}
 output {
 	redis {
 		host => '{{ HOST }}'
@@ -86,3 +86,8 @@ logstash:
    multiline: True
    advanced: True
    forcedType: "[]string"
+  latency_metrics:
+    description: Enable latency metrics within events processed by Logstash. Useful for pinpointing log ingest delay.
+    forcedType: bool
+    global: False
+    advanced: True
@@ -0,0 +1,21 @@
+{% from 'vars/globals.map.jinja' import GLOBALS %}
+{% from 'global/map.jinja' import GLOBALMERGED %}
+
+include:
+  - salt.minion
+
+{% if GLOBALS.is_manager and GLOBALMERGED.push.enabled %}
+salt_beacons_pushstate:
+  file.managed:
+    - name: /etc/salt/minion.d/beacons_pushstate.conf
+    - source: salt://manager/files/beacons_pushstate.conf.jinja
+    - template: jinja
+    - watch_in:
+      - service: salt_minion_service
+{% else %}
+salt_beacons_pushstate:
+  file.absent:
+    - name: /etc/salt/minion.d/beacons_pushstate.conf
+    - watch_in:
+      - service: salt_minion_service
+{% endif %}
@@ -0,0 +1,41 @@
+{% from 'global/map.jinja' import GLOBALMERGED %}
+beacons:
+  pillar_db:
+    - interval: {{ GLOBALMERGED.push.drain_interval }}
+    - disable_during_state_run: True
+  inotify:
+    - disable_during_state_run: True
+    - coalesce: True
+    - files:
+        /opt/so/saltstack/local/salt/suricata/rules:
+          mask:
+            - close_write
+            - moved_to
+            - delete
+          recurse: True
+          auto_add: True
+          exclude:
+            - '\.sw[a-z]$':
+                regex: True
+            - '~$':
+                regex: True
+            - '/4913$':
+                regex: True
+            - '/\.#':
+                regex: True
+        /opt/so/saltstack/local/salt/strelka/rules/compiled:
+          mask:
+            - close_write
+            - moved_to
+            - delete
+          recurse: True
+          auto_add: True
+          exclude:
+            - '\.sw[a-z]$':
+                regex: True
+            - '~$':
+                regex: True
+            - '/4913$':
+                regex: True
+            - '/\.#':
+                regex: True
@@ -15,6 +15,7 @@ include:
  - manager.elasticsearch
  - manager.kibana
  - manager.managed_soc_annotations
+  - manager.beacons

 repo_log_dir:
  file.directory:
@@ -231,6 +232,7 @@ surifiltersrules:
    - user: 939
    - group: 939

+
 {% else %}

 {{sls}}_state_not_allowed:
@@ -16,40 +16,35 @@
 {%       endif %}
 {%     endfor %}
 {%   endfor %}
+{%   set soc_annotation_lines = [] %}
+{%   set defaults_lines = [] %}
+{%   for k in matched_integration_names %}
+{%     do soc_annotation_lines.append('    ' ~ k ~ ': *dataStreamSettings') %}
+{%     do defaults_lines.append('    ' ~ k ~ ':') %}
+{%     set defaults_yaml = salt['slsutil.serialize']('yaml', ADDON_INTEGRATION_DEFAULTS[k], default_flow_style=False).strip() %}
+{%     for line in defaults_yaml.splitlines() %}
+{%       do defaults_lines.append('      ' ~ line) %}
+{%     endfor %}
+{%   endfor %}
 {%   set es_soc_annotations = '/opt/so/saltstack/default/salt/elasticsearch/soc_elasticsearch.yaml' %}
-{{   es_soc_annotations }}:
-     file.serialize:
-       - dataset:
-           {% set data = salt['file.read'](es_soc_annotations) | load_yaml %}
-           {% set es = data.get('elasticsearch', {}) %}
-           {% set index_settings = es.get('index_settings', {}) %}
-           {% set input = index_settings.get('so-logs', {}) %}
-           {% for k in matched_integration_names %}
-           {%   do index_settings.update({k: input}) %}
-           {% endfor %}
-           {% for k in addon_integration_keys %}
-           {%   if k not in matched_integration_names and k in index_settings %}
-           {%     do index_settings.pop(k) %}
-           {%   endif %}
-           {% endfor %}
-           {{ data }}
+manage_soc_annotations:
+  file.blockreplace:
+    - name: {{ es_soc_annotations }}
+    - marker_start: '    # START managed SOC integration annotations'
+    - marker_end: '    # END managed SOC integration annotations'
+    - content: {{ soc_annotation_lines | join('\n') | tojson }}
+    - insert_after_match: '^    # Managed SOC integration annotations are inserted below this line\.'
+    - append_if_not_found: False
+    - show_changes: True

 {#   Managed elasticsearch/defaults.yaml file for enabling 'Revert to default' via SOC UI for newly added config items #}
 {%   set es_defaults = '/opt/so/saltstack/default/salt/elasticsearch/defaults.yaml' %}
 {{   es_defaults }}:
-     file.serialize:
-       - dataset:
-           {% set data = salt['file.read'](es_defaults) | load_yaml %}
-           {% set es = data.get('elasticsearch', {}) %}
-           {% set index_settings = es.get('index_settings', {}) %}
-           {% for k in matched_integration_names %}
-           {%   set input = ADDON_INTEGRATION_DEFAULTS[k] %}
-           {%     do index_settings.update({k: input})%}
-           {% endfor %}
-           {% for k in addon_integration_keys %}
-           {%   if k not in matched_integration_names and k in index_settings %}
-           {%     do index_settings.pop(k) %}
-           {%   endif %}
-           {% endfor %}
-           {{ data }}
-{% endif %}
+  file.blockreplace:
+    - marker_start: '    # START managed SOC integration defaults'
+    - marker_end: '    # END managed SOC integration defaults'
+    - content: {{ defaults_lines | join('\n') | tojson }}
+    - insert_after_match: '^  index_settings:$'
+    - append_if_not_found: False
+    - show_changes: True
+{% endif %}
@@ -31,11 +31,13 @@ sync_es_users:
      - http: wait_for_kratos
      - file: so-user.lock # require so-user.lock file to be missing

-# we dont want this added too early in setup, so we add the onlyif to verify 'startup_states: highstate'
-# is in the minion config. That line is added before the final highstate during setup
+# we don't want this added too early in setup, so the onlyif gates on the
+# /opt/so/state/setup-complete marker. The marker is written by
+# mark_setup_complete in setup/so-functions just before the final setup
+# highstate (and by an upgrade-path state for systems set up under the old gate).
 so-user_sync:
  cron.present:
    - user: root
    - name: 'PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/root/bin /usr/sbin/so-user sync &>> /opt/so/log/soc/sync.log'
    - identifier: so-user_sync
-    - onlyif: "grep -x 'startup_states: highstate' /etc/salt/minion"
+    - onlyif: "test -e /opt/so/state/setup-complete"
@@ -0,0 +1,117 @@
+#!/bin/bash
+#
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Runs once per boot on managers (via so-boot-mine-update.service), before
+# so-boot-highstate.service. Waits for the responsive minion set to settle, pushes
+# mine.update, waits until every up minion has actually reported to the mine, then
+# warms the master's per-minion pillar cache so the mine-backed node pillars (node
+# IPs, ES/Redis/Logstash/hypervisor discovery -- some glob- and some pillar/grain-
+# targeted) are complete before the boot highstate renders them. Otherwise a node
+# that is up but not yet fully reported gets dropped from those pillars and torn
+# out of the configs they build (e.g. so-elasticsearch ExtraHosts -> container recreate).
+
+MAX_WAIT=${MINE_UPDATE_MAX_WAIT:-180}   # hard backstop only
+INTERVAL=10
+STABLE_CHECKS=3                          # up-count must hold steady this many polls
+elapsed=0
+prev=-1
+stable=0
+up=0
+
+# Wait for the *reachable* minion set to settle rather than for every accepted
+# key to report up: an operator may accept a minion's key and then intentionally
+# power off that host, so requiring up >= accepted would never be satisfied and
+# we'd always burn the full MAX_WAIT. Once the responsive count stops growing we
+# stop waiting and run mine.update against whoever is up.
+while [ "$elapsed" -lt "$MAX_WAIT" ]; do
+  up=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null \
+    | python3 -c 'import sys,json; print(len(json.load(sys.stdin)))' 2>/dev/null)
+  up=${up:-0}
+  if [ "$up" -gt 0 ] && [ "$up" -eq "$prev" ]; then
+    stable=$((stable + 1))
+    [ "$stable" -ge "$STABLE_CHECKS" ] && break
+  else
+    stable=0
+  fi
+  prev=$up
+  sleep "$INTERVAL"
+  elapsed=$((elapsed + INTERVAL))
+done
+
+echo "so-boot-mine-update: ${up} minions up (settled after ${elapsed}s); running mine.update"
+/usr/bin/salt '*' mine.update --out=txt
+
+# A node that is up but has not yet re-reported network.ip_addrs to the mine is
+# silently dropped from mine-backed pillars (elasticsearch:nodes, node_data, ...)
+# when highstate recompiles them -- which e.g. removes it from so-elasticsearch
+# ExtraHosts and forces a container recreate. After the broad mine.update above,
+# wait until every up minion actually has network.ip_addrs in the mine, re-pushing
+# mine.update to stragglers, before releasing the boot highstate. Bounded by the
+# same MAX_WAIT backstop so a slow/down node never blocks boot indefinitely.
+missing=""
+while [ "$elapsed" -lt "$MAX_WAIT" ]; do
+  up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null)
+  mine_json=$(/usr/bin/salt-run mine.get '*' network.ip_addrs tgt_type=glob --out=json 2>/dev/null)
+  # The helper prints every up minion that has no non-empty network.ip_addrs entry in the mine.
+  # It exits non-zero when either runner produced output that is not valid JSON; that must not be
+  # read as "nothing missing", so keep polling (still bounded by MAX_WAIT) instead of declaring
+  # the mine complete.
+  if ! missing=$(printf '%s' "$up_json" | python3 -c '
+import sys, json
+up = set(json.load(sys.stdin) or [])
+mine = {k for k, v in (json.loads(sys.argv[1]) or {}).items() if v}
+print("\n".join(sorted(up - mine)))
+' "$mine_json" 2>/dev/null); then
+    echo "so-boot-mine-update: could not parse manage.up/mine.get output; retrying"
+    # Non-empty placeholder so the backstop warning below still fires if we run out of time.
+    missing="(unknown - mine state unreadable)"
+    sleep "$INTERVAL"
+    elapsed=$((elapsed + INTERVAL))
+    continue
+  fi
+  if [ -z "$missing" ]; then
+    echo "so-boot-mine-update: mine complete for all up minions after ${elapsed}s"
+    break
+  fi
+  # $(echo $missing) is intentionally unquoted: it flattens the newline-separated list onto one line.
+  echo "so-boot-mine-update: mine missing up minion(s): $(echo $missing); re-running mine.update"
+  for m in $missing; do /usr/bin/salt "$m" mine.update --out=txt; done
+  sleep "$INTERVAL"
+  elapsed=$((elapsed + INTERVAL))
+done
+[ -n "$missing" ] && echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; up minion(s) still absent from mine: $(echo $missing); highstate may drop them from configs"
+
+# The pillar/compound-targeted node pillars (elasticsearch:nodes, redis:nodes,
+# logstash:nodes, hypervisor:nodes) resolve their target against the master's
+# per-minion data cache (grains+pillar in .../minions/<id>/data.p), populated only
+# when a minion's pillar is (re)compiled -- separately from the mine. A freshly
+# booted node can be in the mine (glob/node_data sees it) yet absent from that
+# cache, so it is dropped from those pillars and from the configs they build (e.g.
+# so-elasticsearch ExtraHosts). Force a synchronous pillar refresh so the master
+# caches every up node's pillar; refresh_pillar wait=True returns only once the
+# pillar is recompiled (and thus cached for matching). Retry stragglers <= MAX_WAIT.
+echo "so-boot-mine-update: warming master pillar cache for pillar/grain-targeted node pillars"
+/usr/bin/salt '*' saltutil.refresh_pillar wait=True --out=txt
+missing=""
+while [ "$elapsed" -lt "$MAX_WAIT" ]; do
+  up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null)
+  cached_json=$(/usr/bin/salt-run cache.pillar tgt='*' --out=json 2>/dev/null)
+  # The helper prints every up minion that has no non-empty entry in the cache.pillar output.
+  # It exits non-zero when either runner produced output that is not valid JSON; that must not be
+  # read as "everything cached", so keep polling (still bounded by MAX_WAIT) instead of declaring
+  # the cache warm.
+  if ! missing=$(printf '%s' "$up_json" | python3 -c '
+import sys, json
+up = set(json.load(sys.stdin) or [])
+cached = {k for k, v in (json.loads(sys.argv[1]) or {}).items() if v}
+print("\n".join(sorted(up - cached)))
+' "$cached_json" 2>/dev/null); then
+    echo "so-boot-mine-update: could not parse manage.up/cache.pillar output; retrying"
+    # Non-empty placeholder so the backstop warning below still fires if we run out of time.
+    missing="(unknown - pillar cache state unreadable)"
+    sleep "$INTERVAL"
+    elapsed=$((elapsed + INTERVAL))
+    continue
+  fi
+  if [ -z "$missing" ]; then
+    echo "so-boot-mine-update: pillar cache warm for all up minions after ${elapsed}s"
+    break
+  fi
+  # $(echo $missing) is intentionally unquoted: it flattens the newline-separated list onto one line.
+  echo "so-boot-mine-update: pillar not yet cached for: $(echo $missing); refreshing"
+  for m in $missing; do /usr/bin/salt "$m" saltutil.refresh_pillar wait=True --out=txt; done
+  sleep "$INTERVAL"
+  elapsed=$((elapsed + INTERVAL))
+done
+[ -n "$missing" ] && echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; pillar not cached for: $(echo $missing); pillar-targeted pillars may drop them"
+
+# Log what the mine-backed pillars render so the boot-time state is inspectable.
+/usr/bin/salt-call saltutil.refresh_pillar >/dev/null 2>&1
+sleep 2
+for key in node_data elasticsearch:nodes; do
+  rendered=$(/usr/bin/salt-call --out=json pillar.get "$key" 2>/dev/null \
+    | python3 -c 'import sys,json; print(json.dumps(json.load(sys.stdin).get("local"), indent=2, sort_keys=True))' 2>/dev/null)
+  echo "so-boot-mine-update: ${key} rendered as:"
+  echo "${rendered:-null}"
+done
+exit 0
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Imports detection overrides (e.g. from so-detections-backup) into the so-detection
+# index. Reads <publicId>.<ext> files (NDJSON, one override per line) from a source
+# directory, looks up the matching detection by publicId+engine, validates each
+# override against the same rules SOC enforces, dedupes against existing overrides
+# (operational fields only), and appends new ones.
+
+import argparse
+import ipaddress
+import json
+import os
+import re
+import sys
+from datetime import datetime
+
+import requests
+from requests.auth import HTTPBasicAuth
+import urllib3
+
+# make_session() turns off TLS certificate verification on its session, so
+# silence the InsecureRequestWarning urllib3 would emit for those requests.
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# Default value of the --index option.
+DEFAULT_INDEX = "so-detection"
+# File main() hands to make_session(), which scans it for a 'user =' line.
+AUTH_FILE = "/opt/so/conf/elasticsearch/curl.config"
+# Base URL used for every search and update request.
+ES_URL = "https://localhost:9200"
+
+# Engines we know how to handle and the file extension the backup script writes.
+# The keys double as the allowed values of the --engine option.
+ENGINES = {
+    "suricata": "txt",
+}
+
+# Standard Suricata variables that ship with Security Onion. Anything else
+# referenced in an override is "custom" and the user needs to make sure it
+# exists in SOC Config before the override will function.
+BUILTIN_SURICATA_VARS = {
+    "$HOME_NET", "$EXTERNAL_NET",
+    "$HTTP_SERVERS", "$DNS_SERVERS", "$SQL_SERVERS", "$SMTP_SERVERS",
+    "$TELNET_SERVERS", "$AIM_SERVERS", "$DC_SERVERS", "$MODBUS_SERVER",
+    "$MODBUS_CLIENT", "$ENIP_CLIENT", "$ENIP_SERVER",
+    "$HTTP_PORTS", "$SHELLCODE_PORTS", "$ORACLE_PORTS", "$SSH_PORTS",
+    "$FTP_PORTS", "$FILE_DATA_PORTS",
+}
+
+# Matches "$NAME" tokens made of uppercase letters, digits and underscores;
+# collect_custom_vars() applies it to every string field of an override.
+VAR_PATTERN = re.compile(r"\$[A-Z_][A-Z0-9_]*")
+
+# Canonical valid values, per securityonion-soc/model/detection.go.
+SURICATA_OVERRIDE_TYPES = {"suppress", "threshold", "modify"}
+SUPPRESS_TRACKS = {"by_src", "by_dst", "by_either"}
+THRESHOLD_TRACKS = {"by_src", "by_dst", "by_both"}
+THRESHOLD_TYPES = {"limit", "threshold", "both"}
+
+# Printed by confirm_proceed() before every run, dry-run included.
+STALE_WARNING = """\
+WARNING: so-detections-backup does not remove backup files when overrides are
+deleted via the Security Onion web UI. As a result, files in the source
+directory may represent overrides that were intentionally deleted and should
+NOT be re-imported.
+
+Before continuing, verify that the source directory reflects the overrides you
+actually want imported. Remove any files corresponding to overrides you previously deleted.
+"""
+
+
+def make_session(auth_file):
+    """Build a requests session authenticated from a curl-style config file.
+
+    The first line of ``auth_file`` that begins with ``user =`` supplies the
+    credentials: every double quote is stripped and the remainder is split on
+    the first ``:`` into username and password. The returned session uses HTTP
+    basic auth, sends a JSON content type, and does not verify TLS certificates.
+
+    Raises RuntimeError when the file has no ``user =`` line.
+    """
+    credentials = None
+    with open(auth_file, "r") as handle:
+        for entry in handle:
+            if entry.startswith("user ="):
+                _, _, raw = entry.partition("=")
+                credentials = raw.strip().replace('"', "")
+                break
+    if credentials is None:
+        raise RuntimeError(f"Could not find 'user =' line in {auth_file}")
+
+    username, _, secret = credentials.partition(":")
+    http = requests.Session()
+    http.auth = HTTPBasicAuth(username, secret)
+    http.headers.update({"Content-Type": "application/json"})
+    http.verify = False
+    return http
+
+
+def find_detection(session, index, public_id, engine):
+    """Search ``index`` for the detection matching ``public_id`` and ``engine``.
+
+    Returns ``(doc_id, doc_index, existing_overrides)`` for the first hit, with
+    a missing or empty overrides field normalised to ``[]``. Returns
+    ``(None, None, None)`` when nothing matches. HTTP error statuses propagate
+    from ``raise_for_status()``.
+    """
+    term_filters = [
+        {"term": {"so_detection.publicId": public_id}},
+        {"term": {"so_detection.engine": engine}},
+    ]
+    # Two results are requested so an unexpected duplicate can be reported.
+    payload = {"query": {"bool": {"must": term_filters}}, "size": 2}
+    response = session.get(f"{ES_URL}/{index}/_search", json=payload)
+    response.raise_for_status()
+
+    matches = response.json().get("hits", {}).get("hits", [])
+    if not matches:
+        return None, None, None
+    if len(matches) > 1:
+        # Shouldn't happen — publicId is unique per engine — but flag it.
+        print(f"  WARN: {len(matches)} detections matched publicId={public_id} engine={engine}; using first")
+
+    first = matches[0]
+    current = first["_source"].get("so_detection", {}).get("overrides") or []
+    return first["_id"], first["_index"], current
+
+
+def update_overrides(session, doc_index, doc_id, overrides):
+    """Send a partial-document update that sets so_detection.overrides.
+
+    Posts to the ``_update`` endpoint of ``doc_index`` for ``doc_id`` and
+    returns the decoded JSON response. HTTP error statuses propagate from
+    ``raise_for_status()``.
+    """
+    endpoint = f"{ES_URL}/{doc_index}/_update/{doc_id}"
+    partial_doc = {"so_detection": {"overrides": overrides}}
+    response = session.post(endpoint, json={"doc": partial_doc})
+    response.raise_for_status()
+    return response.json()
+
+
+def dedupe_key(override):
+    """Return the tuple that identifies an override for duplicate detection.
+
+    Only operational fields take part, per Override.Equal() in detection.go;
+    timestamps and isEnabled are left out so re-imports don't appear unique.
+    The tuple starts with the override type. Any type other than suppress,
+    threshold or modify yields None.
+    """
+    kind = override.get("type")
+    if kind == "suppress":
+        field_names = ("track", "ip")
+    elif kind == "threshold":
+        field_names = ("thresholdType", "track", "count", "seconds")
+    elif kind == "modify":
+        field_names = ("regex", "value")
+    else:
+        return None
+    return (kind, *(override.get(name) for name in field_names))
+
+
+def _validate_suricata_ip(ip):
+    """Validate the ``ip`` value of a suppress override.
+
+    Accepted forms are a ``$``-prefixed variable reference, a single address
+    or CIDR, or a bracketed comma-separated list of addresses/CIDRs.
+
+    Returns None when the value is acceptable, otherwise an error string.
+    Non-string values are reported as an error instead of raising.
+    """
+    if not ip:
+        return "ip cannot be empty"
+    if not isinstance(ip, str):
+        # The value comes straight from json.loads, so it may be a number,
+        # list or object; calling str methods on those would raise.
+        return f"ip must be a string, got {type(ip).__name__}"
+    if ip.startswith("$"):
+        # Variable references are accepted as-is; they are not resolved here.
+        return None
+    if ip.startswith("[") and ip.endswith("]"):
+        for part in ip[1:-1].split(","):
+            err = _validate_single_ip(part.strip())
+            if err:
+                return f"invalid IP in list: {err}"
+        return None
+    return _validate_single_ip(ip)
+
+
+def _validate_single_ip(ip):
+    """Check one address or CIDR string.
+
+    A value containing ``/`` is parsed as a network (host bits allowed),
+    anything else as a single address. Returns None when parsing succeeds,
+    otherwise an error string naming the offending value.
+    """
+    try:
+        if "/" not in ip:
+            ipaddress.ip_address(ip)
+        else:
+            ipaddress.ip_network(ip, strict=False)
+    except ValueError:
+        return f"invalid IP/CIDR {ip!r}"
+    else:
+        return None
+
+
+def validate_override(override, engine):
+    """Mirror Override.Validate() from securityonion-soc/model/detection.go.
+
+    ``override`` is one decoded NDJSON entry. ``engine`` is accepted but not
+    consulted by this function; only the Suricata override types are checked.
+
+    Returns None on success, an error string otherwise. Malformed input (an
+    entry that is not a JSON object, non-string or unhashable field values,
+    booleans supplied as counts) is reported as an error string rather than
+    raising, so one bad line cannot abort the whole import.
+    """
+    if not isinstance(override, dict):
+        return f"override must be a JSON object, got {type(override).__name__}"
+
+    t = override.get("type")
+    if not t:
+        return "override type is required"
+    # Type-check before the membership test: a list or dict is unhashable and
+    # would make ``in`` on a set raise TypeError.
+    if not isinstance(t, str) or t not in SURICATA_OVERRIDE_TYPES:
+        return f"invalid type {t!r}: must be one of {sorted(SURICATA_OVERRIDE_TYPES)}"
+
+    # A field counts as present only when its value is not None (JSON null).
+    has = {k: override.get(k) is not None for k in
+           ("regex", "value", "thresholdType", "track", "ip", "count", "seconds", "customFilter")}
+
+    if t == "suppress":
+        if not has["ip"] or not has["track"]:
+            return "suppress requires 'ip' and 'track'"
+        if any(has[k] for k in ("regex", "value", "thresholdType", "count", "seconds", "customFilter")):
+            return "suppress has unnecessary fields"
+        track = override["track"]
+        if not isinstance(track, str) or track not in SUPPRESS_TRACKS:
+            return f"invalid track {track!r}: must be one of {sorted(SUPPRESS_TRACKS)}"
+        ip = override["ip"]
+        if not isinstance(ip, str):
+            return f"ip must be a string, got {type(ip).__name__}"
+        return _validate_suricata_ip(ip)
+
+    if t == "threshold":
+        if not all(has[k] for k in ("thresholdType", "track", "count", "seconds")):
+            return "threshold requires 'thresholdType', 'track', 'count', 'seconds'"
+        if any(has[k] for k in ("regex", "value", "customFilter")):
+            return "threshold has unnecessary fields"
+        threshold_type = override["thresholdType"]
+        if not isinstance(threshold_type, str) or threshold_type not in THRESHOLD_TYPES:
+            return f"invalid thresholdType {threshold_type!r}: must be one of {sorted(THRESHOLD_TYPES)}"
+        track = override["track"]
+        if not isinstance(track, str) or track not in THRESHOLD_TRACKS:
+            return f"invalid track {track!r}: must be one of {sorted(THRESHOLD_TRACKS)}"
+        # bool is a subclass of int, so JSON true would otherwise pass as 1.
+        for name in ("count", "seconds"):
+            number = override[name]
+            if isinstance(number, bool) or not isinstance(number, int) or number <= 0:
+                return f"{name} must be a positive integer, got {number!r}"
+        return None
+
+    if t == "modify":
+        if not has["regex"] or not has["value"]:
+            return "modify requires 'regex' and 'value'"
+        if any(has[k] for k in ("thresholdType", "track", "count", "seconds", "customFilter")):
+            return "modify has unnecessary fields"
+        pattern = override["regex"]
+        if not isinstance(pattern, str):
+            # re.compile raises TypeError (not re.error) for non-string input.
+            return f"invalid regex: expected a string, got {type(pattern).__name__}"
+        try:
+            re.compile(pattern)
+        except re.error as e:
+            return f"invalid regex: {e}"
+        return None
+
+    # Not reachable while the three branches above cover SURICATA_OVERRIDE_TYPES.
+    return None
+
+
+def parse_overrides_file(path):
+    """Parse a file written by so-detections-backup.py: NDJSON, one override
+    per line.
+
+    Blank lines are skipped. Returns a list of ``(override, line_number)``
+    tuples where ``line_number`` is the 1-based position in the file,
+    blank lines included.
+
+    Raises json.JSONDecodeError on a malformed line and OSError when the file
+    cannot be read.
+    """
+    overrides = []
+    # Decode as UTF-8 explicitly so the result does not depend on the locale
+    # of the process running the import.
+    with open(path, "r", encoding="utf-8") as f:
+        for line_no, raw in enumerate(f, start=1):
+            text = raw.strip()
+            if text:
+                overrides.append((json.loads(text), line_no))
+    return overrides
+
+
+def describe(override):
+    """Summarise an override's operational fields as one line of text.
+
+    The layout depends on the override type (suppress, threshold or modify);
+    absent fields render as ``None``. Any other type yields None.
+    """
+    kind = override.get("type")
+    if kind == "suppress":
+        track, ip = override.get("track"), override.get("ip")
+        return f"type=suppress track={track} ip={ip}"
+    if kind == "threshold":
+        shown = ("track", "thresholdType", "count", "seconds")
+        pairs = [f"{name}={override.get(name)}" for name in shown]
+        return " ".join(["type=threshold"] + pairs)
+    if kind == "modify":
+        pattern = override.get("regex")
+        return f"type=modify regex={pattern!r}"
+    return None
+
+
+def collect_custom_vars(override):
+    """Return the set of non-builtin ``$VARIABLE`` tokens in an override.
+
+    Only string-valued fields are scanned; tokens listed in
+    BUILTIN_SURICATA_VARS are left out.
+    """
+    return {
+        token
+        for field in override.values()
+        if isinstance(field, str)
+        for token in VAR_PATTERN.findall(field)
+        if token not in BUILTIN_SURICATA_VARS
+    }
+
+
+def parse_args():
+    """Parse the command line and return the argparse namespace.
+
+    Options: --source/-s (required), --engine/-e, --dry-run/-n,
+    --no-import-note and --index/-i.
+    """
+    parser = argparse.ArgumentParser(
+        description="Import detection overrides into the so-detection index.",
+    )
+    # Declared in a table and registered in order, so --help keeps this order.
+    options = (
+        (("--source", "-s"), {
+            "required": True,
+            "help": "Source directory containing <publicId>.<ext> override files.",
+        }),
+        (("--engine", "-e"), {
+            "default": "suricata",
+            "choices": list(ENGINES.keys()),
+            "help": "Detection engine (default: suricata).",
+        }),
+        (("--dry-run", "-n"), {
+            "action": "store_true",
+            "help": "Print what would happen without writing to Elasticsearch.",
+        }),
+        (("--no-import-note",), {
+            "action": "store_true",
+            "help": "Do not prepend '[Imported YYYY-MM-DD] ' to the override note.",
+        }),
+        (("--index", "-i"), {
+            "default": DEFAULT_INDEX,
+            "help": f"Elasticsearch index to update (default: {DEFAULT_INDEX}).",
+        }),
+    )
+    for flags, settings in options:
+        parser.add_argument(*flags, **settings)
+    return parser.parse_args()
+
+
+def confirm_proceed(args):
+    """Show the stale-backup warning and decide whether the run may continue.
+
+    Dry-runs print the warning and continue without a prompt. Real runs
+    require the user to type 'yes' (case-insensitive, surrounding whitespace
+    ignored). If stdin is closed or hits end-of-file, the prompt cannot be
+    answered and the run is declined.
+
+    Returns True to continue, False to abort.
+    """
+    print(STALE_WARNING)
+    if args.dry_run:
+        print("(dry-run: no acknowledgement required)\n")
+        return True
+    try:
+        answer = input("Type 'yes' to acknowledge and continue: ")
+    except EOFError:
+        # No interactive stdin (pipe, cron, closed terminal): nobody can
+        # acknowledge the warning, so treat it as a refusal, not a crash.
+        answer = ""
+    print()
+    return answer.strip().lower() == "yes"
+
+
+def main():
+    """Import override files from the source directory into Elasticsearch.
+
+    For each ``<publicId>.<ext>`` file: parse it, find the matching detection,
+    validate every override, skip those whose dedupe key already exists, and
+    write the merged override list back (unless --dry-run).
+
+    Always leaves via sys.exit. The status is 0 when there were no matching
+    files, or when the run finished with no counted errors and no invalid
+    overrides. It is 1 for a missing source directory, a declined
+    confirmation, unusable credentials, or any counted error or invalid
+    override.
+    """
+    args = parse_args()
+
+    if not os.path.isdir(args.source):
+        print(f"ERROR: source directory not found: {args.source}", file=sys.stderr)
+        sys.exit(1)
+
+    extension = ENGINES[args.engine]
+    files = sorted(f for f in os.listdir(args.source) if f.endswith(f".{extension}"))
+    if not files:
+        print(f"No *.{extension} files found in {args.source}")
+        sys.exit(0)
+
+    if not confirm_proceed(args):
+        print("Aborted.")
+        sys.exit(1)
+
+    try:
+        session = make_session(AUTH_FILE)
+    except (OSError, RuntimeError) as e:
+        # Unreadable auth file or no 'user =' line: report it, don't traceback.
+        print(f"ERROR: could not load Elasticsearch credentials: {e}", file=sys.stderr)
+        sys.exit(1)
+    today = datetime.now().strftime("%Y-%m-%d")
+    note_prefix = "" if args.no_import_note else f"[Imported {today}] "
+
+    counts = {"added": 0, "skipped_dedupe": 0, "skipped_not_found": 0, "invalid": 0, "error": 0}
+    custom_vars = set()
+
+    mode = "DRY-RUN" if args.dry_run else "IMPORT"
+    print(f"[{mode}] engine={args.engine} source={args.source} index={args.index}\n")
+
+    for filename in files:
+        public_id = os.path.splitext(filename)[0]
+        path = os.path.join(args.source, filename)
+        print(f"{public_id}:")
+
+        try:
+            new_overrides = parse_overrides_file(path)
+        except (ValueError, OSError) as e:
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            print(f"  ERROR: could not parse {filename}: {e}")
+            counts["error"] += 1
+            continue
+
+        if not new_overrides:
+            print("  SKIP: empty file")
+            continue
+
+        try:
+            doc_id, doc_index, existing = find_detection(session, args.index, public_id, args.engine)
+        except requests.RequestException as e:
+            # RequestException also covers connection failures and timeouts,
+            # not just HTTP error statuses.
+            print(f"  ERROR: search failed: {e}")
+            counts["error"] += 1
+            continue
+
+        if doc_id is None:
+            print(f"  WARN: no detection found for publicId={public_id} engine={args.engine}; skipping")
+            counts["skipped_not_found"] += len(new_overrides)
+            continue
+
+        existing_keys = {dedupe_key(o) for o in existing}
+        merged = list(existing)
+        added_this_file = 0
+
+        for override, line_no in new_overrides:
+            err = validate_override(override, args.engine)
+            if err:
+                print(f"  INVALID (line {line_no}): {err}")
+                counts["invalid"] += 1
+                continue
+
+            custom_vars.update(collect_custom_vars(override))
+            key = dedupe_key(override)
+            if key in existing_keys:
+                print(f"  SKIP (line {line_no}): duplicate of existing override [{describe(override)}]")
+                counts["skipped_dedupe"] += 1
+                continue
+
+            if note_prefix:
+                # Copy before tagging so the parsed entry itself is not mutated.
+                override = dict(override)
+                override["note"] = note_prefix + (override.get("note") or "")
+
+            merged.append(override)
+            # Track the key so a repeat later in the same file is also skipped.
+            existing_keys.add(key)
+            added_this_file += 1
+            print(f"  ADD (line {line_no}): {describe(override)}")
+
+        if added_this_file == 0:
+            continue
+
+        if args.dry_run:
+            print(f"  DRY-RUN: would update {doc_index}/{doc_id} "
+                  f"({len(existing)} existing → {len(merged)} total)")
+            counts["added"] += added_this_file
+            continue
+
+        try:
+            update_overrides(session, doc_index, doc_id, merged)
+            print(f"  UPDATED {doc_index}/{doc_id} ({len(existing)} → {len(merged)})")
+            counts["added"] += added_this_file
+        except requests.RequestException as e:
+            print(f"  ERROR: update failed: {e}")
+            counts["error"] += 1
+
+    print()
+    print("=" * 60)
+    print(f"Summary ({mode}):")
+    print(f"  Overrides added:           {counts['added']}")
+    print(f"  Skipped (already present): {counts['skipped_dedupe']}")
+    print(f"  Skipped (no detection):    {counts['skipped_not_found']}")
+    print(f"  Invalid (failed checks):   {counts['invalid']}")
+    print(f"  Errors:                    {counts['error']}")
+
+    if custom_vars:
+        print()
+        print("WARNING: detected custom Suricata variables in imported overrides:")
+        for v in sorted(custom_vars):
+            print(f"  {v}")
+        print("If any of these are not already defined in SOC Config (Suricata variables),")
+        print("you must add them manually before the rules will function correctly.")
+
+    sys.exit(0 if counts["error"] == 0 and counts["invalid"] == 0 else 1)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,588 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+import importlib.util
+import json
+import os
+import shutil
+import sys
+import tempfile
+import unittest
+from importlib.machinery import SourceFileLoader
+from io import StringIO
+from unittest.mock import MagicMock, patch
+
+import requests
+
+# The script has no .py extension; spec_from_file_location can't auto-detect a
+# loader, so we hand it a SourceFileLoader explicitly. (load_module() is
+# deprecated and slated for removal in 3.15.)
+# The script is looked up in the same directory as this test file.
+HERE = os.path.dirname(os.path.abspath(__file__))
+SCRIPT = os.path.join(HERE, "so-detections-overrides-import")
+_loader = SourceFileLoader("so_overrides_import", SCRIPT)
+_spec = importlib.util.spec_from_loader("so_overrides_import", _loader)
+soi = importlib.util.module_from_spec(_spec)
+# Executes the script's top-level code under the module name
+# "so_overrides_import", so its `if __name__ == "__main__"` guard does not
+# call main() at import time.
+_loader.exec_module(soi)
+
+
+class TestValidateSuppress(unittest.TestCase):
+    """validate_override() applied to suppress-type overrides."""
+
+    @staticmethod
+    def _result(**fields):
+        # Every case is a suppress override validated for the suricata engine.
+        return soi.validate_override({"type": "suppress", **fields}, "suricata")
+
+    def test_valid(self):
+        self.assertIsNone(self._result(track="by_src", ip="1.2.3.4"))
+
+    def test_valid_var(self):
+        self.assertIsNone(self._result(track="by_either", ip="$HOME_NET"))
+
+    def test_valid_cidr(self):
+        self.assertIsNone(self._result(track="by_dst", ip="10.0.0.0/8"))
+
+    def test_valid_bracket_list(self):
+        self.assertIsNone(self._result(track="by_src", ip="[1.2.3.4,10.0.0.0/8]"))
+
+    def test_missing_ip(self):
+        self.assertIn("requires", self._result(track="by_src"))
+
+    def test_missing_track(self):
+        self.assertIn("requires", self._result(ip="1.2.3.4"))
+
+    def test_invalid_track(self):
+        message = self._result(track="by_both", ip="1.2.3.4")
+        self.assertIn("invalid track", message)
+
+    def test_invalid_ip(self):
+        message = self._result(track="by_src", ip="not-an-ip")
+        self.assertIn("invalid IP", message)
+
+    def test_unnecessary_field(self):
+        message = self._result(track="by_src", ip="1.2.3.4", count=5)
+        self.assertIn("unnecessary fields", message)
+
+
+class TestValidateThreshold(unittest.TestCase):
+    """validate_override() applied to threshold-type overrides."""
+
+    # A threshold override that validates cleanly; each test varies part of it.
+    GOOD = {
+        "type": "threshold", "track": "by_src",
+        "thresholdType": "limit", "count": 10, "seconds": 60,
+    }
+
+    def _result(self, drop=(), **changes):
+        """Validate GOOD with ``changes`` applied and the ``drop`` keys removed."""
+        candidate = {**self.GOOD, **changes}
+        for key in drop:
+            del candidate[key]
+        return soi.validate_override(candidate, "suricata")
+
+    def test_valid(self):
+        self.assertIsNone(self._result())
+
+    def test_valid_by_both(self):
+        outcome = self._result(track="by_both", thresholdType="both", count=1, seconds=1)
+        self.assertIsNone(outcome)
+
+    def test_track_by_either_invalid(self):
+        self.assertIn("invalid track", self._result(track="by_either"))
+
+    def test_invalid_threshold_type(self):
+        self.assertIn("invalid thresholdType", self._result(thresholdType="bogus"))
+
+    def test_zero_count(self):
+        self.assertIn("count", self._result(count=0))
+
+    def test_negative_seconds(self):
+        self.assertIn("seconds", self._result(seconds=-1))
+
+    def test_missing_field(self):
+        # seconds is left out entirely
+        self.assertIn("requires", self._result(drop=("seconds",)))
+
+    def test_unnecessary_field(self):
+        self.assertIn("unnecessary fields", self._result(regex="foo"))
+
+
+class TestValidateModify(unittest.TestCase):
+    """validate_override() applied to modify-type overrides."""
+
+    @staticmethod
+    def _result(fields):
+        return soi.validate_override({"type": "modify", **fields}, "suricata")
+
+    def test_valid(self):
+        outcome = self._result({"regex": r"content:\"foo\"", "value": "content:bar"})
+        self.assertIsNone(outcome)
+
+    def test_invalid_regex(self):
+        message = self._result({"regex": "(unbalanced", "value": "x"})
+        self.assertIn("invalid regex", message)
+
+    def test_missing_value(self):
+        message = self._result({"regex": "x"})
+        self.assertIn("requires", message)
+
+    def test_unnecessary_field(self):
+        message = self._result({"regex": "x", "value": "y", "track": "by_src"})
+        self.assertIn("unnecessary fields", message)
+
+
+class TestValidateMisc(unittest.TestCase):
+    """Type-level checks in validate_override() shared by all override kinds."""
+
+    def test_unknown_type(self):
+        # "suppresss" is a deliberate misspelling of a valid type.
+        misspelled = {"type": "suppresss", "track": "by_src", "ip": "1.2.3.4"}
+        message = soi.validate_override(misspelled, "suricata")
+        self.assertIn("invalid type", message)
+
+    def test_missing_type(self):
+        untyped = {"track": "by_src"}
+        message = soi.validate_override(untyped, "suricata")
+        self.assertIn("type is required", message)
+
+
+class TestValidateIP(unittest.TestCase):
+    """Direct checks of _validate_suricata_ip()."""
+
+    def _assert_accepted(self, candidate):
+        self.assertIsNone(soi._validate_suricata_ip(candidate))
+
+    def _assert_rejected(self, candidate, fragment):
+        self.assertIn(fragment, soi._validate_suricata_ip(candidate))
+
+    def test_plain_ipv4(self):
+        self._assert_accepted("1.2.3.4")
+
+    def test_plain_ipv6(self):
+        self._assert_accepted("::1")
+
+    def test_cidr(self):
+        self._assert_accepted("10.0.0.0/8")
+
+    def test_var(self):
+        self._assert_accepted("$CONCOURSEWORKERS")
+
+    def test_bracket_list(self):
+        self._assert_accepted("[1.2.3.4, 10.0.0.0/8]")
+
+    def test_bracket_list_bad_member(self):
+        self._assert_rejected("[1.2.3.4,nope]", "invalid IP in list")
+
+    def test_empty(self):
+        self._assert_rejected("", "empty")
+
+    def test_invalid(self):
+        self._assert_rejected("999.999.999.999", "invalid")
+
+
+class TestDedupeKey(unittest.TestCase):
+    """dedupe_key() must compare operational fields only, per override type."""
+
+    def test_suppress(self):
+        a = {"type": "suppress", "track": "by_src", "ip": "1.2.3.4", "count": 99}
+        b = {"type": "suppress", "track": "by_src", "ip": "1.2.3.4"}
+        # count is irrelevant for suppress dedupe
+        self.assertEqual(soi.dedupe_key(a), soi.dedupe_key(b))
+
+    def test_suppress_differs_on_ip(self):
+        a = {"type": "suppress", "track": "by_src", "ip": "1.2.3.4"}
+        b = {"type": "suppress", "track": "by_src", "ip": "5.6.7.8"}
+        self.assertNotEqual(soi.dedupe_key(a), soi.dedupe_key(b))
+
+    def test_threshold(self):
+        a = {"type": "threshold", "track": "by_src", "thresholdType": "limit",
+             "count": 10, "seconds": 60, "ip": "ignored"}
+        b = {"type": "threshold", "track": "by_src", "thresholdType": "limit",
+             "count": 10, "seconds": 60}
+        self.assertEqual(soi.dedupe_key(a), soi.dedupe_key(b))
+
+    def test_threshold_differs_on_count(self):
+        a = {"type": "threshold", "track": "by_src", "thresholdType": "limit",
+             "count": 10, "seconds": 60}
+        b = {"type": "threshold", "track": "by_src", "thresholdType": "limit",
+             "count": 20, "seconds": 60}
+        self.assertNotEqual(soi.dedupe_key(a), soi.dedupe_key(b))
+
+    def test_modify(self):
+        # Two identical dicts would compare equal under any key function, so
+        # vary the non-operational fields to prove they are ignored.
+        a = {"type": "modify", "regex": "x", "value": "y",
+             "note": "[Imported 2026-01-01] first", "isEnabled": True}
+        b = {"type": "modify", "regex": "x", "value": "y",
+             "note": "second", "isEnabled": False}
+        self.assertEqual(soi.dedupe_key(a), soi.dedupe_key(b))
+
+    def test_modify_differs_on_value(self):
+        a = {"type": "modify", "regex": "x", "value": "y"}
+        b = {"type": "modify", "regex": "x", "value": "z"}
+        self.assertNotEqual(soi.dedupe_key(a), soi.dedupe_key(b))
+
+
+class TestDescribe(unittest.TestCase):
+    """describe() output mentions the operational fields of each type."""
+
+    def _assert_mentions(self, override, *fragments):
+        summary = soi.describe(override)
+        for fragment in fragments:
+            self.assertIn(fragment, summary)
+
+    def test_suppress(self):
+        self._assert_mentions(
+            {"type": "suppress", "track": "by_src", "ip": "1.2.3.4"},
+            "suppress", "by_src", "1.2.3.4",
+        )
+
+    def test_threshold_includes_count(self):
+        self._assert_mentions(
+            {"type": "threshold", "track": "by_src",
+             "thresholdType": "limit", "count": 10, "seconds": 60},
+            "count=10", "seconds=60",
+        )
+
+    def test_modify(self):
+        self._assert_mentions({"type": "modify", "regex": "foo"}, "modify", "foo")
+
+
+class TestParseOverridesFile(unittest.TestCase):
+    """parse_overrides_file() on single-line, multi-line, blank and bad input."""
+
+    SUPPRESS_SRC = '{"type":"suppress","track":"by_src","ip":"1.2.3.4"}'
+    SUPPRESS_DST = '{"type":"suppress","track":"by_dst","ip":"5.6.7.8"}'
+
+    def _write(self, content):
+        """Create a temporary .txt file holding ``content``; removed on cleanup."""
+        with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as handle:
+            handle.write(content)
+        self.addCleanup(os.unlink, handle.name)
+        return handle.name
+
+    def test_single_line(self):
+        parsed = soi.parse_overrides_file(self._write(self.SUPPRESS_SRC))
+        self.assertEqual(len(parsed), 1)
+        entry, line_no = parsed[0]
+        self.assertEqual(entry["type"], "suppress")
+        self.assertEqual(line_no, 1)
+
+    def test_ndjson(self):
+        content = self.SUPPRESS_SRC + "\n" + self.SUPPRESS_DST + "\n"
+        parsed = soi.parse_overrides_file(self._write(content))
+        self.assertEqual(len(parsed), 2)
+        self.assertEqual(parsed[1][1], 2)
+
+    def test_empty(self):
+        parsed = soi.parse_overrides_file(self._write(""))
+        self.assertEqual(parsed, [])
+
+    def test_blank_lines_skipped(self):
+        content = "\n" + self.SUPPRESS_SRC + "\n\n"
+        parsed = soi.parse_overrides_file(self._write(content))
+        self.assertEqual(len(parsed), 1)
+        # line number reflects original position
+        self.assertEqual(parsed[0][1], 2)
+
+    def test_invalid_raises(self):
+        bad_file = self._write("not json")
+        with self.assertRaises(json.JSONDecodeError):
+            soi.parse_overrides_file(bad_file)
+
+
+class TestCollectCustomVars(unittest.TestCase):
+    """collect_custom_vars() reports only variables outside the builtin set."""
+
+    def _assert_vars(self, override, expected):
+        self.assertEqual(soi.collect_custom_vars(override), expected)
+
+    def test_finds_custom(self):
+        self._assert_vars({"ip": "$CONCOURSEWORKERS"}, {"$CONCOURSEWORKERS"})
+
+    def test_filters_builtins(self):
+        self._assert_vars({"ip": "$HOME_NET"}, set())
+
+    def test_mixed(self):
+        self._assert_vars({"ip": "[$HOME_NET,$MYNET]"}, {"$MYNET"})
+
+    def test_non_string_fields_ignored(self):
+        self._assert_vars({"count": 10, "isEnabled": True}, set())
+
+
+class TestMakeSession(unittest.TestCase):
+    """make_session() reading credentials from a curl-style config file."""
+
+    def _write(self, content):
+        """Create a temporary auth file holding ``content``; removed on cleanup."""
+        with tempfile.NamedTemporaryFile("w", delete=False) as handle:
+            handle.write(content)
+        self.addCleanup(os.unlink, handle.name)
+        return handle.name
+
+    def test_valid_auth_file(self):
+        auth_path = self._write('user = "admin:secret"\n')
+        auth = soi.make_session(auth_path)
+        self.assertEqual(auth.auth.username, "admin")
+        self.assertEqual(auth.auth.password, "secret")
+        self.assertFalse(auth.verify)
+
+    def test_missing_user_line(self):
+        auth_path = self._write("# no user line here\n")
+        with self.assertRaises(RuntimeError):
+            soi.make_session(auth_path)
+
+
+class TestFindDetection(unittest.TestCase):
+    """find_detection() against canned search responses."""
+
+    @staticmethod
+    def _hit(doc_id, index, so_detection):
+        """Build one search hit in the shape find_detection() reads."""
+        return {"_id": doc_id, "_index": index, "_source": {"so_detection": so_detection}}
+
+    def _session_with_response(self, payload):
+        """Return a mock session whose GET yields ``payload`` as its JSON body."""
+        response = MagicMock()
+        response.raise_for_status.return_value = None
+        response.json.return_value = payload
+        session = MagicMock()
+        session.get.return_value = response
+        return session
+
+    def test_found(self):
+        hit = self._hit("abc", "so-detection", {"overrides": [{"type": "suppress"}]})
+        session = self._session_with_response({"hits": {"hits": [hit]}})
+        found = soi.find_detection(session, "so-detection", "2049201", "suricata")
+        doc_id, idx, existing = found
+        self.assertEqual(doc_id, "abc")
+        self.assertEqual(idx, "so-detection")
+        self.assertEqual(len(existing), 1)
+
+    def test_not_found(self):
+        session = self._session_with_response({"hits": {"hits": []}})
+        found = soi.find_detection(session, "so-detection", "x", "suricata")
+        doc_id, idx, existing = found
+        self.assertIsNone(doc_id)
+        self.assertIsNone(idx)
+        self.assertIsNone(existing)
+
+    def test_no_overrides_field(self):
+        hit = self._hit("abc", "so-detection", {})
+        session = self._session_with_response({"hits": {"hits": [hit]}})
+        existing = soi.find_detection(session, "so-detection", "x", "suricata")[2]
+        self.assertEqual(existing, [])
+
+    def test_multiple_hits_warns(self):
+        hits = [self._hit(name, "i", {"overrides": []}) for name in ("a", "b")]
+        session = self._session_with_response({"hits": {"hits": hits}})
+        with patch("sys.stdout", new=StringIO()) as out:
+            doc_id = soi.find_detection(session, "i", "x", "suricata")[0]
+        self.assertEqual(doc_id, "a")
+        self.assertIn("WARN", out.getvalue())
+
+
+class TestUpdateOverrides(unittest.TestCase):
+    """update_overrides() posts the merged list to the _update endpoint."""
+
+    def test_posts_to_update_endpoint(self):
+        reply = MagicMock()
+        reply.json.return_value = {"result": "updated"}
+        reply.raise_for_status.return_value = None
+        session = MagicMock()
+        session.post.return_value = reply
+        new_list = [{"type": "suppress"}]
+
+        outcome = soi.update_overrides(session, "so-detection", "abc", new_list)
+
+        self.assertEqual(outcome, {"result": "updated"})
+        positional, keyword = session.post.call_args
+        self.assertIn("/_update/abc", positional[0])
+        sent = keyword["json"]
+        self.assertEqual(sent["doc"]["so_detection"]["overrides"], [{"type": "suppress"}])
+
+
+class TestConfirmProceed(unittest.TestCase):
+    """confirm_proceed() prompt handling for dry-run and real runs."""
+
+    def _confirm_with(self, typed):
+        """Run a real-run confirmation with ``typed`` as the user's answer."""
+        args = MagicMock(dry_run=False)
+        with patch("sys.stdout", new=StringIO()), \
+                patch("builtins.input", return_value=typed):
+            return soi.confirm_proceed(args)
+
+    def test_dry_run_skips_prompt(self):
+        args = MagicMock(dry_run=True)
+        with patch("sys.stdout", new=StringIO()):
+            proceed = soi.confirm_proceed(args)
+        self.assertTrue(proceed)
+
+    def test_yes_input(self):
+        self.assertTrue(self._confirm_with("yes"))
+
+    def test_yes_input_case_insensitive(self):
+        self.assertTrue(self._confirm_with("YES"))
+
+    def test_no_input_aborts(self):
+        self.assertFalse(self._confirm_with("no"))
+
+    def test_empty_input_aborts(self):
+        self.assertFalse(self._confirm_with(""))
+
+
+class TestParseArgs(unittest.TestCase):
+    """parse_args() defaults and explicit option handling."""
+
+    @staticmethod
+    def _parse(*cli):
+        """Parse ``cli`` as if it followed the program name on the command line."""
+        with patch.object(sys, "argv", ["cmd", *cli]):
+            return soi.parse_args()
+
+    def test_defaults(self):
+        parsed = self._parse("--source", "/some/path")
+        self.assertEqual(parsed.source, "/some/path")
+        self.assertEqual(parsed.engine, "suricata")
+        self.assertFalse(parsed.dry_run)
+        self.assertFalse(parsed.no_import_note)
+        self.assertEqual(parsed.index, soi.DEFAULT_INDEX)
+
+    def test_all_options(self):
+        parsed = self._parse("-s", "/x", "-e", "suricata", "-n",
+                             "--no-import-note", "-i", "alt-index")
+        self.assertEqual(parsed.source, "/x")
+        self.assertTrue(parsed.dry_run)
+        self.assertTrue(parsed.no_import_note)
+        self.assertEqual(parsed.index, "alt-index")
+
+
+class TestMain(unittest.TestCase):
+    # End-to-end tests of soi.main(): each test populates a temporary source
+    # directory, runs main() with argv/stdout/stderr/input patched, and checks
+    # the exit code and printed summary. Elasticsearch access is always mocked.
+
+    def setUp(self):
+        # Per-test scratch directory used as --source; removed on cleanup.
+        self.tmpdir = tempfile.mkdtemp()
+        self.addCleanup(shutil.rmtree, self.tmpdir, ignore_errors=True)
+        # Stub make_session so tests don't need /opt/so/conf/elasticsearch/curl.config.
+        p = patch.object(soi, "make_session", return_value=MagicMock())
+        p.start()
+        self.addCleanup(p.stop)
+
+    def _write_file(self, public_id, overrides, ext="txt"):
+        """Write an NDJSON override file. Entries may be dicts or raw strings (for malformed input)."""
+        # The file is named <public_id>.<ext> inside the scratch directory.
+        path = os.path.join(self.tmpdir, f"{public_id}.{ext}")
+        with open(path, "w") as f:
+            for o in overrides:
+                # Strings are written verbatim so tests can inject invalid JSON.
+                f.write(o if isinstance(o, str) else json.dumps(o))
+                f.write("\n")
+        return path
+
+    def _run_main(self, *extra_argv, input_response="yes"):
+        """Run main() with stdout/stderr captured and input mocked. Returns (stdout, stderr, exit_code)."""
+        argv = ["cmd", "--source", self.tmpdir, *extra_argv]
+        out, err = StringIO(), StringIO()
+        with patch.object(sys, "argv", argv), \
+                patch("sys.stdout", new=out), \
+                patch("sys.stderr", new=err), \
+                patch("builtins.input", return_value=input_response):
+            # main() is expected to always terminate through SystemExit.
+            with self.assertRaises(SystemExit) as cm:
+                soi.main()
+        return out.getvalue(), err.getvalue(), cm.exception.code
+
+    def test_source_dir_missing(self):
+        # Does not use _run_main because --source must point at a missing path.
+        argv = ["cmd", "--source", "/no/such/path/here"]
+        err = StringIO()
+        with patch.object(sys, "argv", argv), patch("sys.stderr", new=err):
+            with self.assertRaises(SystemExit) as cm:
+                soi.main()
+        self.assertEqual(cm.exception.code, 1)
+        self.assertIn("source directory not found", err.getvalue())
+
+    def test_no_files_found(self):
+        # Empty source directory: exit 0 with an informational message.
+        out, _, code = self._run_main()
+        self.assertEqual(code, 0)
+        self.assertIn("No *.txt files found", out)
+
+    def test_user_aborts(self):
+        # Answering "no" at the confirmation prompt exits 1.
+        self._write_file("1001", [{"type": "suppress", "track": "by_src", "ip": "1.2.3.4"}])
+        out, _, code = self._run_main(input_response="no")
+        self.assertEqual(code, 1)
+        self.assertIn("Aborted", out)
+
+    def test_parse_error_increments_error(self):
+        # Malformed JSON line — parse_overrides_file raises JSONDecodeError.
+        self._write_file("1002", ["not json"])
+        out, _, code = self._run_main("--dry-run")
+        self.assertEqual(code, 1)  # invalid+error → non-zero
+        self.assertIn("could not parse", out)
+        self.assertIn("Errors:                    1", out)
+
+    def test_empty_file_skipped(self):
+        # Blank lines only — parse_overrides_file returns []; main reports "empty file" and continues.
+        path = os.path.join(self.tmpdir, "1003.txt")
+        with open(path, "w") as f:
+            f.write("\n\n")
+        out, _, code = self._run_main("--dry-run")
+        self.assertEqual(code, 0)
+        self.assertIn("empty file", out)
+
+    @patch.object(soi, "find_detection")
+    def test_search_http_error(self, mock_find):
+        # An HTTP error from the detection lookup is reported and exits 1.
+        mock_find.side_effect = requests.HTTPError("boom")
+        self._write_file("1004", [{"type": "suppress", "track": "by_src", "ip": "1.2.3.4"}])
+        out, _, code = self._run_main("--dry-run")
+        self.assertEqual(code, 1)
+        self.assertIn("search failed", out)
+
+    @patch.object(soi, "find_detection")
+    def test_no_detection_found(self, mock_find):
+        # A (None, None, None) lookup result is counted as a skip, not an error.
+        mock_find.return_value = (None, None, None)
+        self._write_file("1005", [{"type": "suppress", "track": "by_src", "ip": "1.2.3.4"}])
+        out, _, code = self._run_main("--dry-run")
+        self.assertEqual(code, 0)
+        self.assertIn("no detection found", out)
+        self.assertIn("Skipped (no detection):    1", out)
+
+    @patch.object(soi, "find_detection")
+    def test_all_duplicates_no_update(self, mock_find):
+        # The only override in the file already exists on the detection.
+        existing = [{"type": "suppress", "track": "by_src", "ip": "1.2.3.4"}]
+        mock_find.return_value = ("doc1", "so-detection", existing)
+        self._write_file("1006", [{"type": "suppress", "track": "by_src", "ip": "1.2.3.4"}])
+        out, _, code = self._run_main("--dry-run")
+        self.assertEqual(code, 0)
+        self.assertIn("SKIP", out)
+        self.assertNotIn("DRY-RUN: would update", out)  # added_this_file == 0 branch
+
+    # Decorators apply bottom-up, so find_detection is the first mock argument.
+    @patch.object(soi, "update_overrides")
+    @patch.object(soi, "find_detection")
+    def test_happy_path_full(self, mock_find, mock_update):
+        # Exercises: ADD, dedupe SKIP, INVALID, note prefix, UPDATE, custom-vars warning, exit=1 (invalid present)
+        existing = [{"type": "suppress", "track": "by_src", "ip": "9.9.9.9"}]
+        mock_find.return_value = ("doc1", "so-detection", existing)
+        mock_update.return_value = {"result": "updated"}
+        self._write_file("1007", [
+            {"type": "suppress", "track": "by_src", "ip": "1.2.3.4"},                # ADD
+            {"type": "suppress", "track": "by_src", "ip": "9.9.9.9"},                # SKIP (dupe of existing)
+            {"type": "suppress", "track": "bogus",  "ip": "1.2.3.4"},                # INVALID
+            {"type": "suppress", "track": "by_src", "ip": "$CONCOURSEWORKERS"},      # ADD + custom var
+        ])
+        out, _, code = self._run_main()
+        self.assertEqual(code, 1)  # one invalid -> non-zero
+
+        mock_update.assert_called_once()
+        # The merged override list is the fourth positional argument to update_overrides.
+        merged = mock_update.call_args[0][3]
+        self.assertEqual(len(merged), 3)  # 1 existing + 2 new
+        new_notes = [o.get("note", "") for o in merged if o.get("ip") in ("1.2.3.4", "$CONCOURSEWORKERS")]
+        self.assertTrue(all(n.startswith("[Imported ") for n in new_notes))
+
+        self.assertIn("ADD", out)
+        self.assertIn("SKIP", out)
+        self.assertIn("INVALID", out)
+        self.assertIn("UPDATED", out)
+        self.assertIn("$CONCOURSEWORKERS", out)
+
+    @patch.object(soi, "update_overrides")
+    @patch.object(soi, "find_detection")
+    def test_no_import_note_preserves_note(self, mock_find, mock_update):
+        # With --no-import-note the override's own note must reach update_overrides unchanged.
+        mock_find.return_value = ("doc1", "so-detection", [])
+        mock_update.return_value = {"result": "updated"}
+        self._write_file("1008", [
+            {"type": "suppress", "track": "by_src", "ip": "1.2.3.4", "note": "original"},
+        ])
+        _, _, code = self._run_main("--no-import-note")
+        self.assertEqual(code, 0)
+        merged = mock_update.call_args[0][3]
+        self.assertEqual(merged[0]["note"], "original")  # no prefix applied
+
+    @patch.object(soi, "find_detection")
+    def test_dry_run_skips_update(self, mock_find):
+        # --dry-run reports what would change but never calls update_overrides.
+        mock_find.return_value = ("doc1", "so-detection", [])
+        self._write_file("1009", [{"type": "suppress", "track": "by_src", "ip": "1.2.3.4"}])
+        with patch.object(soi, "update_overrides") as mock_update:
+            out, _, code = self._run_main("--dry-run")
+        self.assertEqual(code, 0)
+        mock_update.assert_not_called()
+        self.assertIn("DRY-RUN: would update", out)
+
+    @patch.object(soi, "update_overrides")
+    @patch.object(soi, "find_detection")
+    def test_update_http_error(self, mock_find, mock_update):
+        # An HTTP error while writing the update is reported and exits 1.
+        mock_find.return_value = ("doc1", "so-detection", [])
+        mock_update.side_effect = requests.HTTPError("nope")
+        self._write_file("1010", [{"type": "suppress", "track": "by_src", "ip": "1.2.3.4"}])
+        out, _, code = self._run_main()
+        self.assertEqual(code, 1)
+        self.assertIn("update failed", out)
+
+
+if __name__ == "__main__":
+    unittest.main()
@@ -0,0 +1,232 @@
+#!/opt/saltstack/salt/bin/python3
+
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+"""
+so-push-drainer
+===============
+
+Scheduled drainer for the active-push feature. Runs on the manager every
+drain_interval seconds (default 15) via a salt schedule in salt/schedule.sls.
+
+For each intent file under /opt/so/state/push_pending/*.json whose last_touch
+is older than debounce_seconds, this script:
+  * concatenates the actions lists from every ready intent
+  * dedupes by (state or __highstate__, tgt, tgt_type)
+  * dispatches a single `salt-run state.orchestrate orch.push_batch --async`
+    with the deduped actions list passed as pillar kwargs
+  * deletes the contributed intent files on successful dispatch
+
+Reactor sls files (push_suricata, push_strelka, push_pillar) write intents
+but never dispatch directly -- see plan
+/home/mreeves/.claude/plans/goofy-marinating-hummingbird.md for the full design.
+"""
+
+import fcntl
+import glob
+import json
+import logging
+import logging.handlers
+import os
+import subprocess
+import sys
+import time
+
+import salt.client
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+LOG_FILE = '/opt/so/log/salt/so-push-drainer.log'
+
+HIGHSTATE_SENTINEL = '__highstate__'
+
+
+def _make_logger():
+    """Return the 'so-push-drainer' logger, attaching a rotating file handler once.
+
+    The handler writes to LOG_FILE (5 MiB per file, 3 backups). If the logger
+    already has handlers it is returned unchanged.
+    """
+    drainer_log = logging.getLogger('so-push-drainer')
+    drainer_log.setLevel(logging.INFO)
+    if drainer_log.handlers:
+        return drainer_log
+
+    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
+    line_format = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')
+    file_handler = logging.handlers.RotatingFileHandler(
+        LOG_FILE,
+        maxBytes=5 * 1024 * 1024,
+        backupCount=3,
+    )
+    file_handler.setFormatter(line_format)
+    drainer_log.addHandler(file_handler)
+    return drainer_log
+
+
+def _load_push_cfg():
+    """Fetch the ``global:push`` pillar subtree through a salt Caller client.
+
+    Returns the pillar value when it is a dict, otherwise an empty dict.
+    """
+    pillar_value = salt.client.Caller().cmd('pillar.get', 'global:push', {})
+    if isinstance(pillar_value, dict):
+        return pillar_value
+    return {}
+
+
+def _read_intent(path, log):
+    """Load the JSON intent file at ``path``.
+
+    Returns the decoded JSON value, or None when the file cannot be read or
+    parsed. Failures are logged on ``log`` and never raised.
+    """
+    decoded = None
+    try:
+        with open(path, 'r') as intent_file:
+            decoded = json.load(intent_file)
+    except (IOError, ValueError) as err:
+        # Expected failure modes: unreadable file or invalid JSON.
+        log.warning('cannot read intent %s: %s', path, err)
+    except Exception:
+        log.exception('unexpected error reading %s', path)
+    return decoded
+
+
+def _freeze(value):
+    """Return a hashable stand-in for a JSON-decoded value.
+
+    Lists/tuples become tuples and dicts become sorted tuples of items,
+    recursively; every other value is returned unchanged.
+    """
+    if isinstance(value, (list, tuple)):
+        return tuple(_freeze(item) for item in value)
+    if isinstance(value, dict):
+        return tuple(sorted((str(k), _freeze(v)) for k, v in value.items()))
+    return value
+
+
+def _dedupe_actions(actions):
+    """Drop duplicate actions, keeping the first occurrence of each.
+
+    Two actions are duplicates when they share the same
+    (state-or-highstate, tgt, tgt_type) triple; tgt_type defaults to
+    'compound'. Entries that are not dicts, or that lack a state/highstate
+    or a tgt, are discarded. Returns a new list in original order.
+    """
+    seen = set()
+    deduped = []
+    for action in actions:
+        if not isinstance(action, dict):
+            continue
+        state_key = HIGHSTATE_SENTINEL if action.get('highstate') else action.get('state')
+        tgt = action.get('tgt')
+        tgt_type = action.get('tgt_type', 'compound')
+        if not state_key or not tgt:
+            continue
+        # Intents are decoded from JSON, so tgt may be a list (unhashable).
+        # Freeze every key part so set membership cannot raise TypeError.
+        key = (_freeze(state_key), _freeze(tgt), _freeze(tgt_type))
+        if key in seen:
+            continue
+        seen.add(key)
+        deduped.append(action)
+    return deduped
+
+
+def _dispatch(actions, log):
+    """Start one asynchronous ``orch.push_batch`` orchestration for ``actions``.
+
+    The actions are passed to salt-run as a JSON ``pillar=`` argument.
+    Returns True when salt-run exits 0 within 60 seconds; any failure is
+    logged on ``log`` and reported as False.
+    """
+    pillar_json = json.dumps({'actions': actions})
+    runner = ['salt-run', 'state.orchestrate', 'orch.push_batch']
+    full_cmd = runner + ['pillar=' + pillar_json, '--async']
+    # Log only the action count; the pillar payload itself can be large.
+    summary = '{} pillar=<{} actions>'.format(' '.join(runner), len(actions))
+    log.info('dispatching: %s', summary)
+    try:
+        completed = subprocess.run(
+            full_cmd,
+            check=True,
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+    except subprocess.CalledProcessError as err:
+        log.error('dispatch failed (rc=%s): stdout=%s stderr=%s',
+                  err.returncode, err.stdout, err.stderr)
+    except subprocess.TimeoutExpired:
+        log.error('dispatch timed out after 60s')
+    except Exception:
+        log.exception('dispatch raised')
+    else:
+        log.info('dispatch accepted: %s', (completed.stdout or '').strip())
+        return True
+    return False
+
+
+def _numeric_field(intent, field, default, path, log):
+    """Return intent[field] when it is an int/float, otherwise ``default``.
+
+    A present-but-non-numeric value is logged as a warning so that a
+    malformed intent file cannot crash the timestamp arithmetic in main().
+    """
+    value = intent.get(field, default)
+    if isinstance(value, (int, float)):
+        return value
+    log.warning('intent %s has non-numeric %s=%r; using %s',
+                path, field, value, default)
+    return default
+
+
+def main():
+    """Run one drain pass over PENDING_DIR.
+
+    Under an exclusive flock on LOCK_FILE: reads every ``*.json`` intent,
+    deletes unreadable ones, skips intents still inside the debounce
+    window, merges and dedupes the actions of the ready intents, dispatches
+    them in one orchestration and removes the drained files.
+
+    Returns 0 on success or when there is nothing to do, and 1 when the
+    push pillar cannot be read or the dispatch fails (intent files are then
+    left in place for the next pass).
+    """
+    log = _make_logger()
+
+    if not os.path.isdir(PENDING_DIR):
+        # Nothing to do; reactors create the dir on first use.
+        return 0
+
+    try:
+        push = _load_push_cfg()
+    except Exception:
+        log.exception('failed to read global:push pillar; aborting drain pass')
+        return 1
+
+    if not push.get('enabled', True):
+        log.debug('push disabled; exiting')
+        return 0
+
+    # A malformed pillar value must not abort every drain pass.
+    try:
+        debounce_seconds = int(push.get('debounce_seconds', 30))
+    except (TypeError, ValueError):
+        log.warning('invalid debounce_seconds %r in global:push; using 30',
+                    push.get('debounce_seconds'))
+        debounce_seconds = 30
+
+    # Re-create the directory in case it vanished since the isdir() check.
+    os.makedirs(PENDING_DIR, exist_ok=True)
+    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    try:
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        # '*.json' cannot match the '.lock' file, so no extra filter is needed.
+        intent_files = sorted(glob.glob(os.path.join(PENDING_DIR, '*.json')))
+        if not intent_files:
+            return 0
+
+        now = time.time()
+        ready = []
+        skipped = 0
+        broken = []
+        for path in intent_files:
+            intent = _read_intent(path, log)
+            if not isinstance(intent, dict):
+                broken.append(path)
+                continue
+            # Missing or malformed last_touch counts as 0, i.e. ready now.
+            last_touch = _numeric_field(intent, 'last_touch', 0, path, log)
+            if now - last_touch < debounce_seconds:
+                skipped += 1
+                continue
+            ready.append((path, intent))
+
+        # Unreadable intents would fail again on every pass; drop them.
+        for path in broken:
+            try:
+                os.unlink(path)
+            except OSError:
+                pass
+
+        if not ready:
+            if skipped:
+                log.debug('no ready intents (%d still in debounce window)', skipped)
+            return 0
+
+        combined_actions = []
+        oldest_first_touch = now
+        all_paths = []
+        for path, intent in ready:
+            actions = intent.get('actions')
+            if isinstance(actions, list):
+                combined_actions.extend(actions)
+            first = _numeric_field(intent, 'first_touch', now, path, log)
+            if first < oldest_first_touch:
+                oldest_first_touch = first
+            paths = intent.get('paths')
+            if isinstance(paths, list):
+                all_paths.extend(paths)
+
+        deduped = _dedupe_actions(combined_actions)
+        if not deduped:
+            log.warning('%d intent(s) had no usable actions; clearing', len(ready))
+            for path, _ in ready:
+                try:
+                    os.unlink(path)
+                except OSError:
+                    pass
+            return 0
+
+        debounce_duration = now - oldest_first_touch
+        log.info(
+            'draining %d intent(s): %d action(s) after dedupe (raw=%d), '
+            'debounce_duration=%.1fs, paths=%s',
+            len(ready), len(deduped), len(combined_actions),
+            debounce_duration, all_paths[:20],
+        )
+
+        if not _dispatch(deduped, log):
+            log.warning('dispatch failed; leaving intent files in place for retry')
+            return 1
+
+        for path, _ in ready:
+            try:
+                os.unlink(path)
+            except OSError:
+                log.exception('failed to remove drained intent %s', path)
+
+        return 0
+    finally:
+        # Always release the lock and close the descriptor, even on error.
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
@@ -16,6 +16,7 @@ POSTVERSION=$INSTALLEDVERSION
 INSTALLEDSALTVERSION=$(salt --versions-report | grep Salt: | awk '{print $2}')
 BATCHSIZE=5
 SOUP_LOG=/root/soup.log
+SOUP_DEBUG_LOG=/root/soup-debug.log
 WHATWOULDYOUSAYYAHDOHERE=soup
 whiptail_title='Security Onion UPdater'
 NOTIFYCUSTOMELASTICCONFIG=false
@@ -34,6 +35,7 @@ if [[ -f /etc/salt/cloud.profiles.d/socloud.conf ]]; then
 fi
 # used to display messages to the user at the end of soup
 declare -a FINAL_MESSAGE_QUEUE=()
+SOUP_ERR_CONTEXT=


 check_err() {
@@ -114,11 +116,52 @@ check_err() {
      echo "$err_msg"
    fi

+    if [[ -n $SOUP_ERR_CONTEXT ]]; then
+      echo ""
+      printf '%s\n' "$SOUP_ERR_CONTEXT"
+    fi
+
+    echo "SOUP XTRACE debug log (if enabled) at $SOUP_DEBUG_LOG. Re-run soup with SOUP_DEBUG=1 to create $SOUP_DEBUG_LOG"
+
    exit $exit_code
  fi

 }

+# Collect bash error context before passing off to check_err()
+#
+# Records the failing command, its source location and the chain of calling
+# functions in the global SOUP_ERR_CONTEXT (a newline-separated string), then
+# returns the failing command's exit status unchanged.
+# NOTE(review): presumably installed as the ERR trap handler; the trap itself
+# is not visible in this hunk -- confirm.
+on_err() {
+  # Capture $? first; any later command would overwrite it.
+  local exit_code=$?
+  # Ignore failures in blocks that explicitly disabled errexit with `set +e`.
+  [[ $- == *e* ]] || return $exit_code
+  # turn off xtrace to prevent added noise in debug log
+  set +x 2>/dev/null || true
+
+  # Use first error context, multiple errors can happen with command substitutions or nested functions. We just need context from the initial error.
+  [[ -n $SOUP_ERR_CONTEXT ]] && return $exit_code
+
+  # Index 0 of FUNCNAME is on_err itself, so index 1 is the function that was
+  # running when the error occurred; BASH_LINENO[0] is the line on_err was
+  # invoked from.
+  local cmd=$BASH_COMMAND
+  local line=${BASH_LINENO[0]}
+  local function=${FUNCNAME[1]:-main}
+  local source=${BASH_SOURCE[1]##*/}
+  local -a err_lines=(
+    "ERROR on: ${cmd}"
+    "  source: ${source}:${line} in ${function}()"
+  )
+  local i caller_line caller_src caller_func
+
+  # Walk the remaining call stack (excluding the outermost frame), adding one
+  # "called by" line per frame that has a positive line number.
+  for ((i=2; i<${#FUNCNAME[@]}-1; i++)); do
+    caller_line=${BASH_LINENO[$((i-1))]}
+    [[ -n $caller_line && $caller_line -gt 0 ]] || continue
+    caller_src=${BASH_SOURCE[$i]##*/}
+    caller_func=${FUNCNAME[$i]:-main}
+    err_lines+=("  called by: ${caller_src}:${caller_line} in ${caller_func}()")
+  done
+
+  SOUP_ERR_CONTEXT=$(printf '%s\n' "${err_lines[@]}")
+
+  return $exit_code
+}
+
 airgap_mounted() {
  # Let's see if the ISO is already mounted.
  if [[ -f /tmp/soagupdate/SecurityOnion/VERSION ]]; then
@@ -188,13 +231,6 @@ airgap_update_dockers() {
  fi
 }

-backup_old_states_pillars() {
-
-	tar czf /nsm/backup/$(echo $INSTALLEDVERSION)_$(date +%Y%m%d-%H%M%S)_soup_default_states_pillars.tar.gz /opt/so/saltstack/default/
-	tar czf /nsm/backup/$(echo $INSTALLEDVERSION)_$(date +%Y%m%d-%H%M%S)_soup_local_states_pillars.tar.gz /opt/so/saltstack/local/
-
-}
-
 update_registry() {
  docker stop so-dockerregistry
  docker rm so-dockerregistry
@@ -350,10 +386,11 @@ highstate() {
 masterlock() {
  echo "Locking Salt Master"
  mv -v $TOPFILE $BACKUPTOPFILE
-  echo "base:" > $TOPFILE
-  echo "  $MINIONID:" >> $TOPFILE
-  echo "    - ca" >> $TOPFILE
-  echo "    - elasticsearch" >> $TOPFILE
+  # Render the real top file only for the host running soup; every other
+  # minion gets an empty top (no states) while the master is upgrading.
+  echo "{% if grains['id'] == '$MINIONID' %}" > $TOPFILE
+  cat $BACKUPTOPFILE >> $TOPFILE
+  echo "{% endif %}" >> $TOPFILE
 }

 masterunlock() {
@@ -370,8 +407,9 @@ preupgrade_changes() {
    # This function is to add any new pillar items if needed.
    echo "Checking to see if changes are needed."

-    [[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0   
+    [[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0
    [[ "$INSTALLEDVERSION" == "3.0.0" ]] && up_to_3.1.0
+    [[ "$INSTALLEDVERSION" == "3.1.0" ]] && up_to_3.2.0
    true
 }

@@ -381,6 +419,7 @@ postupgrade_changes() {

    [[ "$POSTVERSION" =~ ^2\.4\.21[0-9]+$ ]] && post_to_3.0.0
    [[ "$POSTVERSION" == "3.0.0" ]] && post_to_3.1.0
+    [[ "$POSTVERSION" == "3.1.0" ]] && post_to_3.2.0
    true
 }

@@ -533,6 +572,23 @@ elasticfleet_set_agent_logging_level_warn() {
    done <<< "$policies_to_update"
 }

+# Replace a logstash pipeline name at end-of-line in the local logstash
+# pillar and every minion pillar file.
+#   $1 - pipeline name to look for (matched literally, anchored to end of line)
+#   $2 - pipeline name to write in its place
+# Files that do not contain the old name are left untouched. Names must not
+# contain '#', which is used as the sed delimiter.
+update_logstash_pipeline_name() {
+    local original_pipeline_name="$1"
+    local new_pipeline_name="$2"
+    local pillar_file escaped_original escaped_new
+
+    # Escape regex metacharacters so e.g. the '.' in "fleet.conf" matches only
+    # a literal dot, and escape '\' and '&' in the sed replacement text.
+    escaped_original=$(printf '%s' "$original_pipeline_name" | sed -e 's/[][\.*^$]/\\&/g')
+    escaped_new=$(printf '%s' "$new_pipeline_name" | sed -e 's/[\&]/\\&/g')
+
+    echo "Checking for conflicting logstash defined_pipelines pillar value."
+    local LOGSTASH_FILE=/opt/so/saltstack/local/pillar/logstash/soc_logstash.sls
+    local MINIONDIR=/opt/so/saltstack/local/pillar/minions
+    for pillar_file in "$LOGSTASH_FILE" "$MINIONDIR"/*.sls; do
+        # An unmatched glob stays as a literal pattern; skip non-files.
+        [[ -f "$pillar_file" ]] || continue
+        if grep -q -- "${escaped_original}\$" "$pillar_file"; then
+            echo "Found conflicting defined_pipeline pillar value in $pillar_file. Updating to use the new logstash pipeline name."
+            sed -i "s#${escaped_original}\$#${escaped_new}#g" "$pillar_file"
+            chown socore:socore "$pillar_file"
+        fi
+    done
+}
+
 check_transform_health_and_reauthorize() {
    . /usr/sbin/so-elastic-fleet-common

@@ -556,14 +612,23 @@ check_transform_health_and_reauthorize() {
    # - unhealthy (any non-green health status)
    # - metadata has run_as_kibana_system: false (this fix is specific to transforms started prior to Kibana 9.3.3)
    # - are not orphaned (integration is not somehow missing/corrupt/uninstalled)
+    local tmp_transforms tmp_stats tmp_installed
+    tmp_transforms=$(mktemp)
+    tmp_stats=$(mktemp)
+    tmp_installed=$(mktemp)
+
+    echo "$transforms_doc" > "$tmp_transforms"
+    echo "$stats_doc"      > "$tmp_stats"
+    echo "$installed_doc"  > "$tmp_installed"
+
    local unhealthy_transforms
    unhealthy_transforms=$(jq -c -n \
-        --argjson t "$transforms_doc" \
-        --argjson s "$stats_doc" \
-        --argjson i "$installed_doc" '
-        ($i.items | map({key: .name, value: .version}) | from_entries) as $pkg_ver
-        | ($s.transforms | map({key: .id, value: .health.status}) | from_entries) as $health
-        | [ $t.transforms[]
+        --slurpfile t "$tmp_transforms" \
+        --slurpfile s "$tmp_stats" \
+        --slurpfile i "$tmp_installed" '
+        ($i[0].items | map({key: .name, value: .version}) | from_entries) as $pkg_ver
+        | ($s[0].transforms | map({key: .id, value: .health.status}) | from_entries) as $health
+        | [ $t[0].transforms[]
            | select(._meta.run_as_kibana_system == false)
            | select(($health[.id] // "unknown") != "green")
            | {id, pkg: ._meta.package.name, ver: ($pkg_ver[._meta.package.name])}
@@ -604,6 +669,8 @@ check_transform_health_and_reauthorize() {
        (( total_failures += $(jq 'map(select(.success != true)) | length' <<< "$resp" 2>/dev/null) ))
    done <<< "$unhealthy_transforms"

+    rm -f "$tmp_transforms" "$tmp_stats" "$tmp_installed"
+
    if [[ "$total_failures" -gt 0 ]]; then
        echo "Some transform(s) failed to reauthorize."
    fi
@@ -644,6 +711,31 @@ ensure_postgres_secret() {
  chown socore:socore "$secrets_file"
 }

+# Move the strelka scanner pillar key ScanLNK to ScanLnk in the local strelka
+# pillar and in every minion pillar file that defines it.
+rename_strelka_scan_lnk() {
+  local scanners_prefix=strelka.backend.config.backend.scanners
+  local legacy_key="${scanners_prefix}.ScanLNK"
+  local renamed_key="${scanners_prefix}.ScanLnk"
+  local strelka_pillar=/opt/so/saltstack/local/pillar/strelka/soc_strelka.sls
+  local minion_pillar_dir=/opt/so/saltstack/local/pillar/minions
+  local value_dump
+
+  echo "Renaming strelka pillar ScanLNK to ScanLnk."
+  # Scratch file that carries the old key's value over to the new key.
+  value_dump=$(mktemp)
+
+  for pillar_file in "$strelka_pillar" "$minion_pillar_dir"/*.sls; do
+    if [[ ! -f "$pillar_file" ]]; then
+      continue
+    fi
+    # A failing "get" means the file has no ScanLNK key; nothing to rename.
+    if ! so-yaml.py get "$pillar_file" "$legacy_key" > "$value_dump" 2>/dev/null; then
+      continue
+    fi
+    echo "Found 'ScanLNK' key in $pillar_file. Renaming to 'ScanLnk'."
+    so-yaml.py add "$pillar_file" "$renamed_key" "file:$value_dump"
+    so-yaml.py remove "$pillar_file" "$legacy_key"
+  done
+
+  rm -f "$value_dump"
+}
+
+# Point pillar references to the 0013 lumberjack fleet pipeline at its
+# ".jinja" file name.
+fix_logstash_0013_lumberjack_pipeline_name() {
+    local legacy_name="so/0013_input_lumberjack_fleet.conf"
+
+    update_logstash_pipeline_name "$legacy_name" "${legacy_name}.jinja"
+}
+
 up_to_3.1.0() {
  ensure_postgres_local_pillar
  ensure_postgres_secret
@@ -651,7 +743,8 @@ up_to_3.1.0() {
  elasticsearch_backup_index_templates
  # Clear existing component template state file.
  rm -f /opt/so/state/esfleet_component_templates.json
-
+  rename_strelka_scan_lnk
+  fix_logstash_0013_lumberjack_pipeline_name

  INSTALLEDVERSION=3.1.0
 }
@@ -688,6 +781,97 @@ post_to_3.1.0() {

 ### 3.1.0 End ###

+### 3.2.0 Scripts ###
+
+# Run init-db.sh inside the so-postgres container to bootstrap so_soc.
+# Failures never abort the upgrade: they queue a warning (with the manual
+# command) in FINAL_MESSAGE_QUEUE and the function still returns 0.
+bootstrap_so_soc_database() {
+  # init-db.sh is mounted into so-postgres at /docker-entrypoint-initdb.d/init-db.sh
+  # and only runs by itself on a fresh data directory. A host coming from 3.1.0
+  # already has /nsm/postgres populated, so the so_soc bootstrap block added in
+  # 3.2 would never fire there. Run the (idempotent) script explicitly instead.
+  #
+  # `docker exec` defaults to root because the postgres image has no USER
+  # directive, and the container env intentionally omits POSTGRES_USER (the
+  # upstream entrypoint only defaults it transiently during first-init). Supply
+  # both so psql inside init-db.sh resolves the connect user correctly.
+  local manual_cmd="docker exec -u postgres -e POSTGRES_USER=postgres so-postgres bash /docker-entrypoint-initdb.d/init-db.sh"
+
+  echo "Bootstrapping so_soc database via init-db.sh."
+  if ! /usr/sbin/so-postgres-wait; then
+    FINAL_MESSAGE_QUEUE+=("WARNING: so-postgres was not ready during the 3.2.0 upgrade; the so_soc database may not have been bootstrapped. Re-run manually: $manual_cmd")
+  elif ! $manual_cmd; then
+    FINAL_MESSAGE_QUEUE+=("WARNING: init-db.sh failed inside so-postgres during the 3.2.0 upgrade; the so_soc database may not have been bootstrapped. Re-run manually: $manual_cmd")
+  else
+    echo "so_soc bootstrap complete."
+  fi
+  return 0
+}
+
+# Existing grids should keep ILM unless an admin explicitly opts in to DLM.
+# Writes elasticsearch.data_retention_method=ILM to the local elasticsearch
+# pillar only when the key is not already present.
+pin_elasticsearch_data_retention_method() {
+  local pillar_sls=/opt/so/saltstack/local/pillar/elasticsearch/soc_elasticsearch.sls
+  local retention_key=elasticsearch.data_retention_method
+
+  # Make sure the pillar file exists before querying it.
+  mkdir -p "${pillar_sls%/*}"
+  if [[ ! -f "$pillar_sls" ]]; then
+    touch "$pillar_sls"
+  fi
+
+  if ! so-yaml.py get -r "$pillar_sls" "$retention_key" >/dev/null 2>&1; then
+    echo "Pinning existing grid to ILM data retention."
+    so-yaml.py add "$pillar_sls" "$retention_key" ILM
+    chown socore:socore "$pillar_sls"
+  else
+    echo "elasticsearch.data_retention_method already set; leaving as-is."
+  fi
+}
+
+# Adds the auto_expand_replicas setting to the .kibana_streams index template.
+#
+# In Kibana 9.3.3 the auto_expand_replicas setting was not added to the .kibana_streams index template, causing single-node deployments to be stuck in a yellow state (unable to assign the replica). Here we update the template in place, using the so_kibana system user (it is a system-managed index template), to include the auto_expand_replicas setting.
+#
+# Reference: https://github.com/elastic/kibana/issues/263048
+kibana_backport_streams_index_template() {
+    # Best-effort: every failure path prints a message and returns 0 so the
+    # upgrade continues. kibana_user_pass is local so the credential does not
+    # linger in the global scope after the function returns.
+    local current_template updated_template kibana_user_pass
+
+    # errexit is switched off only around the template lookup and must be
+    # switched back on along every path out of it, including the early return.
+    set +e
+    if ! current_template=$(so-elasticsearch-query "_index_template/.kibana_streams" --retry 3 --retry-delay 5 --fail); then
+        set -e
+        echo "Index template .kibana_streams does not exist, skipping backport."
+        return 0
+    fi
+    set -e
+
+    # Add the replica setting and drop the timestamp fields before re-submitting the template.
+    updated_template=$(jq '.index_templates[0].index_template | .template.settings += {"index.auto_expand_replicas": "0-1"} | del(.created_date_millis, .modified_date_millis)' <<< "$current_template")
+
+    if ! kibana_user_pass=$(/usr/sbin/so-yaml.py get -r /opt/so/saltstack/local/pillar/elasticsearch/auth.sls elasticsearch.auth.users.so_kibana_user.pass); then
+        echo "Unable to retrieve so_kibana_user password, skipping .kibana_streams index template backport."
+        return 0
+    fi
+
+    if ! so-elasticsearch-query "_index_template/.kibana_streams" -XPUT -d "$updated_template" -u "so_kibana:$kibana_user_pass" --retry 3 --retry-delay 5 --fail; then
+        echo "Unable to automatically update .kibana_streams index template"
+        return 0
+    fi
+
+}
+
+# Pre-upgrade pillar changes for 3.2.0; preupgrade_changes calls this when
+# INSTALLEDVERSION is 3.1.0.
+up_to_3.2.0() {
+  # Also called from up_to_3.1.0; repeated here for the 3.1.0 -> 3.2.0 path.
+  fix_logstash_0013_lumberjack_pipeline_name
+
+  pin_elasticsearch_data_retention_method
+
+  # Record the new version so later version checks see 3.2.0.
+  INSTALLEDVERSION=3.2.0
+}
+
+# Post-upgrade steps for 3.2.0; postupgrade_changes calls this when
+# POSTVERSION is 3.1.0.
+post_to_3.2.0() {
+  bootstrap_so_soc_database
+
+  # Including agent regen script here since it was missed in post_to_3.1.0
+  echo "Regenerating Elastic Agent Installers"
+  /sbin/so-elastic-agent-gen-installers
+
+  kibana_backport_streams_index_template
+
+  # Record the new version so later version checks see 3.2.0.
+  POSTVERSION=3.2.0
+}
+
+### 3.2.0 End ###
+

 repo_sync() {
  echo "Sync the local repo."
@@ -939,6 +1123,9 @@ verify_es_version_compatibility() {
    local is_active_intermediate_upgrade=1
    # supported upgrade paths for SO-ES versions
    declare -A es_upgrade_map=(
+        ["8.18.4"]="8.18.6 8.18.8 9.0.8"
+	    ["8.18.6"]="8.18.8 9.0.8"
+	    ["8.18.8"]="9.0.8"
        ["9.0.8"]="9.3.3"
    )

@@ -962,6 +1149,171 @@ verify_es_version_compatibility() {
        exit 160
    fi

+    compatible_es_versions="$target_es_version"
+    for current_version in "${!es_upgrade_map[@]}"; do
+        # shellcheck disable=SC2076
+        if [[ " ${es_upgrade_map[$current_version]} " =~ " $target_es_version " ]]; then
+            compatible_es_versions+=" $current_version"
+        fi
+    done
+
+    # Check if the given ES version can directly upgrade to the target ES version. Used to assist with catching lagging nodes during the upgrade process
+    es_version_can_upgrade_to_target() {
+        # Succeeds (0) when the version passed as $1 is one of the
+        # space-separated entries in $compatible_es_versions; an empty
+        # argument or an unlisted version fails (1).
+        local candidate_version="$1"
+
+        [[ -n "$candidate_version" ]] || return 1
+
+        # shellcheck disable=SC2076
+        if [[ " $compatible_es_versions " =~ " $candidate_version " ]]; then
+            return 0
+        fi
+        return 1
+    }
+
+    # Gather Elasticsearch cluster version info and verify that each node in the cluster is running a version compatible with the target ES version.
+    verify_searchnodes_es_target_compatibility() {
+        # Checks that every Elasticsearch node reported by the cluster can
+        # upgrade directly to $target_es_version.
+        #   1. Lists accepted minions whose id ends in "searchnode" via salt-key
+        #      (3 attempts, 30 seconds apart).
+        #   2. Polls _nodes/_all/version (up to $retries times, $delay seconds
+        #      apart) until every reported node passes
+        #      es_version_can_upgrade_to_target and every expected node is present.
+        # Sets the global SEARCHNODE_ES_VERSIONS to the most recent query output.
+        # Returns 0 once all nodes are compatible, 1 if discovery fails or the
+        # retries are exhausted.
+        local retries=20
+        local retry_count=0
+        local delay=180
+        local expected_es_nodes searchnode_minions attempt
+        local searchnode_discovery_success=false
+        SEARCHNODE_ES_VERSIONS=""
+
+        for attempt in {1..3}; do
+            # pipefail is enabled only inside the command substitution so a
+            # salt-key failure is not masked by jq's exit status.
+            if searchnode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("searchnode"))'); then
+                searchnode_discovery_success=true
+                break
+            fi
+
+            echo "Failed to retrieve grid searchnodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3."
+            sleep 30
+        done
+
+        if [[ "$searchnode_discovery_success" != "true" ]]; then
+            echo "Failed to retrieve grid searchnodes via salt-key."
+            return 1
+        fi
+
+        # Always add node running soup to expected es nodes
+        expected_es_nodes="${MINIONID%_*}"
+        while IFS= read -r searchnode_minion; do
+            [[ -z "$searchnode_minion" ]] && continue
+            expected_es_nodes+=$'\n'"${searchnode_minion%_searchnode}"
+        done <<< "$searchnode_minions"
+
+        while [[ $retry_count -lt $retries ]]; do
+            SEARCHNODE_ES_VERSIONS=$(so-elasticsearch-query _nodes/_all/version --retry 5 --retry-delay 10 --fail 2>&1)
+            local exit_status=$?
+
+            if [[ $exit_status -ne 0 ]]; then
+                echo "Failed to retrieve Elasticsearch versions from searchnodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+                # Plain assignment instead of ((retry_count++)): the latter
+                # returns status 1 when the counter is 0, which would trip
+                # errexit / the ERR trap outside a conditional context.
+                retry_count=$((retry_count + 1))
+                sleep $delay
+                continue
+            fi
+
+            local all_searchnodes_compatible=true
+            # Pass 1: every node that reported a version must be compatible.
+            while IFS=$'\t' read -r node current_version; do
+                [[ -z "$node" ]] && continue
+                if ! es_version_can_upgrade_to_target "$current_version"; then
+                    echo "Searchnode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version."
+                    all_searchnodes_compatible=false
+                fi
+            done < <(echo "$SEARCHNODE_ES_VERSIONS" | jq -r '.nodes | to_entries[] | [.value.name, .value.version] | @tsv')
+
+            # Pass 2: every expected node must actually appear in the response.
+            while IFS= read -r expected_es_node; do
+                [[ -z "$expected_es_node" ]] && continue
+                if ! echo "$SEARCHNODE_ES_VERSIONS" | jq -e --arg node "$expected_es_node" '.nodes | to_entries | any(.value.name == $node)' > /dev/null; then
+                    echo "Searchnode $expected_es_node did not report an Elasticsearch version. It may be offline or still upgrading."
+                    all_searchnodes_compatible=false
+                fi
+            done <<< "$expected_es_nodes"
+
+            if [[ "$all_searchnodes_compatible" == true ]]; then
+                echo "All Searchnodes are upgradable to Elasticsearch $target_es_version."
+                return 0
+            fi
+
+            echo "One or more Searchnodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+            retry_count=$((retry_count + 1))
+            sleep $delay
+        done
+
+        return 1
+    }
+
+    # Gather heavynode version info and verify that each node is running a version compatible with the target ES version.
+    verify_heavynodes_es_target_compatibility() {
+        # Checks that every heavynode runs an Elasticsearch version that can
+        # upgrade directly to $target_es_version.
+        #   1. Lists accepted minions whose id ends in "heavynode" via salt-key
+        #      (3 attempts, 30 seconds apart); returns 0 early when there are none.
+        #   2. Asks each so-heavynode minion for its local ES version through
+        #      salt cmd.run (up to $retries times, $delay seconds apart) until
+        #      every reply passes es_version_can_upgrade_to_target and every
+        #      discovered heavynode has replied.
+        # Sets the global HEAVYNODE_ES_VERSIONS to the most recent salt output.
+        # Returns 0 when compatible (or no heavynodes exist), 1 if discovery
+        # fails or the retries are exhausted.
+        local heavynode_minions attempt
+        local retries=20
+        local retry_count=0
+        local delay=180
+        local heavynode_discovery_success=false
+        HEAVYNODE_ES_VERSIONS=""
+
+        for attempt in {1..3}; do
+            # pipefail is enabled only inside the command substitution so a
+            # salt-key failure is not masked by jq's exit status.
+            if heavynode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("heavynode"))'); then
+                heavynode_discovery_success=true
+                break
+            fi
+
+            echo "Failed to retrieve grid heavynodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3."
+            sleep 30
+        done
+
+        if [[ "$heavynode_discovery_success" != "true" ]]; then
+            echo "Failed to retrieve grid heavynodes via salt-key."
+            return 1
+        fi
+
+        if [[ -z "$heavynode_minions" ]]; then
+            echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check."
+            return 0
+        fi
+
+        while [[ $retry_count -lt $retries ]]; do
+            HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'set -o pipefail; so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -er ".version.number"' shell=/bin/bash --out=json 2> /dev/null)
+            local exit_status=$?
+
+            if [[ $exit_status -ne 0 ]]; then
+                echo "Failed to retrieve Elasticsearch version from one or more heavynodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+                # Plain assignment instead of ((retry_count++)): the latter
+                # returns status 1 when the counter is 0, which would trip
+                # errexit / the ERR trap outside a conditional context.
+                retry_count=$((retry_count + 1))
+                sleep $delay
+                continue
+            fi
+
+            local all_heavynodes_compatible=true
+            # Pass 1: every minion that replied must report a compatible version.
+            while IFS=$'\t' read -r node current_version; do
+                [[ -z "$node" ]] && continue
+                if ! es_version_can_upgrade_to_target "$current_version"; then
+                    echo "Heavynode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version."
+                    all_heavynodes_compatible=false
+                fi
+            done < <(echo "$HEAVYNODE_ES_VERSIONS" | jq -r 'to_entries[] | [.key, .value] | @tsv')
+
+            # Pass 2: every discovered heavynode must be present in the replies.
+            # jq -s slurps the JSON documents and `add` merges them into one
+            # object before the key lookup.
+            while IFS= read -r heavynode_minion; do
+                [[ -z "$heavynode_minion" ]] && continue
+                if ! echo "$HEAVYNODE_ES_VERSIONS" | jq -se --arg minion "$heavynode_minion" 'add | has($minion)' > /dev/null; then
+                    echo "Heavynode $heavynode_minion did not report an Elasticsearch version. It may be offline or still upgrading."
+                    all_heavynodes_compatible=false
+                fi
+            done <<< "$heavynode_minions"
+
+            if [[ "$all_heavynodes_compatible" == true ]]; then
+                echo -e "\nAll heavynodes can upgrade to Elasticsearch $target_es_version."
+                return 0
+            fi
+
+            echo "One or more heavynodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+            retry_count=$((retry_count + 1))
+            sleep $delay
+        done
+
+        return 1
+    }
+
+    if [[ ! -f "$es_verification_script" ]]; then
+        create_intermediate_upgrade_verification_script "$es_verification_script"
+    fi
+
    for statefile in "${es_required_version_statefile_base}"-*; do
        [[ -f $statefile ]] || continue

@@ -980,10 +1332,6 @@ verify_es_version_compatibility() {
            continue
        fi

-        if [[ ! -f "$es_verification_script" ]]; then
-            create_intermediate_upgrade_verification_script "$es_verification_script"
-        fi
-
        echo -e "\n##############################################################################################################################\n"
        echo "A previously required intermediate Elasticsearch upgrade was detected. Verifying that all Searchnodes/Heavynodes have successfully upgraded Elasticsearch to $es_required_version_statefile_value before proceeding with soup to avoid potential data loss! This command can take up to an hour to complete."
        if ! timeout --foreground 4000 bash "$es_verification_script" "$es_required_version_statefile_value" "$statefile"; then
@@ -1005,6 +1353,26 @@ verify_es_version_compatibility() {

    # shellcheck disable=SC2076 # Do not want a regex here eg usage " 8.18.8 9.0.8 " =~ " 9.0.8 "
    if [[ " ${es_upgrade_map[$es_version]} " =~ " $target_es_version " || "$es_version" == "$target_es_version" ]]; then
+        if ! verify_searchnodes_es_target_compatibility || ! verify_heavynodes_es_target_compatibility; then
+            echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
+
+            echo "One or more Searchnode(s)/Heavynode(s) cannot upgrade directly to Elasticsearch $target_es_version. This can happen with soups that include Elasticsearch upgrades being run in quick succession. Typically, this will resolve itself as the grid synchronizes. Please allow time for all Searchnodes/Heavynodes to have upgraded Elasticsearch to a compatible version with $target_es_version before running soup again to avoid potential data loss!"
+
+            if [[ -n "$HEAVYNODE_ES_VERSIONS" ]]; then
+                echo "Current heavynode Elasticsearch versions:"
+                echo "$HEAVYNODE_ES_VERSIONS" | jq '.'
+            fi
+
+            if [[ -n "$SEARCHNODE_ES_VERSIONS" ]]; then
+                echo "Current searchnode Elasticsearch versions:"
+                echo "$SEARCHNODE_ES_VERSIONS" | jq '.nodes | to_entries | map({(.value.name): .value.version}) | sort | add'
+            fi
+
+            echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
+
+            exit 161
+        fi
+
        # supported upgrade
        return 0
    else
@@ -1290,7 +1658,7 @@ EOF

 # Keeping this block in case we need to do a hotfix that requires salt update
 apply_hotfix() {
-   echo "No actions required. ($INSTALLEDVERSION/$HOTFIXVERSION)"
+    echo "No actions required. ($INSTALLEDVERSION/$HOTFIXVERSION)"
 }

 failed_soup_restore_items() {
@@ -1362,13 +1730,13 @@ main() {
  echo "Verifying we have the latest soup script."
  verify_latest_update_script

-  echo "Verifying Elasticsearch version compatibility before upgrading."
-  verify_es_version_compatibility
-
  echo "Let's see if we need to update Security Onion."
  upgrade_check
  upgrade_space

+  echo "Verifying Elasticsearch version compatibility across the grid before upgrading."
+  verify_es_version_compatibility
+
  echo "Checking for Salt Master and Minion updates."
  upgrade_check_salt
  set -e
@@ -1388,7 +1756,8 @@ main() {
    echo "Applying $HOTFIXVERSION hotfix"
    # since we don't run the backup.config_backup state on import we wont snapshot previous version states and pillars
    if [[ ! "$MINION_ROLE" == "import" ]]; then
-      backup_old_states_pillars
+        echo "Running so-config-backup script."
+        /sbin/so-config-backup
    fi
    copy_new_files
    create_local_directories "/opt/so/saltstack/default"
@@ -1444,8 +1813,8 @@ main() {
    # since we don't run the backup.config_backup state on import we wont snapshot previous version states and pillars
    if [[ ! "$MINION_ROLE" == "import" ]]; then
      echo ""
-      echo "Creating snapshots of default and local Salt states and pillars and saving to /nsm/backup/"
-      backup_old_states_pillars
+      echo "Running so-config-backup script."
+      /sbin/so-config-backup
    fi

    echo ""
@@ -1657,4 +2026,20 @@ EOF
  read -r input
 fi

-main "$@" | tee -a $SOUP_LOG
+set -o errtrace
+trap on_err ERR
+
+if [[ $SOUP_DEBUG == 1 ]]; then
+  if [ -f $SOUP_DEBUG_LOG ]; then
+    current_time=$(date +%Y%m%d.%H%M%S)
+    mv $SOUP_DEBUG_LOG $SOUP_DEBUG_LOG.$INSTALLEDVERSION.$current_time
+  fi
+  exec {SOUP_XTRACE_FD}>>"$SOUP_DEBUG_LOG"
+  export SOUP_XTRACE_FD
+  BASH_XTRACEFD=$SOUP_XTRACE_FD
+  PS4='+ [${BASH_SOURCE##*/}:${LINENO} ${FUNCNAME[0]:-main}()] | '
+  set -x
+  export SOUP_DEBUG
+fi
+
+main "$@" 2>&1 | tee -a $SOUP_LOG
@@ -34,6 +34,7 @@ make-rule-dir-nginx:
 so-nginx:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-nginx:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-nginx
    - networks:
      - sobridge:
@@ -225,6 +225,7 @@ http {
 			limit_req             zone=auth_throttle burst={{ NGINXMERGED.config.throttle_login_burst }} nodelay;
 			limit_req_status      429;
 			proxy_pass            http://{{ GLOBALS.manager }}:4433;
+			proxy_set_header      Connection "Close";
 			proxy_read_timeout    90;
 			proxy_connect_timeout 90;
 			proxy_set_header      Host $host;
@@ -237,6 +238,7 @@ http {
 		location ~ ^/auth/.*?(whoami|logout|settings|errors|webauthn.js) {
 			rewrite               /auth/(.*) /$1 break;
 			proxy_pass            http://{{ GLOBALS.manager }}:4433;
+			proxy_set_header      Connection "Close";
 			proxy_read_timeout    90;
 			proxy_connect_timeout 90;
 			proxy_set_header      Host $host;
@@ -3,7 +3,14 @@
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.

-{% set hypervisor = pillar.minion_id %}
+{% set hypervisor = pillar.get('minion_id', '') %}
+
+{% if not hypervisor|regex_match('^([A-Za-z0-9._-]{1,253})$') %}
+{%   do salt.log.error('delete_hypervisor_orch: refusing unsafe minion_id=' ~ hypervisor) %}
+delete_hypervisor_invalid_minion_id:
+  test.fail_without_changes:
+    - name: delete_hypervisor_invalid_minion_id
+{% else %}

 ensure_hypervisor_mine_deleted:
  salt.function:
@@ -20,3 +27,5 @@ update_salt_cloud_profile:
    - sls:
      - salt.cloud.config
    - concurrent: True
+
+{% endif %}
@@ -0,0 +1,37 @@
+# Orchestration driven by the `actions` pillar list. For each action this
+# renders either a highstate run, or a pillar refresh followed by a single
+# state apply, against the action's target. Batch size and batch wait default
+# to GLOBALMERGED.push.batch / GLOBALMERGED.push.batch_wait unless the action
+# overrides them; tgt_type defaults to compound.
+{% from 'global/map.jinja' import GLOBALMERGED %}
+{% set actions = salt['pillar.get']('actions', []) %}
+{% set BATCH = GLOBALMERGED.push.batch %}
+{% set BATCH_WAIT = GLOBALMERGED.push.batch_wait %}
+
+{% for action in actions %}
+{%   if action.get('highstate') %}
+# Highstate action: state IDs are suffixed with the loop index to stay unique.
+apply_highstate_{{ loop.index }}:
+  salt.state:
+    - tgt: '{{ action.tgt }}'
+    - tgt_type: {{ action.get('tgt_type', 'compound') }}
+    - highstate: True
+    - batch: {{ action.get('batch', BATCH) }}
+    - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }}
+    - kwarg:
+        queue: 2
+{%   else %}
+# Single-state action: refresh pillar on the target first, then apply
+# action.state; the apply requires the refresh of the same loop index.
+refresh_pillar_{{ loop.index }}:
+  salt.function:
+    - name: saltutil.refresh_pillar
+    - tgt: '{{ action.tgt }}'
+    - tgt_type: {{ action.get('tgt_type', 'compound') }}
+
+apply_{{ action.state | replace('.', '_') }}_{{ loop.index }}:
+  salt.state:
+    - tgt: '{{ action.tgt }}'
+    - tgt_type: {{ action.get('tgt_type', 'compound') }}
+    - sls:
+      - {{ action.state }}
+    - batch: {{ action.get('batch', BATCH) }}
+    - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }}
+    - kwarg:
+        queue: 2
+    - require:
+      - salt: refresh_pillar_{{ loop.index }}
+{%   endif %}
+{% endfor %}
@@ -12,7 +12,14 @@
 {% if 'vrt' in salt['pillar.get']('features', []) %}

 {%   do salt.log.debug('vm_pillar_clean_orch: Running') %}
-{%   set vm_name = pillar.get('vm_name') %}
+{%   set vm_name = pillar.get('vm_name', '') %}
+
+{%   if not vm_name|regex_match('^([A-Za-z0-9._-]{1,253})$') %}
+{%     do salt.log.error('vm_pillar_clean_orch: refusing unsafe vm_name=' ~ vm_name) %}
+vm_pillar_clean_invalid_name:
+  test.fail_without_changes:
+    - name: vm_pillar_clean_invalid_name
+{%   else %}

 delete_adv_{{ vm_name }}_pillar:
  module.run:
@@ -24,6 +31,8 @@ delete_{{ vm_name }}_pillar:
    - file.remove:
      - path: /opt/so/saltstack/local/pillar/minions/{{ vm_name }}.sls

+{%   endif %}
+
 {% else %}

 {%   do salt.log.error(
@@ -46,10 +46,10 @@ postgresinitdir:
    - require:
      - file: postgresconfdir

-postgresinitusers:
+postgresinitdb:
  file.managed:
-    - name: /opt/so/conf/postgres/init/init-users.sh
-    - source: salt://postgres/files/init-users.sh
+    - name: /opt/so/conf/postgres/init/init-db.sh
+    - source: salt://postgres/files/init-db.sh
    - user: 939
    - group: 939
    - mode: 755
@@ -31,7 +31,7 @@ so-postgres:
      - POSTGRES_DB=securityonion
      # Passwords are delivered via mounted 0600 secret files, not plaintext env vars.
      # The upstream postgres image resolves POSTGRES_PASSWORD_FILE; entrypoint.sh and
-      # init-users.sh resolve SO_POSTGRES_PASS_FILE the same way.
+      # init-db.sh resolve SO_POSTGRES_PASS_FILE the same way.
      - POSTGRES_PASSWORD_FILE=/run/secrets/postgres_password
      - SO_POSTGRES_USER={{ SO_POSTGRES_USER }}
      - SO_POSTGRES_PASS_FILE=/run/secrets/so_postgres_pass
@@ -46,7 +46,7 @@ so-postgres:
      - /opt/so/conf/postgres/postgresql.conf:/conf/postgresql.conf:ro
      - /opt/so/conf/postgres/pg_hba.conf:/conf/pg_hba.conf:ro
      - /opt/so/conf/postgres/secrets:/run/secrets:ro
-      - /opt/so/conf/postgres/init/init-users.sh:/docker-entrypoint-initdb.d/init-users.sh:ro
+      - /opt/so/conf/postgres/init/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh:ro
      - /etc/pki/postgres.crt:/conf/postgres.crt:ro
      - /etc/pki/postgres.key:/conf/postgres.key:ro
      - /etc/pki/tls/certs/intca.crt:/conf/ca.crt:ro
@@ -70,7 +70,7 @@ so-postgres:
    - watch:
      - file: postgresconf
      - file: postgreshba
-      - file: postgresinitusers
+      - file: postgresinitdb
      - file: postgres_super_secret
      - file: postgres_app_secret
      - x509: postgres_crt
@@ -78,7 +78,7 @@ so-postgres:
    - require:
      - file: postgresconf
      - file: postgreshba
-      - file: postgresinitusers
+      - file: postgresinitdb
      - file: postgres_super_secret
      - file: postgres_app_secret
      - x509: postgres_crt
@@ -17,6 +17,7 @@ psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-E
        END IF;
    END
    \$\$;
+    GRANT ALL ON SCHEMA public TO "$SO_POSTGRES_USER";
    GRANT ALL PRIVILEGES ON DATABASE "$POSTGRES_DB" TO "$SO_POSTGRES_USER";
    -- Lock the SOC database down at the connect layer; PUBLIC gets CONNECT
    -- by default, which would let per-minion telegraf roles open sessions
@@ -31,4 +32,4 @@ EOSQL
 # only ensures the shared database exists on first initialization.
 if ! psql -U "$POSTGRES_USER" -tAc "SELECT 1 FROM pg_database WHERE datname='so_telegraf'" | grep -q 1; then
    psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -c "CREATE DATABASE so_telegraf"
-fi
+fi
@@ -18,38 +18,22 @@ include:
 {% set TG_OUT = TELEGRAFMERGED.output | upper %}
 {% if TG_OUT in ['POSTGRES', 'BOTH'] %}

-# docker_container.running returns as soon as the container starts, but on
-# first-init docker-entrypoint.sh starts a temporary postgres with
-# `listen_addresses=''` to run /docker-entrypoint-initdb.d scripts, then
-# shuts it down before exec'ing the real CMD. A default pg_isready check
-# (Unix socket) passes during that ephemeral phase and races the shutdown
-# with "the database system is shutting down". Checking TCP readiness on
-# 127.0.0.1 only succeeds after the final postgres binds the port.
 postgres_wait_ready:
  cmd.run:
-    - name: |
-        for i in $(seq 1 60); do
-          if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then
-            exit 0
-          fi
-          sleep 2
-        done
-        echo "so-postgres did not accept TCP connections within 120s" >&2
-        exit 1
+    - name: /usr/sbin/so-postgres-wait
    - require:
      - docker_container: so-postgres
+      - file: postgres_sbin

-# Ensure the shared Telegraf database exists. init-users.sh only runs on a
+# Ensure the shared Telegraf database exists. init-db.sh only runs on a
 # fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume
 # would otherwise never get so_telegraf.
 postgres_create_telegraf_db:
  cmd.run:
-    - name: |
-        if ! docker exec so-postgres psql -U postgres -tAc "SELECT 1 FROM pg_database WHERE datname='so_telegraf'" | grep -q 1; then
-          docker exec so-postgres psql -v ON_ERROR_STOP=1 -U postgres -c "CREATE DATABASE so_telegraf"
-        fi
+    - name: /usr/sbin/so-telegraf-postgres create_db
    - require:
      - cmd: postgres_wait_ready
+      - file: postgres_sbin

 # Provision the shared group role and schema once. Every per-minion role is a
 # member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf
@@ -57,68 +41,26 @@ postgres_create_telegraf_db:
 # on first write are owned by the group role and every member can INSERT/SELECT.
 postgres_telegraf_group_role:
  cmd.run:
-    - name: |
-        docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
-        DO $$
-        BEGIN
-            IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'so_telegraf') THEN
-                CREATE ROLE so_telegraf NOLOGIN;
-            END IF;
-        END
-        $$;
-        GRANT CONNECT ON DATABASE so_telegraf TO so_telegraf;
-        CREATE SCHEMA IF NOT EXISTS telegraf AUTHORIZATION so_telegraf;
-        GRANT USAGE, CREATE ON SCHEMA telegraf TO so_telegraf;
-        CREATE SCHEMA IF NOT EXISTS partman;
-        CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman;
-        CREATE EXTENSION IF NOT EXISTS pg_cron;
-        -- Telegraf (running as so_telegraf) calls partman.create_parent()
-        -- on first write of each metric, which needs USAGE on the partman
-        -- schema, EXECUTE on its functions/procedures, and write access to
-        -- partman.part_config so it can register new partitioned parents.
-        GRANT USAGE, CREATE ON SCHEMA partman TO so_telegraf;
-        GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA partman TO so_telegraf;
-        GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA partman TO so_telegraf;
-        GRANT EXECUTE ON ALL PROCEDURES IN SCHEMA partman TO so_telegraf;
-        -- partman creates per-parent template tables (partman.template_*) at
-        -- runtime; default privileges extend DML/sequence access to them.
-        ALTER DEFAULT PRIVILEGES IN SCHEMA partman
-            GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO so_telegraf;
-        ALTER DEFAULT PRIVILEGES IN SCHEMA partman
-            GRANT USAGE, SELECT, UPDATE ON SEQUENCES TO so_telegraf;
-        -- Hourly partman maintenance. cron.schedule is idempotent by jobname.
-        SELECT cron.schedule(
-          'telegraf-partman-maintenance',
-          '17 * * * *',
-          'CALL partman.run_maintenance_proc()'
-        );
-        EOSQL
+    - name: /usr/sbin/so-telegraf-postgres group_role
    - require:
      - cmd: postgres_create_telegraf_db
+      - file: postgres_sbin

 {%   set creds = salt['pillar.get']('telegraf:postgres_creds', {}) %}
 {%   for mid, entry in creds.items() %}
 {%     if entry.get('user') and entry.get('pass') %}
 {%       set u = entry.user %}
-{%       set p = entry.pass | replace("'", "''") %}
+{%       set p = entry.pass %}

 postgres_telegraf_role_{{ u }}:
  cmd.run:
-    - name: |
-        docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
-        DO $$
-        BEGIN
-            IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{{ u }}') THEN
-                EXECUTE format('CREATE ROLE %I WITH LOGIN PASSWORD %L', '{{ u }}', '{{ p }}');
-            ELSE
-                EXECUTE format('ALTER ROLE %I WITH PASSWORD %L', '{{ u }}', '{{ p }}');
-            END IF;
-        END
-        $$;
-        GRANT CONNECT ON DATABASE so_telegraf TO "{{ u }}";
-        GRANT so_telegraf TO "{{ u }}";
-        EOSQL
+    - name: /usr/sbin/so-telegraf-postgres user
+    - env:
+      - ROLE_USER: {{ u | tojson }}
+      - ROLE_PASS: {{ p | tojson }}
+    - hide_output: True
    - require:
+      - file: postgres_sbin
      - cmd: postgres_telegraf_group_role

 {%     endif %}
@@ -130,21 +72,12 @@ postgres_telegraf_role_{{ u }}:
 {%   set retention = salt['pillar.get']('postgres:telegraf:retention_days', 14) | int %}
 postgres_telegraf_retention_reconcile:
  cmd.run:
-    - name: |
-        docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
-        DO $$
-        BEGIN
-            IF EXISTS (SELECT 1 FROM pg_catalog.pg_extension WHERE extname = 'pg_partman') THEN
-                UPDATE partman.part_config
-                SET retention = '{{ retention }} days',
-                    retention_keep_table = false
-                WHERE parent_table LIKE 'telegraf.%';
-            END IF;
-        END
-        $$;
-        EOSQL
+    - name: /usr/sbin/so-telegraf-postgres retention
+    - env:
+      - RETENTION_DAYS: {{ retention }}
    - require:
      - cmd: postgres_telegraf_group_role
+      - file: postgres_sbin

 {% endif %}

@@ -7,15 +7,29 @@

 . /usr/sbin/so-common

+# Without pipefail, a pipeline's exit status is gzip's. A failed pg_dumpall would
+# otherwise be masked by a successful gzip, silently producing a valid .gz that
+# holds a truncated dump.
+set -o pipefail
+
 # Backups contain role password hashes and full chat data; keep them 0600.
 umask 0077

 TODAY=$(date '+%Y_%m_%d')
 BACKUPDIR=/nsm/backup
 BACKUPFILE="$BACKUPDIR/so-postgres-backup-$TODAY.sql.gz"
+TMPFILE="$BACKUPFILE.tmp"
 MAXBACKUPS=7
+LOGFILE=/opt/so/log/postgres/backup.log

-mkdir -p $BACKUPDIR
+log() {
+  echo "$(date '+%Y-%m-%d %H:%M:%S') $*" >> "$LOGFILE"
+}
+
+mkdir -p "$BACKUPDIR"
+
+# Remove any temp files left behind by a previously crashed run
+rm -f "$BACKUPDIR"/so-postgres-backup-*.sql.gz.tmp

 # Skip if already backed up today
 if [ -f "$BACKUPFILE" ]; then
@@ -27,13 +41,33 @@ if ! docker ps --format '{{.Names}}' | grep -q '^so-postgres$'; then
  exit 0
 fi

-# Dump all databases and roles, compress
-docker exec so-postgres pg_dumpall -U postgres | gzip > "$BACKUPFILE"
+# Always clean up the temp file on exit; the success path clears this trap
+# after the atomic rename so the finished backup is not deleted.
+trap 'rm -f "$TMPFILE"' EXIT

-# Retention cleanup
-NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l)
+# Dump all databases and roles, compress. Write to a temp file so the final
+# filename only ever appears for a complete, verified backup.
+if ! docker exec so-postgres pg_dumpall -U postgres | gzip > "$TMPFILE"; then
+  log "ERROR: pg_dumpall/gzip failed; backup aborted"
+  exit 1
+fi
+
+# Verify the compressed stream is intact before publishing it
+if ! gzip -t "$TMPFILE"; then
+  log "ERROR: backup failed gzip integrity check; backup aborted"
+  exit 1
+fi
+
+# Atomically publish the verified backup
+mv "$TMPFILE" "$BACKUPFILE"
+trap - EXIT
+log "OK: wrote $BACKUPFILE"
+
+# Retention cleanup (only reached after a successful backup). The glob is
+# restricted to finished backups so an in-progress .tmp can never be counted.
+NUMBACKUPS=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" | wc -l)
 while [ "$NUMBACKUPS" -gt "$MAXBACKUPS" ]; do
-  OLDEST=$(find $BACKUPDIR -type f -name "so-postgres-backup*" -printf '%T+ %p\n' | sort | head -n 1 | awk -F" " '{print $2}')
+  OLDEST=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" -printf '%T+ %p\n' | sort | head -n 1 | awk -F" " '{print $2}')
  rm -f "$OLDEST"
-  NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l)
+  NUMBACKUPS=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" | wc -l)
 done
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Block until the so-postgres container answers on TCP, or give up.
+#
+# Why TCP on 127.0.0.1 instead of the default (Unix socket) pg_isready check:
+# docker_container.running returns as soon as the container starts. On
+# first-init, docker-entrypoint.sh brings up a temporary postgres with
+# `listen_addresses=''` to run the /docker-entrypoint-initdb.d scripts and
+# shuts it down again before exec'ing the real CMD. A socket check passes
+# during that ephemeral phase and then races the shutdown ("the database
+# system is shutting down"). The TCP check only succeeds once the final
+# postgres has bound the port.
+#
+# Usage: so-postgres-wait [iterations] [sleep_seconds]
+# Default: 60 iterations, 2s sleep (~120s total).
+
+max_tries=${1:-60}
+pause_seconds=${2:-2}
+
+for attempt in $(seq 1 "$max_tries"); do
+  # Leave with success on the first probe that reports ready.
+  docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null && exit 0
+  sleep "$pause_seconds"
+done
+
+echo "so-postgres did not accept TCP connections within $((max_tries * pause_seconds))s" >&2
+exit 1
@@ -0,0 +1,110 @@
+#!/bin/bash
+# Exit on the first failing command so a failed psql call fails the script.
+set -e
+
+# Provision Telegraf state inside the so-postgres container.
+# Usage: so-telegraf-postgres <subcommand>
+#   create_db    Ensure the so_telegraf database exists.
+#   group_role   Provision the so_telegraf group role, telegraf/partman schemas,
+#                pg_partman, pg_cron, and the hourly partman maintenance job.
+#   user         Create or update a per-minion login role granted to so_telegraf.
+#                Env: ROLE_USER, ROLE_PASS.
+#   retention    Reconcile partman retention on telegraf parents.
+#                Env: RETENTION_DAYS.
+
+# ${1:?...} aborts with the given message when no subcommand is passed.
+cmd="${1:?subcommand required}"
+
+case "$cmd" in
+  create_db)
+    # The SELECT prints 1 only when the database row exists; otherwise create it.
+    if ! docker exec so-postgres psql -U postgres -tAc \
+        "SELECT 1 FROM pg_database WHERE datname='so_telegraf'" | grep -q 1; then
+      docker exec so-postgres psql -v ON_ERROR_STOP=1 -U postgres \
+        -c "CREATE DATABASE so_telegraf"
+    fi
+    ;;
+
+  group_role)
+    # The heredoc delimiter is quoted, so the shell passes the SQL (including
+    # the $$ dollar-quoting) to psql verbatim.
+    docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'so_telegraf') THEN
+        CREATE ROLE so_telegraf NOLOGIN;
+    END IF;
+END
+$$;
+GRANT CONNECT ON DATABASE so_telegraf TO so_telegraf;
+CREATE SCHEMA IF NOT EXISTS telegraf AUTHORIZATION so_telegraf;
+GRANT USAGE, CREATE ON SCHEMA telegraf TO so_telegraf;
+CREATE SCHEMA IF NOT EXISTS partman;
+CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman;
+CREATE EXTENSION IF NOT EXISTS pg_cron;
+-- Telegraf (running as so_telegraf) calls partman.create_parent()
+-- on first write of each metric, which needs USAGE on the partman
+-- schema, EXECUTE on its functions/procedures, and write access to
+-- partman.part_config so it can register new partitioned parents.
+GRANT USAGE, CREATE ON SCHEMA partman TO so_telegraf;
+GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA partman TO so_telegraf;
+GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA partman TO so_telegraf;
+GRANT EXECUTE ON ALL PROCEDURES IN SCHEMA partman TO so_telegraf;
+-- partman creates per-parent template tables (partman.template_*) at
+-- runtime; default privileges extend DML/sequence access to them.
+ALTER DEFAULT PRIVILEGES IN SCHEMA partman
+    GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO so_telegraf;
+ALTER DEFAULT PRIVILEGES IN SCHEMA partman
+    GRANT USAGE, SELECT, UPDATE ON SEQUENCES TO so_telegraf;
+-- Hourly partman maintenance. cron.schedule is idempotent by jobname.
+SELECT cron.schedule(
+  'telegraf-partman-maintenance',
+  '17 * * * *',
+  'CALL partman.run_maintenance_proc()'
+);
+EOSQL
+    ;;
+
+  user)
+    # Fail fast when a required environment variable is unset or empty.
+    : "${ROLE_USER:?ROLE_USER is required}"
+    : "${ROLE_PASS:?ROLE_PASS is required}"
+    # psql does not substitute :vars inside dollar-quoted strings, so the
+    # conditional CREATE/ALTER is built outside any DO block and dispatched
+    # with \gexec. format() handles identifier/literal quoting.
+    docker exec -i so-postgres psql \
+      -v ON_ERROR_STOP=1 \
+      -v role_user="$ROLE_USER" \
+      -v role_pass="$ROLE_PASS" \
+      -U postgres -d so_telegraf <<'EOSQL'
+SELECT format(
+  CASE WHEN EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = :'role_user')
+       THEN 'ALTER ROLE %I WITH LOGIN PASSWORD %L'
+       ELSE 'CREATE ROLE %I WITH LOGIN PASSWORD %L'
+  END,
+  :'role_user',
+  :'role_pass'
+) \gexec
+GRANT CONNECT ON DATABASE so_telegraf TO :"role_user";
+GRANT so_telegraf TO :"role_user";
+EOSQL
+    ;;
+
+  retention)
+    # Fail fast when RETENTION_DAYS is unset or empty.
+    : "${RETENTION_DAYS:?RETENTION_DAYS is required}"
+    # \gset + \if guards against a missing pg_partman without using a DO
+    # block (psql :var substitution doesn't reach into dollar-quoted code).
+    docker exec -i so-postgres psql \
+      -v ON_ERROR_STOP=1 \
+      -v retention_days="$RETENTION_DAYS" \
+      -U postgres -d so_telegraf <<'EOSQL'
+SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_catalog.pg_extension WHERE extname = 'pg_partman')
+            THEN 'true' ELSE 'false' END AS has_partman \gset
+\if :has_partman
+UPDATE partman.part_config
+SET retention = :'retention_days' || ' days',
+    retention_keep_table = false
+WHERE parent_table LIKE 'telegraf.%';
+\endif
+EOSQL
+    ;;
+
+  *)
+    # Anything else is a usage error.
+    echo "Unknown subcommand: $cmd" >&2
+    exit 1
+    ;;
+esac
@@ -3,12 +3,15 @@
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.

-{% if data['id'].endswith('_hypervisor') and data['result'] == True %}
+{% set hid = data['id'] %}
+{% if hid|regex_match('^([A-Za-z0-9._-]{1,253})$')
+   and hid.endswith('_hypervisor')
+   and data['result'] == True %}

 {%   if data['act'] == 'accept' %}
 check_and_trigger:
  runner.setup_hypervisor.setup_environment:
-    - minion_id: {{ data['id'] }}
+    - minion_id: {{ hid }}
 {%   endif %}

 {%   if data['act'] == 'delete' %}
@@ -17,8 +20,7 @@ delete_hypervisor:
    - args:
      - mods: orch.delete_hypervisor
      - pillar:
-          minion_id: {{ data['id'] }}
+          minion_id: {{ hid }}
 {%   endif %}

 {% endif %}
-
@@ -1,7 +1,7 @@
 #!py

 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
-# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at 
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.

@@ -9,30 +9,42 @@ import logging
 import os
 import pwd
 import grp
+import re
+
+log = logging.getLogger(__name__)
+
+# Directory that holds the per-minion pillar files this reactor creates.
+PILLAR_ROOT = '/opt/so/saltstack/local/pillar/minions/'
+# Allowed VM/minion name characters. Anchored with \Z rather than $ because
+# `$` also matches just before a trailing newline, so 'vm\n' would otherwise
+# pass .match() and end up in a pillar file name.
+_VMNAME_RE = re.compile(r'^[A-Za-z0-9._-]{1,253}\Z')
+

 def run():
-  vm_name = data['kwargs']['name']
-  logging.error("createEmptyPillar reactor: vm_name: %s" % vm_name)
-  pillar_root = '/opt/so/saltstack/local/pillar/minions/'
+  vm_name = data.get('kwargs', {}).get('name', '')
+  if not _VMNAME_RE.match(str(vm_name)):
+    log.error("createEmptyPillar reactor: refusing unsafe vm_name=%r", vm_name)
+    return {}
+
+  log.info("createEmptyPillar reactor: vm_name: %s", vm_name)
  pillar_files = ['adv_' + vm_name + '.sls', vm_name + '.sls']

  try:
-    # Get socore user and group IDs
    socore_uid = pwd.getpwnam('socore').pw_uid
    socore_gid = grp.getgrnam('socore').gr_gid
+    pillar_root_real = os.path.realpath(PILLAR_ROOT)

    for f in pillar_files:
-      full_path = pillar_root + f
-      if not os.path.exists(full_path):
-        # Create empty file
-        os.mknod(full_path)
-        # Set ownership to socore:socore
-        os.chown(full_path, socore_uid, socore_gid)
-        # Set mode to 644 (rw-r--r--)
-        os.chmod(full_path, 0o640)
-        logging.error("createEmptyPillar reactor: created %s with socore:socore ownership and mode 644" % f)
+      full_path = os.path.join(PILLAR_ROOT, f)
+      resolved = os.path.realpath(full_path)
+      if os.path.dirname(resolved) != pillar_root_real:
+        log.error("createEmptyPillar reactor: refusing path outside pillar root: %s", resolved)
+        continue
+      if os.path.exists(resolved):
+        continue
+      os.mknod(resolved)
+      os.chown(resolved, socore_uid, socore_gid)
+      os.chmod(resolved, 0o640)
+      log.info("createEmptyPillar reactor: created %s with socore:socore ownership and mode 0640", f)

  except (KeyError, OSError) as e:
-    logging.error("createEmptyPillar reactor: Error setting ownership/permissions: %s" % str(e))
+    log.error("createEmptyPillar reactor: Error setting ownership/permissions: %s", e)

  return {}
@@ -1,18 +1,40 @@
+#!py
+
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
 # or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.

-remove_key:
-  wheel.key.delete:
-    - args:
-      - match: {{ data['name'] }}
+import logging
+import re

-{{ data['name'] }}_pillar_clean:
-  runner.state.orchestrate:
-    - args:
-      - mods: orch.vm_pillar_clean
-      - pillar:
-          vm_name: {{ data['name'] }}
+log = logging.getLogger(__name__)

-{% do salt.log.info('deleteKey reactor: deleted minion key: %s' % data['name']) %}
+_VMNAME_RE = re.compile(r'^[A-Za-z0-9._-]{1,253}$')
+
+
+def run():
+  """Build the reactions that remove a minion key and clean up its pillar.
+
+  Reads ``name`` from the reactor-provided ``data``. Returns {} (no
+  reactions) when the name fails validation; otherwise returns a
+  wheel.key.delete reaction plus an orch.vm_pillar_clean orchestration.
+  """
+  raw_name = data.get('name', '')
+  # Use the validated string form everywhere below, not the raw event value.
+  name = str(raw_name)
+  # fullmatch() rather than match(): the pattern ends in `$`, which match()
+  # also satisfies just before a trailing newline, so 'vm\n' would otherwise
+  # pass and be used in the reaction ID and arguments below.
+  if not _VMNAME_RE.fullmatch(name):
+    log.error("deleteKey reactor: refusing unsafe name=%r", raw_name)
+    return {}
+
+  # Logged while the reactions are being built, i.e. before the wheel call
+  # returned below has actually run.
+  log.info("deleteKey reactor: deleted minion key: %s", name)
+
+  return {
+    'remove_key': {
+      'wheel.key.delete': [
+        {'args': [
+          {'match': name},
+        ]},
+      ],
+    },
+    '%s_pillar_clean' % name: {
+      'runner.state.orchestrate': [
+        {'args': [
+          {'mods': 'orch.vm_pillar_clean'},
+          {'pillar': {'vm_name': name}},
+        ]},
+      ],
+    },
+  }
@@ -0,0 +1,240 @@
+# One pillar directory can map to multiple (state, tgt) actions.
+# tgt is a raw salt compound expression. tgt_type is always "compound".
+# Per-action `batch` / `batch_wait` override the orch defaults (25% / 15s).
+# An action with `highstate: True` triggers state.highstate instead of
+# state.apply -- see salt/orch/push_batch.sls.
+#
+# Notes:
+#   - `bpf` is a pillar-only dir (no state of its own) consumed by both
+#     zeek and suricata via macros, so a bpf pillar change re-applies both.
+#   - suricata/strelka/zeek/elasticsearch/redis/kafka/logstash etc. have
+#     their own pillar dirs AND their own state, so they map 1:1 (or 1:2
+#     in strelka's case, because of the split init.sls / manager.sls).
+#
+# Intentional omissions (these will log a "not in pillar_push_map.yaml"
+# warning in push_pillar.sls and wait for the next scheduled highstate):
+#   - `data` and `node_data`: pillar-only data consumed by many states;
+#     handling them generically would amount to a fleetwide highstate.
+#   - `host`: soc_host describes mainint/mainip; a change is a re-IP and
+#     needs a coordinated procedure, not an immediate state push.
+#   - `hypervisor`: state changes touch libvirt and are disruptive; leave
+#     to the next scheduled highstate.
+#   - `sensor`: every field in soc_sensor.yaml is `readonly: True` or
+#     per-minion (`node: True`). Per-minion edits are persisted under
+#     pillar/minions/<id>.sls. Branch A of push_pillar.sls resolves its
+#     actions through this map as well, so with no `sensor` entry those
+#     edits also wait for the next scheduled highstate.
+#
+# The role sets here were verified line-by-line against salt/top.sls. If
+# salt/top.sls changes how an app is targeted, update the corresponding
+# compound here.
+
+# firewall: the one pillar everyone touches. Applied everywhere intentionally
+# because every host's iptables needs to know about every other host in the
+# grid. Salt's firewall state is idempotent (file.managed + iptables-restore
+# onchanges in salt/firewall/init.sls), so hosts whose rendered firewall is
+# unchanged do a file comparison and no-op without touching iptables -- actual
+# reload happens only on the hosts whose rules actually changed. Fleetwide
+# blast radius is intentional and matches the pre-plan behavior via highstate.
+# Adding N sensors in a burst coalesces into one dispatch via the drainer.
+firewall:
+  - state: firewall
+    tgt: '*'
+
+# backup: backup.config_backup runs on eval, standalone, manager, managerhype,
+# managersearch (NOT import -- the backup pillar is included on import per
+# pillar/top.sls but the backup state is not run there per salt/top.sls).
+backup:
+  - state: backup.config_backup
+    tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# bpf is pillar-only (no state); consumed by both zeek and suricata as macros.
+# Both states run on sensor_roles + so-import per salt/top.sls.
+bpf:
+  - state: zeek
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+  - state: suricata
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+
+# ca is applied universally.
+ca:
+  - state: ca
+    tgt: '*'
+
+# docker: universal. The docker state is in both the all-non-managers and
+# all-managers branches of salt/top.sls.
+docker:
+  - state: docker
+    tgt: '*'
+
+# elastalert: eval, standalone, manager, managerhype, managersearch (NOT import).
+elastalert:
+  - state: elastalert
+    tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# elastic-fleet-package-registry: manager_roles exactly.
+elastic-fleet-package-registry:
+  - state: elastic-fleet-package-registry
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# elasticsearch: 8 roles.
+elasticsearch:
+  - state: elasticsearch
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-standalone'
+
+# elasticagent: so-heavynode only.
+elasticagent:
+  - state: elasticagent
+    tgt: 'G@role:so-heavynode'
+
+# elasticfleet: base state only on pillar change. elasticfleet.install_agent_grid
+# is a deploy/enrollment step, not a config reload; leave it to the next highstate.
+elasticfleet:
+  - state: elasticfleet
+    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# global: fanout to a fleetwide highstate. The global pillar (soc_global.sls)
+# carries cross-cutting settings (pipeline, url_base, imagerepo, mdengine, ...)
+# that are consumed by virtually every state, so a targeted re-apply isn't
+# meaningful. The drainer's batch/batch_wait throttling controls blast radius.
+global:
+  - highstate: True
+    tgt: '*'
+
+# healthcheck: eval, sensor, standalone only.
+healthcheck:
+  - state: healthcheck
+    tgt: 'G@role:so-eval or G@role:so-sensor or G@role:so-standalone'
+
+# hydra: manager_roles exactly.
+hydra:
+  - state: hydra
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# idh: so-idh only.
+idh:
+  - state: idh
+    tgt: 'G@role:so-idh'
+
+# influxdb: manager_roles exactly.
+influxdb:
+  - state: influxdb
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# kafka: standalone, manager, managerhype, managersearch, searchnode, receiver.
+kafka:
+  - state: kafka
+    tgt: 'G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
+
+# kibana: manager_roles exactly.
+kibana:
+  - state: kibana
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# kratos: manager_roles exactly.
+kratos:
+  - state: kratos
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# logrotate: universal (top-of-file '*' branch in salt/top.sls).
+logrotate:
+  - state: logrotate
+    tgt: '*'
+
+# logstash: 8 roles, no eval/import.
+logstash:
+  - state: logstash
+    tgt: 'G@role:so-fleet or G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
+
+# manager: manager_roles exactly. The manager state is also referenced under
+# *_sensor / *_heavynode top.sls blocks via `sensor`, but the standalone
+# `manager` state itself runs only on manager_roles.
+manager:
+  - state: manager
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# nginx: 10 specific roles. NOT receiver, idh, hypervisor, desktop.
+nginx:
+  - state: nginx
+    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
+
+# ntp: universal (top-of-file '*' branch in salt/top.sls).
+ntp:
+  - state: ntp
+    tgt: '*'
+
+# patch: universal. soc_patch carries the OS update schedule, applied via
+# patch.os.schedule on every node (it's in both the all-non-managers and
+# all-managers branches of salt/top.sls).
+patch:
+  - state: patch.os.schedule
+    tgt: '*'
+
+# postgres: manager_roles exactly.
+postgres:
+  - state: postgres
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# redis: 6 roles. standalone, manager, managerhype, managersearch, heavynode, receiver.
+# (NOT eval, NOT import, NOT searchnode.)
+redis:
+  - state: redis
+    tgt: 'G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-standalone'
+
+# registry: manager_roles exactly.
+registry:
+  - state: registry
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# sensoroni: universal.
+sensoroni:
+  - state: sensoroni
+    tgt: '*'
+
+# soc: manager_roles exactly.
+soc:
+  - state: soc
+    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
+
+# stig: broad. Runs on standalone, manager, managerhype, managersearch,
+# searchnode, sensor, receiver, fleet, hypervisor, desktop.
+# NOT eval, NOT import, NOT heavynode, NOT idh (the *_idh block in
+# salt/top.sls intentionally omits stig).
+stig:
+  - state: stig
+    tgt: 'G@role:so-desktop or G@role:so-fleet or G@role:so-hypervisor or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
+
+# strelka: sensor-side only on pillar change (sensor_roles). strelka.manager is
+# intentionally NOT fired on pillar changes -- YARA rule and strelka config
+# pillar changes are consumed by the sensor-side strelka backend, and re-running
+# strelka.manager on managers is both unnecessary and disruptive. strelka.manager
+# is left to the 2-hour highstate.
+strelka:
+  - state: strelka
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-sensor or G@role:so-standalone'
+
+# suricata: sensor_roles + so-import (5 roles).
+suricata:
+  - state: suricata
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
+
+# telegraf: universal.
+telegraf:
+  - state: telegraf
+    tgt: '*'
+
+# versionlock: universal (top-of-file '*' branch in salt/top.sls).
+versionlock:
+  - state: versionlock
+    tgt: '*'
+
+# vm: libvirt-driver hypervisors only. Matched by the salt-cloud:driver:libvirt
+# grain (compound supports nested grain matching via G@<key>:<subkey>:<value>).
+# pillar/vm/soc_vm.sls write path is referenced at salt/_runners/setup_hypervisor.py:856.
+vm:
+  - state: vm
+    tgt: 'G@salt-cloud:driver:libvirt'
+
+# zeek: sensor_roles + so-import (5 roles).
+zeek:
+  - state: zeek
+    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
@@ -0,0 +1,176 @@
+#!py
+
+# Reactor invoked by the pillar_db beacon when SOC records settings changes in
+# the so_soc.audit_settings table (see salt/_beacons/pillar_db.py). The beacon
+# emits one event per new row carrying setting_id and node_id.
+#
+# Two branches, keyed on node_id:
+#   A) node_id populated -> the change is scoped to that one minion. Look up the
+#      app in pillar_push_map.yaml and write an intent that runs the app's mapped
+#      state(s) targeted to just that node.
+#   B) node_id empty -> grid-wide app change. Look up the app in
+#      pillar_push_map.yaml and write an intent with the entry's actions as-is.
+#
+# The app name is the first dotted segment of setting_id (e.g. "telegraf.output"
+# -> "telegraf"), which matches the pillar_push_map.yaml keys 1:1.
+#
+# Reactors never dispatch directly. The so-push-drainer schedule picks up
+# ready intents, dedupes across pending files, and dispatches orch.push_batch.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+from salt.client import Caller
+import yaml
+
+LOG = logging.getLogger(__name__)
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+MAX_PATHS = 20
+
+# The pillar_push_map.yaml is shipped via salt:// but the reactor runs on the
+# master, which mounts the default saltstack tree at this path.
+PUSH_MAP_PATH = '/opt/so/saltstack/default/salt/reactor/pillar_push_map.yaml'
+
+_PUSH_MAP_CACHE = {'mtime': 0, 'data': None}
+
+
+def _load_push_map():
+    """Return the parsed pillar_push_map.yaml, reloading when its mtime changes.
+
+    Returns an empty dict when the file is missing, cannot be parsed, or does
+    not parse to a mapping.
+    """
+    try:
+        st = os.stat(PUSH_MAP_PATH)
+    except OSError:
+        LOG.warning('push_pillar: %s not found', PUSH_MAP_PATH)
+        return {}
+    if _PUSH_MAP_CACHE['mtime'] != st.st_mtime:
+        loaded = {}
+        try:
+            with open(PUSH_MAP_PATH, 'r') as f:
+                loaded = yaml.safe_load(f) or {}
+        except Exception:
+            LOG.exception('push_pillar: failed to load %s', PUSH_MAP_PATH)
+        if not isinstance(loaded, dict):
+            # run() calls .get() on the result, so a top-level list or scalar
+            # would raise there; treat it as an empty map instead.
+            LOG.error('push_pillar: %s is not a mapping, ignoring it', PUSH_MAP_PATH)
+            loaded = {}
+        _PUSH_MAP_CACHE['data'] = loaded
+        # The mtime is recorded even after a failed load, so a broken file is
+        # not re-parsed on every event until it changes again.
+        _PUSH_MAP_CACHE['mtime'] = st.st_mtime
+    return _PUSH_MAP_CACHE['data'] or {}
+
+
+def _push_enabled():
+    """Report whether the global:push:enabled pillar value is truthy.
+
+    True is passed to pillar.get as the default, and True is also returned
+    when the lookup itself raises.
+    """
+    try:
+        enabled = Caller().cmd('pillar.get', 'global:push:enabled', True)
+    except Exception:
+        LOG.exception('push_pillar: pillar.get global:push:enabled failed, assuming enabled')
+        return True
+    return bool(enabled)
+
+
+def _write_intent(key, actions, path):
+    """Create or refresh the pending push intent ``<PENDING_DIR>/<key>.json``.
+
+    key     -- intent file name without the ``.json`` suffix.
+    actions -- list of action dicts; replaces any actions already stored.
+    path    -- provenance string added to the intent's ``paths`` list
+               (de-duplicated, trimmed to the newest MAX_PATHS entries).
+
+    Returns None. Filesystem failures are logged rather than raised.
+    """
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_pillar: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    # Opening the lock file can fail too (e.g. permissions); log it like the
+    # makedirs failure above instead of letting it escape the reactor.
+    try:
+        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    except OSError:
+        LOG.exception('push_pillar: cannot open lock file %s', LOCK_FILE)
+        return
+
+    try:
+        # Exclusive lock around the read-modify-write of the intent file.
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+        if not isinstance(intent, dict):
+            # Valid JSON that is not an object (e.g. [] or null) cannot be
+            # updated in place; start over instead of failing every write.
+            intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths')
+        if not isinstance(paths, list):
+            paths = []
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        # Write to a temp file and rename it over the target so the intent
+        # file is replaced in one step.
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_pillar: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def _app_from_setting(setting_id):
+    """Return the app name: the part of ``setting_id`` before its first dot.
+
+    'telegraf.output' -> 'telegraf'; 'ntp.config.servers' -> 'ntp'.
+    Returns None for an empty setting_id or an empty leading segment.
+    """
+    if setting_id:
+        app, _, _ = setting_id.partition('.')
+        if app:
+            return app
+    return None
+
+
+def _node_actions(entry, node_id):
+    """Return copies of the app's mapped actions, retargeted at one minion.
+
+    Each dict in ``entry`` is shallow-copied with ``tgt`` set to ``node_id``
+    and ``tgt_type`` set to 'glob'; every other key (state/highstate,
+    batch/batch_wait, ...) is carried over. Non-dict items are dropped and
+    the dicts in ``entry`` are left unmodified.
+    """
+    return [
+        {**mapped, 'tgt': node_id, 'tgt_type': 'glob'}
+        for mapped in entry
+        if isinstance(mapped, dict)
+    ]
+
+
+def run():
+    """Turn one pillar_db audit event into a pending push intent.
+
+    Reads ``setting_id`` and ``node_id`` from the reactor-provided ``data``.
+    With a node_id, the app's mapped actions are retargeted at that minion
+    (Branch A); without one, the map entry's actions are used as-is
+    (Branch B). Always returns {} -- this reactor only writes intent files.
+    """
+    import re  # function-scope import: only needed for the node_id check
+
+    if not _push_enabled():
+        LOG.info('push_pillar: push disabled, skipping')
+        return {}
+
+    # The pillar_db beacon nests its payload under data['data']; fall back to the
+    # top level so the reactor is robust to either shape.
+    event = data.get('data', data)  # noqa: F821 -- data provided by reactor
+    if not isinstance(event, dict):
+        event = data  # noqa: F821 -- data provided by reactor
+    setting_id = event.get('setting_id', '')
+    # str() so a non-string node_id cannot crash on .strip().
+    node_id = str(event.get('node_id') or '').strip()
+
+    app = _app_from_setting(setting_id)
+    if not app:
+        LOG.debug('push_pillar: ignoring event with no app segment: setting_id=%s', setting_id)
+        return {}
+
+    push_map = _load_push_map()
+    entry = push_map.get(app)
+    if not entry:
+        LOG.warning(
+            'push_pillar: app "%s" is not in pillar_push_map.yaml; change will be '
+            'picked up at the next scheduled highstate (setting_id=%s)',
+            app, setting_id,
+        )
+        return {}
+
+    # Branch A: per-node change -> retarget the app's states to just that node.
+    if node_id:
+        # node_id becomes part of the intent file name and is dispatched as a
+        # glob target, so anything outside the plain minion-ID character set
+        # (path separators, glob metacharacters such as '*') is refused.
+        if not re.fullmatch(r'[A-Za-z0-9._-]{1,253}', node_id):
+            LOG.error('push_pillar: refusing unsafe node_id=%r (setting_id=%s)',
+                      node_id, setting_id)
+            return {}
+        actions = _node_actions(entry, node_id)
+        if not actions:
+            LOG.warning('push_pillar: no usable actions for app "%s" (setting_id=%s)', app, setting_id)
+            return {}
+        _write_intent(
+            'node_{}_{}'.format(node_id, app), actions,
+            'audit:{}@{}'.format(setting_id, node_id),
+        )
+        LOG.info('push_pillar: per-node intent updated for %s on %s (setting_id=%s)',
+                 app, node_id, setting_id)
+        return {}
+
+    # Branch B: grid-wide app change -> use the map entry's actions as-is.
+    actions = list(entry)  # copy to avoid mutating the cache
+    _write_intent('pillar_{}'.format(app), actions, 'audit:{}'.format(setting_id))
+    LOG.info('push_pillar: app intent updated for %s (setting_id=%s)', app, setting_id)
+    return {}
@@ -0,0 +1,96 @@
+#!py
+
+# Reactor invoked by the inotify beacon on rule file changes under
+# /opt/so/saltstack/local/salt/strelka/rules/compiled/.
+#
+# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_strelka.json
+# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
+# across pending files, and dispatches orch.push_batch. Reactors never dispatch
+# directly; dispatching is left entirely to the drainer.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+from salt.client import Caller
+
+LOG = logging.getLogger(__name__)
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+MAX_PATHS = 20
+
+# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Sensor-side
+# strelka runs on exactly these four roles; so-import gets strelka.manager
+# instead, which is not fired on pillar changes.
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
+
+
+def _sensor_compound():
+    """Build the compound target that matches every role in SENSOR_ROLES.
+
+    The result has the form 'G@role:<r1> or G@role:<r2> ...', in
+    SENSOR_ROLES order.
+    """
+    role_terms = map('G@role:{}'.format, SENSOR_ROLES)
+    return ' or '.join(role_terms)
+
+
+def _push_enabled():
+    """Report whether the global:push:enabled pillar value is truthy.
+
+    True is passed to pillar.get as the default, and True is also returned
+    when the lookup itself raises.
+    """
+    try:
+        enabled = Caller().cmd('pillar.get', 'global:push:enabled', True)
+    except Exception:
+        LOG.exception('push_strelka: pillar.get global:push:enabled failed, assuming enabled')
+        return True
+    return bool(enabled)
+
+
+def _write_intent(key, actions, path):
+    """Create or refresh the pending push intent ``<PENDING_DIR>/<key>.json``.
+
+    key     -- intent file name without the ``.json`` suffix.
+    actions -- list of action dicts; replaces any actions already stored.
+    path    -- provenance string added to the intent's ``paths`` list
+               (de-duplicated, trimmed to the newest MAX_PATHS entries).
+
+    Returns None. Filesystem failures are logged rather than raised.
+    """
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_strelka: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    # Opening the lock file can fail too (e.g. permissions); log it like the
+    # makedirs failure above instead of letting it escape the reactor.
+    try:
+        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    except OSError:
+        LOG.exception('push_strelka: cannot open lock file %s', LOCK_FILE)
+        return
+
+    try:
+        # Exclusive lock around the read-modify-write of the intent file.
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+        if not isinstance(intent, dict):
+            # Valid JSON that is not an object (e.g. [] or null) cannot be
+            # updated in place; start over instead of failing every write.
+            intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths')
+        if not isinstance(paths, list):
+            paths = []
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        # Write to a temp file and rename it over the target so the intent
+        # file is replaced in one step.
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_strelka: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def run():
+    """Record a push intent for the strelka state after a rule file change.
+
+    Skips (with an info log) when pushes are disabled. Otherwise writes the
+    'rules_strelka' intent targeting the sensor-role compound, tagged with
+    the ``path`` taken from the reactor-provided ``data``. Always returns {}.
+    """
+    if _push_enabled():
+        changed_path = data.get('path', '')  # noqa: F821 -- data provided by reactor
+        strelka_action = {'state': 'strelka', 'tgt': _sensor_compound()}
+        _write_intent('rules_strelka', [strelka_action], changed_path)
+        LOG.info('push_strelka: intent updated for path=%s', changed_path)
+    else:
+        LOG.info('push_strelka: push disabled, skipping')
+    return {}
@@ -0,0 +1,95 @@
+#!py
+
+# Reactor invoked by the inotify beacon on rule file changes under
+# /opt/so/saltstack/local/salt/suricata/rules/.
+#
+# Writes (or updates) a push intent at /opt/so/state/push_pending/rules_suricata.json
+# and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
+# across pending files, and dispatches orch.push_batch. Reactors never dispatch
+# directly; dispatching is left entirely to the drainer.
+
+import fcntl
+import json
+import logging
+import os
+import time
+
+from salt.client import Caller
+
+LOG = logging.getLogger(__name__)
+
+PENDING_DIR = '/opt/so/state/push_pending'
+LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
+MAX_PATHS = 20
+
+# Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Suricata also
+# runs on so-import per salt/top.sls, so that role is appended below.
+SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
+
+
+def _sensor_compound_plus_import():
+    """Build the compound target for every SENSOR_ROLES role plus so-import.
+
+    The result has the form 'G@role:<r1> or ... or G@role:so-import', with
+    the SENSOR_ROLES terms first, in list order.
+    """
+    sensor_terms = ' or '.join(map('G@role:{}'.format, SENSOR_ROLES))
+    return sensor_terms + ' or G@role:so-import'
+
+
+def _push_enabled():
+    """Report whether the global:push:enabled pillar value is truthy.
+
+    True is passed to pillar.get as the default, and True is also returned
+    when the lookup itself raises.
+    """
+    try:
+        enabled = Caller().cmd('pillar.get', 'global:push:enabled', True)
+    except Exception:
+        LOG.exception('push_suricata: pillar.get global:push:enabled failed, assuming enabled')
+        return True
+    return bool(enabled)
+
+
+def _write_intent(key, actions, path):
+    """Create or refresh the pending push intent ``<PENDING_DIR>/<key>.json``.
+
+    key     -- intent file name without the ``.json`` suffix.
+    actions -- list of action dicts; replaces any actions already stored.
+    path    -- provenance string added to the intent's ``paths`` list
+               (de-duplicated, trimmed to the newest MAX_PATHS entries).
+
+    Returns None. Filesystem failures are logged rather than raised.
+    """
+    now = time.time()
+    try:
+        os.makedirs(PENDING_DIR, exist_ok=True)
+    except OSError:
+        LOG.exception('push_suricata: cannot create %s', PENDING_DIR)
+        return
+
+    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
+    # Opening the lock file can fail too (e.g. permissions); log it like the
+    # makedirs failure above instead of letting it escape the reactor.
+    try:
+        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
+    except OSError:
+        LOG.exception('push_suricata: cannot open lock file %s', LOCK_FILE)
+        return
+
+    try:
+        # Exclusive lock around the read-modify-write of the intent file.
+        fcntl.flock(lock_fd, fcntl.LOCK_EX)
+
+        intent = {}
+        if os.path.exists(intent_path):
+            try:
+                with open(intent_path, 'r') as f:
+                    intent = json.load(f)
+            except (IOError, ValueError):
+                intent = {}
+        if not isinstance(intent, dict):
+            # Valid JSON that is not an object (e.g. [] or null) cannot be
+            # updated in place; start over instead of failing every write.
+            intent = {}
+
+        intent.setdefault('first_touch', now)
+        intent['last_touch'] = now
+        intent['actions'] = actions
+        paths = intent.get('paths')
+        if not isinstance(paths, list):
+            paths = []
+        if path and path not in paths:
+            paths.append(path)
+            paths = paths[-MAX_PATHS:]
+        intent['paths'] = paths
+
+        # Write to a temp file and rename it over the target so the intent
+        # file is replaced in one step.
+        tmp_path = intent_path + '.tmp'
+        with open(tmp_path, 'w') as f:
+            json.dump(intent, f)
+        os.rename(tmp_path, intent_path)
+    except Exception:
+        LOG.exception('push_suricata: failed to write intent %s', intent_path)
+    finally:
+        try:
+            fcntl.flock(lock_fd, fcntl.LOCK_UN)
+        finally:
+            os.close(lock_fd)
+
+
+def run():
+    """Record a push intent for the suricata state after a rule file change.
+
+    Skips (with an info log) when pushes are disabled. Otherwise writes the
+    'rules_suricata' intent targeting the sensor roles plus so-import, tagged
+    with the ``path`` taken from the reactor-provided ``data``. Always
+    returns {}.
+    """
+    if _push_enabled():
+        changed_path = data.get('path', '')  # noqa: F821 -- data provided by reactor
+        suricata_action = {'state': 'suricata', 'tgt': _sensor_compound_plus_import()}
+        _write_intent('rules_suricata', [suricata_action], changed_path)
+        LOG.info('push_suricata: intent updated for path=%s', changed_path)
+    else:
+        LOG.info('push_suricata: push disabled, skipping')
+    return {}
@@ -17,6 +17,7 @@ include:
 so-redis:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
+    - restart_policy: unless-stopped
    - hostname: so-redis
    - user: socore
    - networks:
@@ -16,11 +16,14 @@ include:
 # Install the registry container
 so-dockerregistry:
  docker_container.running:
-    - image: ghcr.io/security-onion-solutions/registry:3.0.0
+    - image: ghcr.io/security-onion-solutions/registry:3.1.1
    - hostname: so-registry
    - networks:
      - sobridge:
        - ipv4_address: {{ DOCKERMERGED.containers['so-dockerregistry'].ip }}
+    # Intentionally `always` (not unless-stopped) -- registry is critical infra
+    # and must come back up even if it was manually stopped. Do not homogenize
+    # to unless-stopped; see the container auto-restart section of the plan.
    - restart_policy: always
    - port_bindings:
      {% for BINDING in DOCKERMERGED.containers['so-dockerregistry'].port_bindings %}
@@ -3,7 +3,7 @@
 {% set SCHEDULE = salt['pillar.get']('healthcheck:schedule', 30) %}

 include:
-  - salt
+  - salt.minion

 {% if CHECKS and ENABLED %}
 salt_beacons:
@@ -14,12 +14,13 @@ salt_beacons:
    - defaults:
        CHECKS: {{ CHECKS }}
        SCHEDULE: {{ SCHEDULE }}
-    - watch_in: 
+    - watch_in:
      - service: salt_minion_service
 {% else %}
 salt_beacons:
  file.absent:
    - name: /etc/salt/minion.d/beacons.conf
-    - watch_in: 
+    - watch_in:
      - service: salt_minion_service
 {% endif %}
+
@@ -0,0 +1,11 @@
+reactor:
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/suricata/rules':
+    - salt://reactor/push_suricata.sls
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/suricata/rules/*':
+    - salt://reactor/push_suricata.sls
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/strelka/rules/compiled':
+    - salt://reactor/push_strelka.sls
+  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/strelka/rules/compiled/*':
+    - salt://reactor/push_strelka.sls
+  - 'salt/beacon/*/pillar_db/audit_settings':
+    - salt://reactor/push_pillar.sls
@@ -5,3 +5,11 @@ salt_bootstrap:
    - source: salt://salt/scripts/bootstrap-salt.sh
    - mode: 755
    - show_changes: False
+
+salt_sbin:
+  file.recurse:
+    - name: /usr/sbin
+    - source: salt://salt/tools/sbin
+    - user: 939
+    - group: 939
+    - file_mode: 755
@@ -1,4 +1,4 @@
 lasthighstate:
  file.touch:
    - name: /opt/so/log/salt/lasthighstate
-    - order: last
+    - order: 9001
@@ -10,10 +10,13 @@
 #    software that is protected by the license key."

 {% from 'allowed_states.map.jinja' import allowed_states %}
+{% from 'global/map.jinja' import GLOBALMERGED %}
 {% if sls in allowed_states %}

 include:
  - salt.minion
+  - salt.master.pyinotify
+  - salt.master.boot_mine_update
 {%   if 'vrt' in salt['pillar.get']('features', []) %}
  - salt.cloud
  - salt.cloud.reactor_config_hypervisor
@@ -62,6 +65,21 @@ engines_config:
    - name: /etc/salt/master.d/engines.conf
    - source: salt://salt/files/engines.conf

+{% if GLOBALMERGED.push.enabled %}
+reactor_pushstate_config:
+  file.managed:
+    - name: /etc/salt/master.d/reactor_pushstate.conf
+    - source: salt://salt/files/reactor_pushstate.conf
+    - watch_in:
+      - service: salt_master_service
+{% else %}
+reactor_pushstate_config:
+  file.absent:
+    - name: /etc/salt/master.d/reactor_pushstate.conf
+    - watch_in:
+      - service: salt_master_service
+{% endif %}
+
 # update the bootstrap script when used for salt-cloud
 salt_bootstrap_cloud:
  file.managed:
@@ -77,7 +95,7 @@ salt_master_service:
      - file: checkmine_engine
      - file: pillarWatch_engine
      - file: engines_config
-    - order: last
+    - order: 9002

 {% else %}

@@ -0,0 +1,29 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Manages /etc/systemd/system/so-boot-mine-update.service, a manager-only
+# Type=oneshot unit that pushes `salt '*' mine.update` once per boot, ordered
+# before so-boot-highstate.service so mine-backed pillars (node IPs, ES/Redis/
+# Logstash discovery) are fresh before the boot highstate renders them.
+
+include:
+  - systemd.reload
+
+so_boot_mine_update_unit_file:
+  file.managed:
+    - name: /etc/systemd/system/so-boot-mine-update.service
+    - source: salt://salt/service/so-boot-mine-update.service
+    - onchanges_in:  # systemd_reload (presumably defined in the included systemd.reload) fires only when this file changes
+      - module: systemd_reload
+
+# Only enable once setup is complete. Until then the gate file is missing and
+# the unit's own ConditionPathExists would no-op it anyway.
+so_boot_mine_update_service:
+  service.enabled:  # enables the unit for boot; does not start it during this run
+    - name: so-boot-mine-update.service
+    - onlyif: test -e /opt/so/state/setup-complete
+    - require:  # unit file must be in place, and the reload state evaluated, before enabling
+      - file: so_boot_mine_update_unit_file
+      - module: systemd_reload
@@ -0,0 +1,20 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+pyinotify_module_package:  # stage the pyinotify package files from the fileserver onto local disk
+  file.recurse:
+    - name: /opt/so/conf/salt/module_packages/pyinotify
+    - source: salt://salt/module_packages/pyinotify
+    - clean: True  # remove anything in the target dir that is not in the source
+    - makedirs: True
+
+pyinotify_python_module_install:  # pip-install pyinotify from the staged dir only (--no-index), no network access
+  cmd.run:
+    - name: /opt/saltstack/salt/bin/python3.10 -m pip install pyinotify --no-index --find-links=/opt/so/conf/salt/module_packages/pyinotify/ --upgrade
+    - onchanges:  # run only when the staged package files changed
+      - file: pyinotify_module_package
+    - failhard: True  # stop the rest of the state run if the install fails
+    - watch_in:  # a changed install fires the salt_minion_service watch
+      - service: salt_minion_service
@@ -2,4 +2,3 @@
 salt:
  minion:
    version: '3006.19'
-    check_threshold: 3600 # in seconds, threshold used for so-salt-minion-check. any value less than 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default
@@ -0,0 +1,31 @@
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Manages /etc/systemd/system/so-boot-highstate.service, a Type=oneshot
+# RemainAfterExit=yes unit that runs `salt-call state.highstate` exactly once
+# per system boot. Replaces the legacy `startup_states: highstate` minion
+# config, which fired on every salt-minion service restart (causing a redundant
+# highstate whenever a highstate itself restarted salt-minion).
+
+include:
+  - systemd.reload
+
+so_boot_highstate_unit_file:
+  file.managed:
+    - name: /etc/systemd/system/so-boot-highstate.service
+    - source: salt://salt/service/so-boot-highstate.service
+    - onchanges_in:  # systemd_reload (presumably defined in the included systemd.reload) fires only when this file changes
+      - module: systemd_reload
+
+# Only enable once setup is complete. Until then the gate file is missing and
+# the unit's own ConditionPathExists would no-op it anyway -- this just keeps
+# `systemctl is-enabled` honest for the sync_es_users gate.
+so_boot_highstate_service:
+  service.enabled:  # enables the unit for boot; does not start it during this run
+    - name: so-boot-highstate.service
+    - onlyif: test -e /opt/so/state/setup-complete
+    - require:  # unit file must be in place, and the reload state evaluated, before enabling
+      - file: so_boot_highstate_unit_file
+      - module: systemd_reload
@@ -17,6 +17,7 @@ include:
  - repo.client
  - salt.mine_functions
  - salt.minion.service_file
+  - salt.minion.boot_highstate
 {% if GLOBALS.is_manager %}
  - ca.signing_policy
 {% endif %}
@@ -80,21 +81,47 @@ set_log_levels:
      - "log_level: info"
      - "log_level_logfile: info"

-enable_startup_states:
-  file.uncomment:
+# startup_states: highstate caused a full highstate to run on every
+# salt-minion service start, including the restart triggered when a highstate
+# itself modified the minion config (beacons, mine, unit file). Replaced by
+# so-boot-highstate.service (managed in salt.minion.boot_highstate), which
+# runs once per system boot only. Strip the line from /etc/salt/minion on
+# upgrade; both the commented and uncommented forms historically existed.
+remove_startup_states:
+  file.line:
    - name: /etc/salt/minion
-    - regex: '^startup_states: highstate$'
-    - unless: pgrep so-setup
+    - match: 'startup_states: highstate'
+    - mode: delete
+
+# Upgrade-path bridge: systems that already passed setup under the old gate
+# (`grep -x 'startup_states: highstate' /etc/salt/minion`) get a /opt/so/state/setup-complete
+# marker so so-boot-highstate.service can be enabled and the so-user_sync cron
+# in sync_es_users.sls keeps installing. Setup-in-progress systems instead get
+# the marker from `mark_setup_complete` in setup/so-functions at the right
+# moment. `replace: false` means we never overwrite a marker once written.
+mark_setup_complete_for_upgrades:
+  file.managed:
+    - name: /opt/so/state/setup-complete
+    - replace: false
+    - makedirs: True
+    - onlyif: "grep -qx 'startup_states: highstate' /etc/salt/minion"
+    - require_in:
+      - file: remove_startup_states
+      - service: so_boot_highstate_service

 {% endif %}

-# this has to be outside the if statement above since there are <requisite>_in calls to this state
+# this has to be outside the if statement above since there are <requisite>_in calls to this state.
+# uses watch (not listen) so the restart fires in-state and its result lands on this state's
+# running entry; that is what lets wait_for_salt_minion_ready below detect any restart
+# uniformly via onchanges, regardless of whether the trigger came from these files or from
+# external watch_in's (e.g. beacons, master/pyinotify).
 salt_minion_service:
  service.running:
    - name: salt-minion
    - enable: True
    - onlyif: test "{{INSTALLEDSALTVERSION}}" == "{{SALTVERSION}}"
-    - listen:
+    - watch:
      - file: mine_functions
 {% if INSTALLEDSALTVERSION|string == SALTVERSION|string %}
      - file: set_log_levels
@@ -103,3 +130,17 @@ salt_minion_service:
      - file: signing_policy
 {% endif %}
    - order: last
+
+# block until the just-restarted salt-minion is back and can execute modules locally, so
+# follow-on jobs and the next highstate iteration do not race the restart. onchanges +
+# require on salt_minion_service catches every restart trigger uniformly, because the
+# result of a watch-driven mod_watch replaces the service state's running entry. the wait
+# logic lives in /usr/sbin/so-salt-minion-wait (deployed by common_sbin from common/tools/sbin/).
+wait_for_salt_minion_ready:
+  cmd.run:
+    - name: /usr/sbin/so-salt-minion-wait
+    - onchanges:  # run only when salt_minion_service reported changes
+      - service: salt_minion_service
+    - require:  # and never before salt_minion_service itself has been evaluated
+      - service: salt_minion_service
+    - order: last
@@ -0,0 +1,14 @@
+[Unit]
+Description=Security Onion boot-time highstate (runs once per boot)
+After=salt-minion.service network-online.target docker.service
+# Wants= (not Requires=) for salt-minion: Requires= would restart this unit on every explicit salt-minion restart.
+Wants=network-online.target docker.service salt-minion.service
+ConditionPathExists=/opt/so/state/setup-complete
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/bin/salt-call state.highstate -l info queue=True
+
+[Install]
+WantedBy=multi-user.target
@@ -0,0 +1,15 @@
+[Unit]
+Description=Security Onion boot-time grid mine.update (managers, runs once per boot before highstate)
+After=salt-master.service salt-minion.service network-online.target
+# Wants= (not Requires=): Requires= would re-run this unit on every explicit salt-master/salt-minion restart.
+Wants=network-online.target salt-master.service salt-minion.service
+Before=so-boot-highstate.service
+ConditionPathExists=/opt/so/state/setup-complete
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStart=/usr/sbin/so-boot-mine-update
+
+[Install]
+WantedBy=multi-user.target
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
+# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
+# https://securityonion.net/license; you may not use this file except in compliance with the
+# Elastic License 2.0.
+
+# Block until the local salt-minion service is back up and can execute modules locally.
+# Invoked from the wait_for_salt_minion_ready state in salt/minion/init.sls after
+# salt_minion_service fires its watch-driven mod_watch (a non-blocking systemctl restart),
+# so follow-on jobs and the next highstate iteration do not race the in-flight restart.
+
+. /usr/sbin/so-common
+
+# Initial sleep gives the systemctl restart (--no-block by default for salt-minion on
+# >=3006.15) time to begin tearing down the old process before we probe for readiness.
+INITIAL_SLEEP=3
+TIMEOUT=120
+PING_TIMEOUT=5
+
+# Measure TIMEOUT with bash's wall-clock SECONDS counter (which includes the initial sleep)
+# instead of counting loop passes: one failed probe can block for several seconds by itself.
+SECONDS=0
+sleep "$INITIAL_SLEEP"
+while [ "$SECONDS" -lt "$TIMEOUT" ]; do
+  if systemctl is-active --quiet salt-minion \
+     && salt-call --local --timeout="$PING_TIMEOUT" --out=quiet test.ping >/dev/null 2>&1; then
+    echo "salt-minion ready after ${SECONDS}s"
+    exit 0
+  fi
+  sleep 1
+done
+
+echo "salt-minion did not become ready within ${TIMEOUT}s" >&2
+exit 1
--- a/Show More
+++ b/Show More
@@ -1 +1 @@
 .1.0
 .2.0