Merge remote-tracking branch 'origin/3/dev' into saltthangs

move so-salt-minion-wait
Replace inotify pillar watch with postgres audit_settings beacon
2026-06-12 21:29:16 +02:00 · 2026-06-10 08:56:17 -04:00 · 2026-06-01 14:48:54 -04:00 · 2026-05-29 14:55:13 -04:00 · 2026-05-28 14:01:42 -04:00 · 2026-05-27 08:59:28 -04:00
60 changed files with 1595 additions and 381 deletions
@@ -0,0 +1,142 @@
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
 # or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.
 # Custom salt beacon that watches the SOC audit_settings table in postgres for
 # new settings changes and emits a beacon event per new row. This replaces the
 # inotify watch on /opt/so/saltstack/local/pillar -- instead of monitoring pillar
 # files on disk, we monitor the so_soc.audit_settings table that SOC writes to.
 #
 # Detection is poll-based with a monotonic `id` watermark persisted to
 # WATERMARK_FILE: each pass selects rows with id greater than the last id seen,
 # which makes it self-healing (a missed poll simply catches up on the next one).
 #
 # Each emitted event carries setting_id and node_id; the push_pillar reactor maps
 # setting_id -> app via pillar_push_map.yaml and writes a push intent, after which
 # the existing so-push-drainer / orch.push_batch pipeline takes over unchanged.
 import logging
 import os
 import subprocess
 log = logging.getLogger(__name__)
 WATERMARK_FILE = '/opt/so/state/pillar_db_watch.id'
 CONTAINER = 'so-postgres'
 DATABASE = 'so_soc'
 # Unaligned, tuples-only psql output with a field separator that cannot appear in
 # an id/setting_id/node_id, so we can split each row reliably.
 FIELD_SEP = '\x1f'
 def __virtual__():
    """Report this beacon module as loadable.

    Returns True unconditionally; no dependency or platform checks are made
    here, so failures (e.g. postgres being unavailable) surface at poll time
    instead.
    """
    return True
 def validate(config):
    """Validate the beacon configuration.

    The ``config`` argument is not inspected: every configuration is accepted.

    Returns:
        A ``(True, 'valid')`` tuple -- the success flag and a status message.
    """
    verdict = (True, 'valid')
    return verdict
 def _read_watermark():
    """Load the last processed audit_settings id from WATERMARK_FILE.

    Returns:
        The stored id as an int, or None when the file cannot be read or its
        contents are not an integer (the watermark has not been seeded yet).
    """
    try:
        with open(WATERMARK_FILE, 'r') as handle:
            contents = handle.read()
        # An empty file yields '' here, which int() rejects with ValueError.
        return int((contents or '').strip())
    except (IOError, ValueError):
        return None
 def _write_watermark(value):
    """Persist ``value`` as the new watermark in WATERMARK_FILE.

    The value is written to a sibling ``.tmp`` file first and then renamed over
    the real file, so a reader never observes a half-written watermark.
    Filesystem errors are logged (with traceback) and swallowed; nothing is
    returned.
    """
    scratch_path = WATERMARK_FILE + '.tmp'
    try:
        # Make sure the state directory exists before writing into it.
        os.makedirs(os.path.dirname(WATERMARK_FILE), exist_ok=True)
        with open(scratch_path, 'w') as handle:
            handle.write(str(int(value)))
        os.rename(scratch_path, WATERMARK_FILE)
    except OSError:
        log.exception('pillar_db beacon: failed to persist watermark to %s', WATERMARK_FILE)
 def _query(sql):
    """Run ``sql`` against the DATABASE database via psql inside CONTAINER.

    psql is invoked through ``docker exec`` as the ``postgres`` user with
    tuples-only, unaligned output (``-tA``) and FIELD_SEP as the column
    separator.

    Returns:
        psql's stdout as text on success, or None on timeout, exec failure or
        a non-zero exit code, so the caller can no-op and retry on the next
        interval.
    """
    argv = ['docker', 'exec', CONTAINER]
    argv += ['psql', '-U', 'postgres', '-d', DATABASE]
    argv += ['-tA', '-F', FIELD_SEP, '-c', sql]
    try:
        proc = subprocess.run(argv, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        log.warning('pillar_db beacon: psql timed out')
        return None
    except Exception:
        # Anything else (docker missing, permission denied, ...) is logged
        # with its traceback and treated as "no result this interval".
        log.exception('pillar_db beacon: failed to exec psql')
        return None
    if proc.returncode == 0:
        return proc.stdout
    log.warning('pillar_db beacon: psql failed (rc=%s): %s',
                proc.returncode, (proc.stderr or '').strip())
    return None
 def beacon(config):
    """Poll audit_settings and return one event per row newer than the watermark.

    ``config`` is accepted for the beacon interface but not read.

    Behaviour per call:
      * No watermark yet: seed it with the table's current MAX(id) and return
        an empty list, so existing history is never replayed.
      * Otherwise: select rows with id greater than the watermark (ordered by
        id), turn each well-formed row into an event dict, and advance the
        watermark to the highest id emitted.

    Returns:
        A list of dicts with keys ``tag`` (always 'audit_settings'), ``id``,
        ``setting_id`` and ``node_id``. The list is empty when there is nothing
        new or when the query failed.
    """
    events = []
    last_seen = _read_watermark()

    if last_seen is None:
        # First run / missing watermark: record where the table currently ends
        # and emit nothing for this pass.
        seed_output = _query('SELECT COALESCE(MAX(id), 0) FROM audit_settings;')
        if seed_output is not None:
            try:
                _write_watermark(int((seed_output or '0').strip() or 0))
            except ValueError:
                log.warning('pillar_db beacon: could not parse MAX(id) seed: %r', seed_output)
        # A None seed means postgres was not reachable; the next interval retries.
        return events

    output = _query(
        "SELECT id, setting_id, COALESCE(node_id, '') FROM audit_settings "
        "WHERE id > %d ORDER BY id;" % last_seen
    )
    if output is None:
        return events

    highest = last_seen
    for record in output.splitlines():
        # Only *test* the stripped form; the record itself must stay intact
        # because str.strip() treats the \x1f separator (and \x1c-\x1e) as
        # whitespace and would drop an empty trailing node_id field.
        if not record.strip():
            continue
        fields = record.split(FIELD_SEP)
        if len(fields) < 3:
            log.warning('pillar_db beacon: skipping malformed row: %r', record)
            continue
        try:
            record_id = int(fields[0])
        except ValueError:
            log.warning('pillar_db beacon: skipping row with non-int id: %r', record)
            continue
        events.append({
            'tag': 'audit_settings',
            'id': record_id,
            'setting_id': fields[1],
            'node_id': fields[2],
        })
        highest = max(highest, record_id)

    if highest > last_seen:
        _write_watermark(highest)
        log.info('pillar_db beacon: emitted %d change(s), watermark %d -> %d',
                 len(events), last_seen, highest)
    return events
@@ -1,5 +1,3 @@
 {% import_yaml 'salt/minion.defaults.yaml' as SALT_MINION_DEFAULTS -%}
 #!/bin/bash
 #
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
@@ -25,7 +23,8 @@ SYSTEM_START_TIME=$(date -d "$(</proc/uptime awk '{print $1}') seconds ago" +%s)
 LAST_HIGHSTATE_END=$([ -e "/opt/so/log/salt/lasthighstate" ] && date -r /opt/so/log/salt/lasthighstate +%s || echo 0)
 LAST_HEALTHCHECK_STATE_APPLY=$([ -e "/opt/so/log/salt/state-apply-test" ] && date -r /opt/so/log/salt/state-apply-test +%s || echo 0)
 # SETTING THRESHOLD TO ANYTHING UNDER 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default
-THRESHOLD={{SALT_MINION_DEFAULTS.salt.minion.check_threshold}} #within how many seconds the file /opt/so/log/salt/state-apply-test must have been touched/modified before the salt minion is restarted
+# THRESHOLD is derived from the global push highstate interval + 1 hour, so the minion-check grace period tracks the schedule automatically.
 THRESHOLD=$(( ({{ salt['pillar.get']('global:push:highstate_interval_hours', 2) }} + 1) * 3600 )) #within how many seconds the file /opt/so/log/salt/state-apply-test must have been touched/modified before the salt minion is restarted
 THRESHOLD_DATE=$((LAST_HEALTHCHECK_STATE_APPLY+THRESHOLD))
 logCmd() {
@@ -9,7 +9,8 @@
 prune_images:
  cmd.run:
    - name: so-docker-prune
-    - order: last
+    - onlyif: command -v /usr/sbin/so-docker-prune >/dev/null 2>&1
    - order: 9000
 {% else %}
@@ -19,6 +19,7 @@ wait_for_elasticsearch:
 so-elastalert:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastalert:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - hostname: elastalert
    - name: so-elastalert
    - user: so-elastalert
@@ -15,6 +15,7 @@ include:
 so-elastic-fleet-package-registry:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-fleet-package-registry:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - name: so-elastic-fleet-package-registry
    - hostname: Fleet-package-reg-{{ GLOBALS.hostname }}
    - detach: True
@@ -16,6 +16,7 @@ include:
 so-elastic-agent:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-agent:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - name: so-elastic-agent
    - hostname: {{ GLOBALS.hostname }}
    - detach: True
@@ -42,6 +42,7 @@ elasticagent_syncartifacts:
 so-elastic-fleet:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elastic-agent:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - name: so-elastic-fleet
    - hostname: FleetServer-{{ GLOBALS.hostname }}
    - detach: True
@@ -30,70 +30,6 @@ fleet_api() {
    curl -sK /opt/so/conf/elasticsearch/curl.config -L "localhost:5601/api/fleet/${QUERYPATH}" "$@" --retry 3 --retry-delay 10 --fail 2>/dev/null
 }
 # Max number of concurrent Fleet write jobs (create/update). Override via env if needed.
 MAX_FLEET_JOBS=${MAX_FLEET_JOBS:-10}
 # Block until fewer than MAX_FLEET_JOBS background jobs are running.
 elastic_fleet_throttle() {
    # `jobs -rp` prints one PID per running background job of this shell;
    # loop for as long as that count is at or above MAX_FLEET_JOBS.
    while (( $(jobs -rp | wc -l) >= MAX_FLEET_JOBS )); do
        # Block until any one background job exits, then re-count.
        wait -n
    done
 }
 # Load every integration JSON in a directory into a single agent policy.
 # The agent policy is fetched ONCE (not per file), and the create/update writes
 # are dispatched as throttled background jobs.
 #   $1 AGENT_POLICY     - the agent policy id/name to load integrations into
 #   $2 DIR              - directory of integration *.json files
 #   $3 LABEL           - human-readable label for log output
 #   $4 SKIP_CREATE_NAME - (optional) integration name to skip when creating (still updated if present)
 # Returns 1 if any integration failed to create/update.
 elastic_fleet_load_integrations_dir() {
    local AGENT_POLICY=$1
    local DIR=$2
    local LABEL=$3
    local SKIP_CREATE_NAME=$4
    local POLICY_JSON FAIL_FILE INTEGRATION NAME ID
    # Temp file that collects one line per failed create/update across all jobs.
    FAIL_FILE=$(mktemp)
    # Fetch the agent policy a single time; we look up integration ids locally below.
    # NOTE(review): the exit status of fleet_api is not checked here; if the fetch
    # fails, POLICY_JSON is presumably empty and every integration would take the
    # "create" path below -- confirm this is intended.
    POLICY_JSON=$(fleet_api "agent_policies/$AGENT_POLICY")
    for INTEGRATION in "$DIR"/*.json; do
        # With no matches the glob stays literal; skip that non-existent path.
        [ -e "$INTEGRATION" ] || continue
        NAME=$(jq -r .name "$INTEGRATION")
        # Id of the package policy with this name in the fetched policy (empty if none).
        ID=$(jq -r --arg n "$NAME" '.item.package_policies[]? | select(.name==$n) | .id' <<<"$POLICY_JSON")
        # Wait for a free slot before dispatching the next background job.
        elastic_fleet_throttle
        {
            if [ -n "$ID" ]; then
                printf "\n\n%s - Updating integration %s\n" "$LABEL" "$NAME"
                if ! elastic_fleet_integration_update "$ID" "@$INTEGRATION"; then
                    # Take the lock on fd 9 before appending so concurrent jobs
                    # do not interleave their failure lines.
                    flock 9; echo "update ${INTEGRATION##*/}" >&9
                fi
            elif [ -n "$SKIP_CREATE_NAME" ] && [ "$NAME" == "$SKIP_CREATE_NAME" ]; then
                printf "\n\n%s - Skipping creation of %s\n" "$LABEL" "$NAME"
            else
                printf "\n\n%s - Creating integration %s\n" "$LABEL" "$NAME"
                if ! elastic_fleet_integration_create "@$INTEGRATION"; then
                    flock 9; echo "create ${INTEGRATION##*/}" >&9
                fi
            fi
        # fd 9 is FAIL_FILE opened for append; the group runs as a background job.
        } 9>>"$FAIL_FILE" &
    done
    # Barrier: wait for every dispatched job to finish.
    wait
    local rc=0
    # A non-empty FAIL_FILE means at least one create/update failed.
    if [ -s "$FAIL_FILE" ]; then
        printf "\n%s: failed integrations:\n" "$LABEL"
        cat "$FAIL_FILE"
        rc=1
    fi
    rm -f "$FAIL_FILE"
    return $rc
 }
 elastic_fleet_integration_check() {
    AGENT_POLICY=$1
@@ -18,26 +18,93 @@ if [ ! -f /opt/so/state/eaintegrations.txt ]; then
  # Third, configure Elastic Defend Integration separately
  /usr/sbin/so-elastic-fleet-integration-policy-elastic-defend
  # Each group fetches its agent policy once and dispatches create/update writes concurrently.
  # Initial Endpoints
-  elastic_fleet_load_integrations_dir "endpoints-initial" \
+  for INTEGRATION in /opt/so/conf/elastic-fleet/integrations/endpoints-initial/*.json; do
-    /opt/so/conf/elastic-fleet/integrations/endpoints-initial "Initial Endpoints Policy" || RETURN_CODE=1
+    printf "\n\nInitial Endpoints Policy - Loading $INTEGRATION\n"
    elastic_fleet_integration_check "endpoints-initial" "$INTEGRATION"
    if [ -n "$INTEGRATION_ID" ]; then
      printf "\n\nIntegration $NAME exists - Updating integration\n"
      if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
        echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
        RETURN_CODE=1
        continue
      fi
    else
      printf "\n\nIntegration does not exist - Creating integration\n"
      if ! elastic_fleet_integration_create "@$INTEGRATION"; then
        echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
        RETURN_CODE=1
        continue
      fi
    fi
  done
  # Grid Nodes - General
-  elastic_fleet_load_integrations_dir "so-grid-nodes_general" \
+  for INTEGRATION in /opt/so/conf/elastic-fleet/integrations/grid-nodes_general/*.json; do
-    /opt/so/conf/elastic-fleet/integrations/grid-nodes_general "Grid Nodes Policy_General" || RETURN_CODE=1
+    printf "\n\nGrid Nodes Policy_General - Loading $INTEGRATION\n"
    elastic_fleet_integration_check "so-grid-nodes_general" "$INTEGRATION"
    if [ -n "$INTEGRATION_ID" ]; then
      printf "\n\nIntegration $NAME exists - Updating integration\n"
      if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
        echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
        RETURN_CODE=1
        continue
      fi
    else
      printf "\n\nIntegration does not exist - Creating integration\n"
      if ! elastic_fleet_integration_create "@$INTEGRATION"; then
        echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
        RETURN_CODE=1
        continue
      fi
    fi
  done
  # Grid Nodes - Heavy
-  elastic_fleet_load_integrations_dir "so-grid-nodes_heavy" \
+  for INTEGRATION in /opt/so/conf/elastic-fleet/integrations/grid-nodes_heavy/*.json; do
-    /opt/so/conf/elastic-fleet/integrations/grid-nodes_heavy "Grid Nodes Policy_Heavy" || RETURN_CODE=1
+    printf "\n\nGrid Nodes Policy_Heavy - Loading $INTEGRATION\n"
    elastic_fleet_integration_check "so-grid-nodes_heavy" "$INTEGRATION"
    if [ -n "$INTEGRATION_ID" ]; then
      printf "\n\nIntegration $NAME exists - Updating integration\n"
      if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
        echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
        RETURN_CODE=1
        continue
      fi
    else
      printf "\n\nIntegration does not exist - Creating integration\n"
      if ! elastic_fleet_integration_create "@$INTEGRATION"; then
        echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
        RETURN_CODE=1
        continue
      fi
    fi
  done
-  # Fleet Server - Optional integrations (one agent policy per FleetServer_* directory)
+  # Fleet Server - Optional integrations
-  for FLEET_DIR in /opt/so/conf/elastic-fleet/integrations-optional/FleetServer*/; do
+  for INTEGRATION in /opt/so/conf/elastic-fleet/integrations-optional/FleetServer*/*.json; do
-    [ -d "$FLEET_DIR" ] || continue
+    if ! [ "$INTEGRATION" == "/opt/so/conf/elastic-fleet/integrations-optional/FleetServer*/*.json" ]; then
-    FLEET_POLICY=$(basename "$FLEET_DIR")
+      FLEET_POLICY=`echo "$INTEGRATION"| cut -d'/' -f7`
-    elastic_fleet_load_integrations_dir "$FLEET_POLICY" \
+      printf "\n\nFleet Server Policy - Loading $INTEGRATION\n"
-      "${FLEET_DIR%/}" "Fleet Server Policy" "elasticsearch-logs" || RETURN_CODE=1
+      elastic_fleet_integration_check "$FLEET_POLICY" "$INTEGRATION"
      if [ -n "$INTEGRATION_ID" ]; then
        printf "\n\nIntegration $NAME exists - Updating integration\n"
        if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
          echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
          RETURN_CODE=1
          continue
        fi
      else
        printf "\n\nIntegration does not exist - Creating integration\n"
        if [ "$NAME" != "elasticsearch-logs" ]; then
          if ! elastic_fleet_integration_create "@$INTEGRATION"; then
            echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
            RETURN_CODE=1
            continue
          fi
        fi
      fi
    fi
  done
  # Only create the state file if all policies were created/updated successfully
@@ -23,90 +23,73 @@ if [ $? -ne 0 ]; then
 fi
 default_packages=({% for pkg in SUPPORTED_PACKAGES %}"{{ pkg }}"{% if not loop.last %} {% endif %}{% endfor %})
 # JSON array of the default packages, used by the jq filter below.
 default_packages_json=$(printf '%s\n' "${default_packages[@]}" | jq -R . | jq -s '.')
 # Output lock (serializes concurrent job output) and failure file (one marker line per
 # failed integration). Mirrors the pattern used by elastic_fleet_load_integrations_dir.
 OUTPUT_LOCK=$(mktemp)
 FAIL_FILE=$(mktemp)
 trap 'rm -f "$OUTPUT_LOCK" "$FAIL_FILE"' EXIT
 # Cache of package name -> latest available version, so the same package is only looked up
 # once instead of once per (policy, integration).
 declare -A LATEST_VERSION_CACHE
 ERROR=false
 for AGENT_POLICY in $agent_policies; do
-    # Fetch the agent policy a single time; package name/version and integration id are all
+    if ! integrations=$(elastic_fleet_integration_policy_names "$AGENT_POLICY"); then
    # extracted locally below instead of re-fetching the same policy per integration.
    if ! POLICY_JSON=$(fleet_api "agent_policies/$AGENT_POLICY"); then
        # this script upgrades default integration packages, exit 1 and let salt handle retrying
        exit 1
    fi
-
+    for INTEGRATION in $integrations; do
-    # One jq pass emits name/package.name/package.version/id for every eligible integration.
+        if ! [[ "$INTEGRATION" == "elastic-defend-endpoints" ]] && ! [[ "$INTEGRATION" == "fleet_server-"* ]]; then
-    # The endpoint/fleet_server skips and the default-package gate are applied here in jq.
+            # Get package name so we know what package to look for when checking the current and latest available version
-    # $defaults (not $def, a jq reserved keyword) holds the default package list.
+            if ! PACKAGE_NAME=$(elastic_fleet_integration_policy_package_name "$AGENT_POLICY" "$INTEGRATION"); then
    while IFS=$'\t' read -r INTEGRATION PACKAGE_NAME PACKAGE_VERSION INTEGRATION_ID; do
        [ -n "$INTEGRATION" ] || continue
        # Look up the latest available version once per package, then memoize it.
        if [[ -z "${LATEST_VERSION_CACHE[$PACKAGE_NAME]+set}" ]]; then
            if ! AVAILABLE_VERSION=$(elastic_fleet_package_latest_version_check "$PACKAGE_NAME"); then
                echo "Error: Failed getting latest version for $PACKAGE_NAME"
                exit 1
            fi
-            LATEST_VERSION_CACHE[$PACKAGE_NAME]=$AVAILABLE_VERSION
+            {%- if not AUTO_UPGRADE_INTEGRATIONS %}
            if [[ " ${default_packages[@]} " =~ " $PACKAGE_NAME " ]]; then
            {%- endif %}
                # Get currently installed version of package
                attempt=0
                max_attempts=3
                while [ $attempt -lt $max_attempts ]; do
                    if PACKAGE_VERSION=$(elastic_fleet_integration_policy_package_version "$AGENT_POLICY" "$INTEGRATION") && AVAILABLE_VERSION=$(elastic_fleet_package_latest_version_check "$PACKAGE_NAME"); then
                        break
                    fi
                    attempt=$((attempt + 1))
                done
                if [ $attempt -eq $max_attempts ]; then
                    echo "Error: Failed getting $PACKAGE_VERSION or $AVAILABLE_VERSION"
                    exit 1
                fi
                # Get integration ID
                if ! INTEGRATION_ID=$(elastic_fleet_integration_id "$AGENT_POLICY" "$INTEGRATION"); then
                    exit 1
                fi
        AVAILABLE_VERSION=${LATEST_VERSION_CACHE[$PACKAGE_NAME]}
                if [[ "$PACKAGE_VERSION" != "$AVAILABLE_VERSION" ]]; then
-            # Dry run, then (if clean) the actual upgrade, dispatched as a throttled background
+                    # Dry run of the upgrade
-            # job. Each job builds its full log into one block, then flushes it under a single
+                    echo ""
-            # shared lock (OUTPUT_LOCK) so concurrent jobs never interleave on stdout; a failed
+                    echo "Current $PACKAGE_NAME package version ($PACKAGE_VERSION) is not the same as the latest available package ($AVAILABLE_VERSION)..."
-            # job also appends a marker line to FAIL_FILE while holding that same lock.
+                    echo "Upgrading $INTEGRATION..."
-            elastic_fleet_throttle
+                    echo "Starting dry run..."
            {
                block=$'\n'"Current $PACKAGE_NAME package version ($PACKAGE_VERSION) is not the same as the latest available package ($AVAILABLE_VERSION)..."$'\n'
                block+="Upgrading $INTEGRATION..."$'\n'"Starting dry run..."$'\n'
                fail=""
                    if ! DRYRUN_OUTPUT=$(elastic_fleet_integration_policy_dryrun_upgrade "$INTEGRATION_ID"); then
-                    block+="Error: Failed to complete dry run for '$INTEGRATION_ID'."$'\n'
+                        exit 1
-                    fail="dryrun $INTEGRATION"
+                    fi
-                elif [[ "$(jq .[].hasErrors <<<"$DRYRUN_OUTPUT")" == "false" ]]; then
+                    DRYRUN_ERRORS=$(echo "$DRYRUN_OUTPUT" | jq .[].hasErrors)
-                    block+="No errors detected. Proceeding with upgrade..."$'\n'
+
                    # If no errors with dry run, proceed with actual upgrade
                    if [[ "$DRYRUN_ERRORS" == "false" ]]; then
                        echo "No errors detected. Proceeding with upgrade..."
                        if ! elastic_fleet_integration_policy_upgrade "$INTEGRATION_ID"; then
-                        block+="Error: Upgrade failed for $PACKAGE_NAME with integration ID '$INTEGRATION_ID'."$'\n'
+                            echo "Error: Upgrade failed for $PACKAGE_NAME with integration ID '$INTEGRATION_ID'."
-                        fail="upgrade $INTEGRATION"
+                            ERROR=true
                            continue
                        fi
                    else
-                    block+="Errors detected during dry run for $PACKAGE_NAME policy upgrade..."$'\n'
+                        echo "Errors detected during dry run for $PACKAGE_NAME policy upgrade..."
-                    fail="dryrun-errors $INTEGRATION"
+                        ERROR=true
                        continue
                    fi
                {
                    flock 9
                    printf '%s' "$block"
                    [ -n "$fail" ] && printf '%s\n' "$fail" >>"$FAIL_FILE"
                } 9>>"$OUTPUT_LOCK"
            } &
                fi
    done < <(jq -r --argjson defaults "$default_packages_json" '
        .item.package_policies[]
        | select(.name != "elastic-defend-endpoints")
        | select(.name | startswith("fleet_server-") | not)
            {%- if not AUTO_UPGRADE_INTEGRATIONS %}
-        | select(.package.name | IN($defaults[]))
+            fi
            {%- endif %}
-        | [.name, .package.name, .package.version, .id] | @tsv
+        fi
-    ' <<<"$POLICY_JSON")
+    done
 done
-
+if [[ "$ERROR" == "true" ]]; then
 # Barrier: wait for every dispatched dry-run/upgrade job to finish.
 wait
 if [ -s "$FAIL_FILE" ]; then
    printf '\nFailed integration upgrades:\n'
    cat "$FAIL_FILE"
    exit 1
 fi
 echo
@@ -16,6 +16,7 @@
 STATE_FILE_SUCCESS=/opt/so/state/estemplates.txt
 INSTALLED_PACKAGE_LIST=/tmp/esfleet_installed_packages.json
 BULK_INSTALL_PACKAGE_LIST=/tmp/esfleet_bulk_install.json
 BULK_INSTALL_PACKAGE_TMP=/tmp/esfleet_bulk_install_tmp.json
 BULK_INSTALL_OUTPUT=/opt/so/state/esfleet_bulk_install_results.json
 INTEGRATION_PACKAGE_COMPONENTS=/opt/so/state/esfleet_package_components.json
 INPUT_PACKAGE_COMPONENTS=/opt/so/state/esfleet_input_package_components.json
@@ -28,6 +29,29 @@ PENDING_UPDATE=false
 #   Requiring some level of manual Elastic Stack configuration before installation
 EXCLUDED_INTEGRATIONS=('apm')
 # Convert a dotted version string ($1) into a single integer for comparison:
 # the first field as-is, followed by the second and third fields zero-padded
 # to three digits each (e.g. 1.2.3 -> 1002003). Prints the result on stdout.
 version_conversion(){
    # Note: `version` is not declared local, so it is set in the caller's scope.
    version=$1
    echo "$version" | awk -F '.' '{ printf("%d%03d%03d\n", $1, $2, $3); }'
 }
 # Compare two dotted version strings.
 #   $1 - first version
 #   $2 - second version
 # Prints "less", "greater" or "equal" describing $1 relative to $2.
 # Requires `bc` to be available on PATH.
 compare_versions() {
    version1=$1
    version2=$2
    # Convert versions to numbers
    num1=$(version_conversion "$version1")
    num2=$(version_conversion "$version2")
    # Compare using bc
    # bc prints 1 for a true comparison and 0 for false, which (( )) then tests.
    if (( $(echo "$num1 < $num2" | bc -l) )); then
        echo "less"
    elif (( $(echo "$num1 > $num2" | bc -l) )); then
        echo "greater"
    else
        echo "equal"
    fi
 }
 IFS=$'\n'
 agent_policies=$(elastic_fleet_agent_policy_ids)
 if [ $? -ne 0 ]; then
@@ -39,23 +63,23 @@ default_packages=({% for pkg in SUPPORTED_PACKAGES %}"{{ pkg }}"{% if not loop.l
 in_use_integrations=()
 # Fetch each agent policy once; its package_policies[] already contain both the integration name
 #  and the .package.name, so extract all non-default package names locally in a single jq instead
 #  of re-fetching the same policy per integration.
 default_packages_json=$(printf '%s\n' "${default_packages[@]}" | jq -R . | jq -s '.')
 for AGENT_POLICY in $agent_policies; do
-    if ! policy_json=$(fleet_api "agent_policies/$AGENT_POLICY"); then
+    if ! integrations=$(elastic_fleet_integration_policy_names "$AGENT_POLICY"); then
        # skip the agent policy if we can't get required info, let salt retry. Integrations loaded by this script are non-default integrations.
        echo "Skipping $AGENT_POLICY.. "
        continue
    fi
    for INTEGRATION in $integrations; do
        if ! PACKAGE_NAME=$(elastic_fleet_integration_policy_package_name "$AGENT_POLICY" "$INTEGRATION"); then
            echo  "Not adding $INTEGRATION, couldn't get package name"
            continue
        fi
        # non-default integrations that are in-use in any policy
-    while IFS= read -r PACKAGE_NAME; do
+        if ! [[ " ${default_packages[@]} " =~ " $PACKAGE_NAME " ]]; then
-        [ -n "$PACKAGE_NAME" ] && in_use_integrations+=("$PACKAGE_NAME")
+            in_use_integrations+=("$PACKAGE_NAME")
-    done < <(jq -r --argjson defaults "$default_packages_json" \
+        fi
-        '.item.package_policies[].package.name | select(. as $n | ($defaults | index($n)) | not)' \
+    done
        <<<"$policy_json")
 done
 if [[ -f $STATE_FILE_SUCCESS  ]]; then
@@ -66,55 +90,72 @@ if [[ -f $STATE_FILE_SUCCESS  ]]; then
        rm -f $INSTALLED_PACKAGE_LIST
        echo $latest_package_list | jq '{packages: [.items[] | {name: .name, latest_version: .version, installed_version: .installationInfo.version, subscription: .conditions.elastic.subscription }]}' >> $INSTALLED_PACKAGE_LIST
-        # Build the bulk install list and the per-package status messages with two jq passes
+        while read -r package; do
-        #  instead of a per-package bash loop. The old loop forked ~10 processes per package
+            # get package details
-        #  (5 jq + awk/bc for the version compare) and re-parsed/rewrote a growing JSON file on
+            package_name=$(echo "$package" | jq -r '.name')
-        #  every add (O(n^2)). Selection and messages below are identical to that logic.
+            latest_version=$(echo "$package" | jq -r '.latest_version')
-        SUB={% if SUB %}true{% else %}false{% endif %}
+            installed_version=$(echo "$package" | jq -r '.installed_version')
-        AUTOUP={% if AUTO_UPGRADE_INTEGRATIONS %}true{% else %}false{% endif %}
+            subscription=$(echo "$package" | jq -r '.subscription')
-        EXCLUDED_JSON=$(printf '%s\n' "${EXCLUDED_INTEGRATIONS[@]}" | jq -R 'select(length>0)' | jq -s '.')
+            bulk_package=$(echo "$package" | jq '{name: .name, version: .latest_version}' )
        INUSE_JSON=$(printf '%s\n' "${in_use_integrations[@]}" | jq -R 'select(length>0)' | jq -s 'unique')
-        # vnum replicates the previous version_conversion (%d%03d%03d of the first three dotted
+            if [[ ! "${EXCLUDED_INTEGRATIONS[@]}" =~ "$package_name" ]]; then
-        #  fields); needs() replicates the excluded/subscription/installed/upgrade/in-use logic.
+            {% if not SUB %}
-        JQ_DECISION='
+                if [[ "$subscription" != "basic" && "$subscription" != "null" && -n "$subscription" ]]; then
-def vnum:
+                    # pass over integrations that require non-basic elastic license
-  [ (split(".")|.[0:3][] | gsub("[^0-9].*";"") | (if .=="" then "0" else . end) | tonumber) ]
+                    echo "$package_name integration requires an Elastic license of $subscription or greater... skipping"
-  | (.[0]//0)*1000000 + (.[1]//0)*1000 + (.[2]//0);
+                    continue
-def needs($sub;$autoup;$excluded;$inuse):
+                else
-  .name as $n
+                    if [[ "$installed_version" == "null" || -z "$installed_version" ]]; then
-  | ($n | IN($excluded[]) | not)
+                        echo "$package_name is not installed... Adding to next update."
-  and ( $sub or (.subscription==null or .subscription=="basic" or .subscription=="") )
+                        jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
  and ( (.installed_version==null or .installed_version=="")
        or ( ((.latest_version|vnum) > (.installed_version|vnum))
             and ( $autoup or ($n | IN($inuse[]) | not) ) ) );'
        JQ_ARGS=(--argjson sub "$SUB" --argjson autoup "$AUTOUP" --argjson excluded "$EXCLUDED_JSON" --argjson inuse "$INUSE_JSON")
        # (a) Per-package status messages (parity with the previous echo output).
        jq -r "${JQ_ARGS[@]}" "$JQ_DECISION"'
          .packages[]
          | .name as $n
          | if ($n|IN($excluded[])) then "Skipping \($n)..."
            elif (($sub|not) and (.subscription!=null and .subscription!="basic" and .subscription!="")) then
                 "\($n) integration requires an Elastic license of \(.subscription) or greater... skipping"
            elif (.installed_version==null or .installed_version=="") then
                 "\($n) is not installed... Adding to next update."
            elif ((.latest_version|vnum) > (.installed_version|vnum)) then
                 (if ($autoup or ($n|IN($inuse[])|not))
                  then "\($n) is at version \(.installed_version) latest version is \(.latest_version)... Adding to next update."
                  else "skipping available upgrade for in use integration - \($n)." end)
            else empty end
        ' "$INSTALLED_PACKAGE_LIST"
        # (b) The bulk install list, built in a single pass.
        jq "${JQ_ARGS[@]}" "$JQ_DECISION"'
          {packages: [ .packages[] | select(needs($sub;$autoup;$excluded;$inuse)) | {name, version: .latest_version} ]}
        ' "$INSTALLED_PACKAGE_LIST" > "$BULK_INSTALL_PACKAGE_LIST"
        if jq -e '.packages | length > 0' "$BULK_INSTALL_PACKAGE_LIST" >/dev/null; then
                        PENDING_UPDATE=true
                    else
                        results=$(compare_versions "$latest_version" "$installed_version")
                        if [ $results == "greater" ]; then
                            {#- When auto_upgrade_integrations is false, skip upgrading in_use_integrations  #}
                            {%- if not AUTO_UPGRADE_INTEGRATIONS %}
                            if ! [[ " ${in_use_integrations[@]} " =~ " $package_name " ]]; then
                            {%- endif %}
                                echo "$package_name is at version $installed_version latest version is $latest_version... Adding to next update."
                                jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
                                PENDING_UPDATE=true
                            {%- if not AUTO_UPGRADE_INTEGRATIONS %}
                            else
                                echo "skipping available upgrade for in use integration - $package_name."
                            fi
                            {%- endif %}
                        fi
                    fi
                fi
            {% else %}
                if [[ "$installed_version" == "null" || -z "$installed_version" ]]; then
                    echo "$package_name is not installed... Adding to next update."
                    jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
                    PENDING_UPDATE=true
                else
                    results=$(compare_versions "$latest_version" "$installed_version")
                    if [ $results == "greater" ]; then
                        {#- When auto_upgrade_integrations is false, skip upgrading in_use_integrations  #}
                        {%- if not AUTO_UPGRADE_INTEGRATIONS %}
                        if ! [[ " ${in_use_integrations[@]} " =~ " $package_name " ]]; then
                        {%- endif %}
                            echo "$package_name is at version $installed_version latest version is $latest_version... Adding to next update."
                            jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
                            PENDING_UPDATE=true
                        {%- if not AUTO_UPGRADE_INTEGRATIONS %}
                        else
                            echo "skipping available upgrade for in use integration - $package_name."
                        fi
                        {%- endif %}
                    fi
                fi
            {% endif %}
            else
                echo "Skipping $package_name..."
            fi
        done <<< "$(jq -c '.packages[]' "$INSTALLED_PACKAGE_LIST")"
        if [ "$PENDING_UPDATE" = true ]; then
            # Run chunked install of packages
@@ -24,6 +24,7 @@ include:
 so-elasticsearch:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-elasticsearch:{{ ELASTICSEARCHMERGED.version }}
    - restart_policy: unless-stopped
    - hostname: elasticsearch
    - name: so-elasticsearch
    - user: elasticsearch
@@ -11,8 +11,10 @@ ADDON_STATEFILE_SUCCESS=/opt/so/state/addon_estemplates.txt
 ELASTICSEARCH_TEMPLATES_DIR="/opt/so/conf/elasticsearch/templates"
 SO_TEMPLATES_DIR="${ELASTICSEARCH_TEMPLATES_DIR}/index"
 ADDON_TEMPLATES_DIR="${ELASTICSEARCH_TEMPLATES_DIR}/addon-index"
-FAILED_NAMES=()
+SO_LOAD_FAILURES=0
-FAILED_COUNT=0
+ADDON_LOAD_FAILURES=0
 SO_LOAD_FAILURES_NAMES=()
 ADDON_LOAD_FAILURES_NAMES=()
 IS_HEAVYNODE="false"
 FORCE="false"
 VERBOSE="false"
@@ -44,86 +46,20 @@ while [[ $# -gt 0 ]]; do
    shift
 done
 # Max number of concurrent template PUT jobs. Override via env if needed.
 MAX_TEMPLATE_JOBS=${MAX_TEMPLATE_JOBS:-10}
 # Block until fewer than MAX_TEMPLATE_JOBS background jobs are running.
 template_throttle() {
    while (( $(jobs -rp | wc -l) >= MAX_TEMPLATE_JOBS )); do
        wait -n
    done
 }
 # Per-job failure markers and an output lock for serializing parallel job output.
 # Each failed load drops one file (named after the template) into FAIL_DIR; the
 # output of each job is flushed as a single block under flock so concurrent jobs
 # never interleave their (chatty) retry output.
 FAIL_DIR=$(mktemp -d)
 OUTPUT_LOCK="${FAIL_DIR}/.output.lock"
 : > "$OUTPUT_LOCK"
 trap 'rm -rf "$FAIL_DIR"' EXIT
 # Record a failure: $1 = the template name/path to report later. Slashes are
 # encoded so the path becomes a safe single filename.
 record_failure() {
    local marker="${1//\//__}"
    : > "${FAIL_DIR}/fail.${marker}"
 }
 # Populate FAILED_NAMES and FAILED_COUNT from the current phase's markers.
 # Must run in the current shell (not a command substitution) so the array sticks.
 collect_failures() {
    FAILED_NAMES=()
    FAILED_COUNT=0
    local f name
    shopt -s nullglob
    for f in "${FAIL_DIR}"/fail.*; do
        name="${f##*/fail.}"
        name="${name//__//}"
        FAILED_NAMES+=("$name")
        FAILED_COUNT=$((FAILED_COUNT + 1))
    done
    shopt -u nullglob
 }
 # Clear markers and names between phases so SO and addon counts stay independent.
 reset_failures() {
    shopt -s nullglob
    rm -f "${FAIL_DIR}"/fail.*
    shopt -u nullglob
    FAILED_NAMES=()
    FAILED_COUNT=0
 }
 # Print a block of text atomically (under the shared output lock) so the output
 # of concurrent background jobs is not interleaved.
 locked_echo() {
    { flock 9; printf '%s\n' "$1"; } 9>>"$OUTPUT_LOCK"
 }
 # Loads one template file via PUT. Intended to be dispatched as a background job.
 #   $1 uri          - e.g. _component_template/foo or _index_template/foo
 #   $2 file         - path to the template JSON
 #   $3 report_name  - name/path to record if this load fails
 load_template() {
    local uri="$1"
    local file="$2"
    local report_name="$3"
    local out rc=0 block
-    # Capture everything (including retry's diagnostic chatter) into one block so
+    echo "Loading template file $file"
-    # concurrent jobs never interleave; the whole block is flushed under one flock.
+    if ! output=$(retry 3 3 "so-elasticsearch-query $uri -d@$file -XPUT" "{\"acknowledged\":true}"); then
-    block="Loading template file $file"$'\n'
+        echo "$output"
-    if ! out=$(retry 3 3 "so-elasticsearch-query $uri -d@$file -XPUT" "{\"acknowledged\":true}" 2>&1); then
+
-        block+="$out"$'\n'
+        return 1
-        rc=1
+
    elif [[ "$VERBOSE" == "true" ]]; then
-        block+="$out"$'\n'
+        echo "$output"
    fi
    { flock 9; printf '%s' "$block"; } 9>>"$OUTPUT_LOCK"
    (( rc != 0 )) && record_failure "$report_name"
 }
 check_required_component_template_exists() {
@@ -174,9 +110,6 @@ load_component_templates() {
        return
    fi
    # Dispatch loads as throttled background jobs. The barrier (wait) happens in
    # the caller after all component groups have been dispatched, since index
    # templates must not load until every component template is in place.
    for component in "$pattern"/*.json; do
        tmpl_name=$(basename "${component%.json}")
@@ -185,8 +118,10 @@ load_component_templates() {
            tmpl_name="${tmpl_name%-mappings}-mappings"
        fi
-        template_throttle
+        if ! load_template "_component_template/${tmpl_name}" "$component"; then
-        load_template "_component_template/${tmpl_name}" "$component" "$component" &
+            SO_LOAD_FAILURES=$((SO_LOAD_FAILURES + 1))
            SO_LOAD_FAILURES_NAMES+=("$component")
        fi
    done
 }
@@ -245,9 +180,6 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
    load_component_templates "Elastic Agent" "elastic-agent"
    load_component_templates "Security Onion" "so"
    # Barrier: every component template PUT must complete before we snapshot the
    # component template list and start loading index templates that depend on them.
    wait
    component_templates=$(so-elasticsearch-component-templates-list)
    echo -e "Loading Security Onion index templates...\n"
    for so_idx_tmpl in "${SO_TEMPLATES_DIR}"/*.json; do
@@ -257,7 +189,7 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
            # TODO: Better way to load only heavynode specific templates
            if ! check_heavynode_compatiable_index_template "$tmpl_name"; then
                if [[ "$VERBOSE" == "true" ]]; then
-                    locked_echo "Skipping over $so_idx_tmpl, template is not a heavynode specific index template."
+                    echo "Skipping over $so_idx_tmpl, template is not a heavynode specific index template."
                fi
                continue
@@ -265,34 +197,32 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
        fi
        if check_required_component_template_exists "$so_idx_tmpl"; then
-            template_throttle
+            if ! load_template "_index_template/$tmpl_name" "$so_idx_tmpl"; then
-            load_template "_index_template/$tmpl_name" "$so_idx_tmpl" "$so_idx_tmpl" &
+                SO_LOAD_FAILURES=$((SO_LOAD_FAILURES + 1))
                SO_LOAD_FAILURES_NAMES+=("$so_idx_tmpl")
            fi
        else
-            locked_echo "Skipping over $so_idx_tmpl due to missing required component template(s)."
+            echo "Skipping over $so_idx_tmpl due to missing required component template(s)."
-            record_failure "$so_idx_tmpl"
+            SO_LOAD_FAILURES=$((SO_LOAD_FAILURES + 1))
            SO_LOAD_FAILURES_NAMES+=("$so_idx_tmpl")
            continue
        fi
    done
-    # Barrier: all SO index template PUTs must finish before tallying failures.
+    if [[ $SO_LOAD_FAILURES -eq 0 ]]; then
    wait
    collect_failures
    if [[ $FAILED_COUNT -eq 0 ]]; then
        echo "All Security Onion core templates loaded successfully."
        touch "$SO_STATEFILE_SUCCESS"
    else
-        echo "Encountered $FAILED_COUNT failure(s) loading templates:"
+        echo "Encountered $SO_LOAD_FAILURES failure(s) loading templates:"
-        for failed_template in "${FAILED_NAMES[@]}"; do
+        for failed_template in "${SO_LOAD_FAILURES_NAMES[@]}"; do
            echo "  - $failed_template"
        done
        if [[ "$SHOULD_EXIT_ON_FAILURE" == "true" ]]; then
            fail "Failed to load all Security Onion core templates successfully."
        fi
    fi
    reset_failures
 elif ! index_templates_exist "$SO_TEMPLATES_DIR"; then
    echo "No Security Onion core index templates found in ${SO_TEMPLATES_DIR}, skipping."
 elif [[ -f "$SO_STATEFILE_SUCCESS" ]]; then
@@ -311,27 +241,26 @@ if should_load_addon_templates; then
        tmpl_name=$(basename "${addon_idx_tmpl%-template.json}")
        if check_required_component_template_exists "$addon_idx_tmpl"; then
-            template_throttle
+            if ! load_template "_index_template/${tmpl_name}" "$addon_idx_tmpl"; then
-            load_template "_index_template/${tmpl_name}" "$addon_idx_tmpl" "$addon_idx_tmpl" &
+                ADDON_LOAD_FAILURES=$((ADDON_LOAD_FAILURES + 1))
                ADDON_LOAD_FAILURES_NAMES+=("$addon_idx_tmpl")
            fi
        else
-            locked_echo "Skipping over $addon_idx_tmpl due to missing required component template(s)."
+            echo "Skipping over $addon_idx_tmpl due to missing required component template(s)."
-            record_failure "$addon_idx_tmpl"
+            ADDON_LOAD_FAILURES=$((ADDON_LOAD_FAILURES + 1))
            ADDON_LOAD_FAILURES_NAMES+=("$addon_idx_tmpl")
            continue
        fi
    done
-    # Barrier: all addon index template PUTs must finish before tallying failures.
+    if [[ $ADDON_LOAD_FAILURES -eq 0 ]]; then
    wait
    collect_failures
    if [[ $FAILED_COUNT -eq 0 ]]; then
        echo "All addon integration templates loaded successfully."
        touch "$ADDON_STATEFILE_SUCCESS"
    else
-        echo "Encountered $FAILED_COUNT failure(s) loading addon integration templates:"
+        echo "Encountered $ADDON_LOAD_FAILURES failure(s) loading addon integration templates:"
-        for failed_template in "${FAILED_NAMES[@]}"; do
+        for failed_template in "${ADDON_LOAD_FAILURES_NAMES[@]}"; do
            echo "  - $failed_template"
        done
        if [[ "$SHOULD_EXIT_ON_FAILURE" == "true" ]]; then
@@ -6,37 +6,6 @@
 . /usr/sbin/so-common
 MAX_JOBS=10
 # Lock used to serialize block writes so concurrent jobs never interleave their output.
 ILM_OUTPUT_LOCK=$(mktemp)
 trap 'rm -f "$ILM_OUTPUT_LOCK"' EXIT
 # Policies are loaded concurrently (up to MAX_JOBS at a time) for speed. Each policy's block is
 # printed the moment its curl returns, so output appears in COMPLETION ORDER, not the order
 # policies are defined in configuration.
 echo "Loading ILM policies concurrently; output below appears in completion order, not configuration order."
 echo
 put_policy() {
  local desc="$1" policyname="$2" data="$3" result
  result=$(curl -K /opt/so/conf/elasticsearch/curl.config -s -k -L \
    -X PUT "https://localhost:9200/_ilm/policy/${policyname}" \
    -H 'Content-Type: application/json' -d"${data}")
  # curl above ran in parallel; serialize just this block write so concurrent jobs never interleave.
  {
    flock 200
    printf 'Setting up %s policy...\n%s\n\n' "${desc}" "${result}"
  } 200>>"${ILM_OUTPUT_LOCK}"
 }
 # Block until fewer than MAX_JOBS background curls are running.
 throttle() {
  while (( $(jobs -rp | wc -l) >= MAX_JOBS )); do
    wait -n
  done
 }
 {%- from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS %}
 {%- if GLOBALS.role != "so-heavynode" %}
 {%-   from 'elasticsearch/template.map.jinja' import ALL_ADDON_SETTINGS %}
@@ -45,26 +14,35 @@ throttle() {
 {%- for index, settings in ES_INDEX_SETTINGS.items() %}
 {%-   if settings.policy is defined %}
 {%-     if index == 'so-logs-detections.alerts' %}
-  throttle
+  echo
-  put_policy "so-logs-detections.alerts-so" "{{ index }}-so" '{ "policy": {{ settings.policy | tojson(true) }} }' &
+  echo "Setting up so-logs-detections.alerts-so policy..."
  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-so" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
  echo
 {%-     elif index == 'so-logs-soc' %}
-  throttle
+  echo
-  put_policy "so-soc-logs" "so-soc-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
+  echo "Setting up so-soc-logs policy..."
-  throttle
+  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/so-soc-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
-  put_policy "{{ index }}-logs" "{{ index }}-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
+  echo
  echo
  echo "Setting up {{ index }}-logs policy..."
  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
  echo
 {%-     else %}
-  throttle
+  echo
-  put_policy "{{ index }}-logs" "{{ index }}-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
+  echo "Setting up {{ index }}-logs policy..."
  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
  echo
 {%-     endif %}
 {%-   endif %}
 {%- endfor %}
 echo
 {%- if GLOBALS.role != "so-heavynode" %}
 {%-   for index, settings in ALL_ADDON_SETTINGS.items() %}
 {%-     if settings.policy is defined %}
-  throttle
+  echo
-  put_policy "{{ index }}-logs" "{{ index }}-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
+  echo "Setting up {{ index }}-logs policy..."
  curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
  echo
 {%-     endif %}
 {%-   endfor %}
 {%- endif %}
 wait
@@ -1,3 +1,10 @@
 global:
  pcapengine: SURICATA
  pipeline: REDIS
  push:
    enabled: true
    highstate_interval_hours: 2
    debounce_seconds: 30
    drain_interval: 15
    batch: '25%'
    batch_wait: 15
@@ -59,4 +59,41 @@ global:
    description: Allows use of Endgame with Security Onion. This feature requires a license from Endgame.
    global: True
    advanced: True
  push:
    enabled:
      description: Master kill-switch for the active push feature. When disabled, rule and pillar changes are picked up at the next scheduled highstate instead of being pushed immediately.
      forcedType: bool
      helpLink: push
      global: True
    highstate_interval_hours:
      description: How often every minion in the grid runs a scheduled state.highstate, in hours. Lower values keep minions closer in sync at the cost of more load; higher values reduce load but increase worst-case latency for non-pushed changes. The salt-minion health check restarts a minion if its last highstate is older than this value plus one hour.
      forcedType: int
      helpLink: push
      global: True
      advanced: True
    debounce_seconds:
      description: Trailing-edge debounce window in seconds. A push intent must be quiet for this long before the drainer dispatches. Rapid bursts of edits within this window coalesce into one dispatch.
      forcedType: int
      helpLink: push
      global: True
      advanced: True
    drain_interval:
      description: How often the push drainer checks for ready intents, in seconds. Small values lower dispatch latency at the cost of more background work on the manager.
      forcedType: int
      helpLink: push
      global: True
      advanced: True
    batch:
      description: "Host batch size for push orchestrations. A number (e.g. '10') or a percentage (e.g. '25%'). Limits how many minions run the push state at once so large fleets don't thundering-herd."
      helpLink: push
      global: True
      advanced: True
      regex: '^([0-9]+%?)$'
      regexFailureMessage: Enter a whole number or a whole-number percentage (e.g. 10 or 25%).
    batch_wait:
      description: Seconds to wait between host batches in a push orchestration. Gives the fleet time to breathe between waves.
      forcedType: int
      helpLink: push
      global: True
      advanced: True
@@ -58,6 +58,7 @@ so-hydra:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
    # Intentionally unless-stopped -- matches the fleet default.
    - restart_policy: unless-stopped
    - watch:
      - file: hydraconfig
@@ -15,6 +15,7 @@ include:
 so-idh:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-idh:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - name: so-idh
    - detach: True
    - network_mode: host
@@ -18,6 +18,7 @@ include:
 so-influxdb:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-influxdb:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - hostname: influxdb
    - networks:
      - sobridge:
@@ -27,6 +27,7 @@ include:
 so-kafka:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-kafka:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - hostname: so-kafka
    - name: so-kafka
    - networks:
@@ -16,6 +16,7 @@ include:
 so-kibana:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-kibana:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - hostname: kibana
    - user: kibana
    - networks:
@@ -51,6 +51,7 @@ so-kratos:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
    # Intentionally unless-stopped -- matches the fleet default.
    - restart_policy: unless-stopped
    - watch:
      - file: kratosschema
@@ -28,6 +28,7 @@ include:
 so-logstash:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-logstash:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - hostname: so-logstash
    - name: so-logstash
    - networks:
@@ -0,0 +1,21 @@
 # Manages /etc/salt/minion.d/beacons_pushstate.conf. The file is rendered from
 # a jinja template only on managers with global push enabled; in every other
 # case it is removed. Both variants are watched by salt_minion_service.
 {% from 'vars/globals.map.jinja' import GLOBALS %}
 {% from 'global/map.jinja' import GLOBALMERGED %}
 include:
  - salt.minion
 {% if GLOBALS.is_manager and GLOBALMERGED.push.enabled %}
 salt_beacons_pushstate:
  file.managed:
    - name: /etc/salt/minion.d/beacons_pushstate.conf
    - source: salt://manager/files/beacons_pushstate.conf.jinja
    - template: jinja
    - watch_in:
      - service: salt_minion_service
 {% else %}
 # Same state id as above so the watch_in wiring is identical in both branches.
 salt_beacons_pushstate:
  file.absent:
    - name: /etc/salt/minion.d/beacons_pushstate.conf
    - watch_in:
      - service: salt_minion_service
 {% endif %}
@@ -0,0 +1,41 @@
 {% from 'global/map.jinja' import GLOBALMERGED %}
 beacons:
  pillar_db:
    - interval: {{ GLOBALMERGED.push.drain_interval }}
    - disable_during_state_run: True
  inotify:
    - disable_during_state_run: True
    - coalesce: True
    - files:
        /opt/so/saltstack/local/salt/suricata/rules:
          mask:
            - close_write
            - moved_to
            - delete
          recurse: True
          auto_add: True
          exclude:
            - '\.sw[a-z]$':
                regex: True
            - '~$':
                regex: True
            - '/4913$':
                regex: True
            - '/\.#':
                regex: True
        /opt/so/saltstack/local/salt/strelka/rules/compiled:
          mask:
            - close_write
            - moved_to
            - delete
          recurse: True
          auto_add: True
          exclude:
            - '\.sw[a-z]$':
                regex: True
            - '~$':
                regex: True
            - '/4913$':
                regex: True
            - '/\.#':
                regex: True
@@ -15,6 +15,7 @@ include:
  - manager.elasticsearch
  - manager.kibana
  - manager.managed_soc_annotations
  - manager.beacons
 repo_log_dir:
  file.directory:
@@ -231,6 +232,7 @@ surifiltersrules:
    - user: 939
    - group: 939
 {% else %}
 {{sls}}_state_not_allowed:
@@ -0,0 +1,232 @@
 #!/opt/saltstack/salt/bin/python3
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
 # or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.
 """
 so-push-drainer
 ===============
 Scheduled drainer for the active-push feature. Runs on the manager every
 drain_interval seconds (default 15) via a salt schedule in salt/schedule.sls.
 For each intent file under /opt/so/state/push_pending/*.json whose last_touch
 is older than debounce_seconds, this script:
  * concatenates the actions lists from every ready intent
  * dedupes by (state or __highstate__, tgt, tgt_type)
  * dispatches a single `salt-run state.orchestrate orch.push_batch --async`
    with the deduped actions list passed as pillar kwargs
  * deletes the contributed intent files on successful dispatch
 Reactor sls files (push_suricata, push_strelka, push_pillar) write intents
 but never dispatch directly; this drainer performs the dispatch. (The full
 design lives in a developer-local plan, goofy-marinating-hummingbird.md.)
 """
 import fcntl
 import glob
 import json
 import logging
 import logging.handlers
 import os
 import subprocess
 import sys
 import time
 import salt.client
 # Directory scanned by main() for *.json intent files.
 PENDING_DIR = '/opt/so/state/push_pending'
 # flock target taken exclusively by main() for the duration of a drain pass.
 LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
 # Rotating log file written by the logger from _make_logger().
 LOG_FILE = '/opt/so/log/salt/so-push-drainer.log'
 # Dedupe key used in place of a state name for actions flagged `highstate`.
 HIGHSTATE_SENTINEL = '__highstate__'
 def _make_logger():
    """Return the 'so-push-drainer' logger, attaching its file handler once.

    The logger level is set to INFO on every call. A rotating file handler
    (5 MiB per file, 3 backups) writing to LOG_FILE is added only when the
    logger has no handlers yet; the log directory is created if missing.
    """
    drainer_log = logging.getLogger('so-push-drainer')
    drainer_log.setLevel(logging.INFO)
    if drainer_log.handlers:
        # Already configured by an earlier call; do not stack handlers.
        return drainer_log
    log_dir = os.path.dirname(LOG_FILE)
    os.makedirs(log_dir, exist_ok=True)
    rotating = logging.handlers.RotatingFileHandler(
        LOG_FILE, maxBytes=5 * 1024 * 1024, backupCount=3,
    )
    line_format = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')
    rotating.setFormatter(line_format)
    drainer_log.addHandler(rotating)
    return drainer_log
 def _load_push_cfg():
    """Fetch the global:push pillar subtree through a salt Caller client.

    Returns the pillar value when it is a dict, otherwise an empty dict.
    """
    push_cfg = salt.client.Caller().cmd('pillar.get', 'global:push', {})
    if isinstance(push_cfg, dict):
        return push_cfg
    return {}
 def _read_intent(path, log):
    """Load one intent file and return its decoded JSON, or None on failure.

    I/O errors and JSON/decoding errors are logged as warnings; anything else
    is logged with a traceback. No exception escapes to the caller.
    """
    intent = None
    try:
        with open(path, 'r') as handle:
            intent = json.loads(handle.read())
    except (IOError, ValueError) as err:
        log.warning('cannot read intent %s: %s', path, err)
    except Exception:
        log.exception('unexpected error reading %s', path)
    return intent
 def _dedupe_actions(actions):
    """Return actions with duplicates removed, keeping first occurrences in order.

    Two actions are duplicates when they share (state, tgt, tgt_type), where
    a truthy `highstate` flag substitutes HIGHSTATE_SENTINEL for the state
    name and a missing tgt_type counts as 'compound'. Entries that are not
    dicts, or that lack a state/highstate or a tgt, are dropped.
    """
    first_seen = {}
    for candidate in actions:
        if not isinstance(candidate, dict):
            continue
        if candidate.get('highstate'):
            what = HIGHSTATE_SENTINEL
        else:
            what = candidate.get('state')
        target = candidate.get('tgt')
        if what and target:
            identity = (what, target, candidate.get('tgt_type', 'compound'))
            # setdefault keeps the earliest action for each identity.
            first_seen.setdefault(identity, candidate)
    return list(first_seen.values())
 def _dispatch(actions, log):
    """Hand the actions list to the orch.push_batch orchestration via salt-run.

    The actions are JSON-encoded into the `pillar=` argument and salt-run is
    invoked with --async. Returns True when salt-run exits 0 within 60
    seconds; returns False (after logging) on a non-zero exit, a timeout, or
    any other exception raised while running it.
    """
    payload = json.dumps({'actions': actions})
    runner = ['salt-run', 'state.orchestrate', 'orch.push_batch']
    argv = runner + ['pillar={}'.format(payload), '--async']
    # Log the action count rather than the full pillar payload.
    summary = ' '.join(runner) + ' pillar=<{} actions>'.format(len(actions))
    log.info('dispatching: %s', summary)
    try:
        completed = subprocess.run(
            argv, check=True, capture_output=True, text=True, timeout=60,
        )
    except subprocess.CalledProcessError as err:
        log.error('dispatch failed (rc=%s): stdout=%s stderr=%s',
                  err.returncode, err.stdout, err.stderr)
        accepted = False
    except subprocess.TimeoutExpired:
        log.error('dispatch timed out after 60s')
        accepted = False
    except Exception:
        log.exception('dispatch raised')
        accepted = False
    else:
        log.info('dispatch accepted: %s', (completed.stdout or '').strip())
        accepted = True
    return accepted
 def _as_timestamp(value):
    """Return value as a float epoch timestamp, or None when it is not a real number."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    return float(value)
 def _remove_intents(paths, label, log):
    """Unlink each intent file; a failure is logged with its traceback and the loop continues."""
    for path in paths:
        try:
            os.unlink(path)
        except OSError:
            log.exception('failed to remove %s intent %s', label, path)
 def main():
    """Run one drain pass over the intent files in PENDING_DIR.

    Under an exclusive flock on LOCK_FILE: unreadable or non-dict intents are
    removed, intents touched within the debounce window are left alone, and
    the actions of all remaining intents are merged, deduped and dispatched
    in a single orchestration. Intent files are deleted only after a
    successful dispatch (or when they contain no usable actions).

    Malformed field values inside an otherwise valid intent (non-numeric
    timestamps, non-list actions/paths) are tolerated with a warning so that
    one bad file cannot crash every subsequent drain pass.

    Returns 0 when there was nothing to do or the dispatch was accepted, and
    1 when the push pillar could not be read or the dispatch failed.
    """
    log = _make_logger()
    if not os.path.isdir(PENDING_DIR):
        # Nothing to do; reactors create the dir on first use.
        return 0
    try:
        push = _load_push_cfg()
    except Exception:
        log.exception('failed to read global:push pillar; aborting drain pass')
        return 1
    if not push.get('enabled', True):
        log.debug('push disabled; exiting')
        return 0
    try:
        debounce_seconds = int(push.get('debounce_seconds', 30))
    except (TypeError, ValueError):
        # A bad pillar value must not abort the pass; fall back to the default.
        log.warning('invalid debounce_seconds %r in global:push; using 30',
                    push.get('debounce_seconds'))
        debounce_seconds = 30
    os.makedirs(PENDING_DIR, exist_ok=True)
    lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
        # '*.json' cannot match the '.lock' file, so no extra filtering is needed.
        intent_files = sorted(glob.glob(os.path.join(PENDING_DIR, '*.json')))
        if not intent_files:
            return 0
        now = time.time()
        ready = []
        skipped = 0
        broken = []
        for path in intent_files:
            intent = _read_intent(path, log)
            if not isinstance(intent, dict):
                broken.append(path)
                continue
            raw_touch = intent.get('last_touch', 0)
            last_touch = _as_timestamp(raw_touch)
            if last_touch is None:
                # Same outcome as a missing last_touch (default 0): ready now.
                log.warning('intent %s has non-numeric last_touch %r; treating as ready',
                            path, raw_touch)
                last_touch = 0.0
            if now - last_touch < debounce_seconds:
                skipped += 1
                continue
            ready.append((path, intent))
        _remove_intents(broken, 'broken', log)
        if not ready:
            if skipped:
                log.debug('no ready intents (%d still in debounce window)', skipped)
            return 0
        combined_actions = []
        oldest_first_touch = now
        all_paths = []
        for path, intent in ready:
            actions = intent.get('actions')
            if isinstance(actions, list):
                combined_actions.extend(actions)
            elif actions:
                log.warning('intent %s has non-list actions; ignoring them', path)
            # first_touch only feeds the debounce_duration log figure below.
            first = _as_timestamp(intent.get('first_touch', now))
            if first is not None and first < oldest_first_touch:
                oldest_first_touch = first
            paths = intent.get('paths')
            if isinstance(paths, list):
                all_paths.extend(paths)
        deduped = _dedupe_actions(combined_actions)
        if not deduped:
            log.warning('%d intent(s) had no usable actions; clearing', len(ready))
            _remove_intents([path for path, _ in ready], 'unusable', log)
            return 0
        debounce_duration = now - oldest_first_touch
        log.info(
            'draining %d intent(s): %d action(s) after dedupe (raw=%d), '
            'debounce_duration=%.1fs, paths=%s',
            len(ready), len(deduped), len(combined_actions),
            debounce_duration, all_paths[:20],
        )
        if not _dispatch(deduped, log):
            log.warning('dispatch failed; leaving intent files in place for retry')
            return 1
        _remove_intents([path for path, _ in ready], 'drained', log)
        return 0
    finally:
        # Always release the lock and close the descriptor, even on early return.
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
        finally:
            os.close(lock_fd)
 # Script entry point: the process exit status is main()'s return value (0 or 1).
 if __name__ == '__main__':
    sys.exit(main())
@@ -343,11 +343,10 @@ highstate() {
 masterlock() {
  echo "Locking Salt Master"
  mv -v $TOPFILE $BACKUPTOPFILE
-  # Render the real top file only for the host running soup; every other
+  echo "base:" > $TOPFILE
-  # minion gets an empty top (no states) while the master is upgrading.
+  echo "  $MINIONID:" >> $TOPFILE
-  echo "{% if grains['id'] == '$MINIONID' %}" > $TOPFILE
+  echo "    - ca" >> $TOPFILE
-  cat $BACKUPTOPFILE >> $TOPFILE
+  echo "    - elasticsearch" >> $TOPFILE
  echo "{% endif %}" >> $TOPFILE
 }
 masterunlock() {
@@ -34,6 +34,7 @@ make-rule-dir-nginx:
 so-nginx:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-nginx:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - hostname: so-nginx
    - networks:
      - sobridge:
@@ -0,0 +1,37 @@
 # Orchestration that applies a list of push actions. The work list is read
 # from the `actions` pillar key (so-push-drainer passes it on the salt-run
 # command line). Each action yields either one highstate step or a
 # refresh-pillar step followed by a single-sls apply step. Per-action
 # `batch` / `batch_wait` values override the global push defaults.
 {% from 'global/map.jinja' import GLOBALMERGED %}
 {% set actions = salt['pillar.get']('actions', []) %}
 {% set BATCH = GLOBALMERGED.push.batch %}
 {% set BATCH_WAIT = GLOBALMERGED.push.batch_wait %}
 {% for action in actions %}
 {%   if action.get('highstate') %}
 # Highstate action: run a full highstate on the target. The loop index keeps
 # state ids unique across actions.
 apply_highstate_{{ loop.index }}:
  salt.state:
    - tgt: '{{ action.tgt }}'
    - tgt_type: {{ action.get('tgt_type', 'compound') }}
    - highstate: True
    - batch: {{ action.get('batch', BATCH) }}
    - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }}
    - kwarg:
        queue: 2
 {%   else %}
 # State action: refresh pillar on the target first, then apply the one sls.
 # The apply step requires the refresh step of the same action.
 refresh_pillar_{{ loop.index }}:
  salt.function:
    - name: saltutil.refresh_pillar
    - tgt: '{{ action.tgt }}'
    - tgt_type: {{ action.get('tgt_type', 'compound') }}
 apply_{{ action.state | replace('.', '_') }}_{{ loop.index }}:
  salt.state:
    - tgt: '{{ action.tgt }}'
    - tgt_type: {{ action.get('tgt_type', 'compound') }}
    - sls:
      - {{ action.state }}
    - batch: {{ action.get('batch', BATCH) }}
    - batch_wait: {{ action.get('batch_wait', BATCH_WAIT) }}
    - kwarg:
        queue: 2
    - require:
      - salt: refresh_pillar_{{ loop.index }}
 {%   endif %}
 {% endfor %}
@@ -0,0 +1,240 @@
 # One pillar directory can map to multiple (state, tgt) actions.
 # tgt is a raw salt compound expression. tgt_type is always "compound".
 # Per-action `batch` / `batch_wait` override the orch defaults (25% / 15s).
 # An action with `highstate: True` triggers state.highstate instead of
 # state.apply -- see salt/orch/push_batch.sls.
 #
 # Notes:
 #   - `bpf` is a pillar-only dir (no state of its own) consumed by both
 #     zeek and suricata via macros, so a bpf pillar change re-applies both.
 #   - suricata/strelka/zeek/elasticsearch/redis/kafka/logstash etc. have
 #     their own pillar dirs AND their own state, so they map 1:1. strelka
 #     also maps 1:1: only the sensor-side `strelka` state is pushed, and
 #     strelka.manager is deliberately left out (see the strelka entry).
 #
 # Intentional omissions (these will log a "not in pillar_push_map.yaml"
 # warning in push_pillar.sls and wait for the next scheduled highstate):
 #   - `data` and `node_data`: pillar-only data consumed by many states;
 #     handling them generically would amount to a fleetwide highstate.
 #   - `host`: soc_host describes mainint/mainip; a change is a re-IP and
 #     needs a coordinated procedure, not an immediate state push.
 #   - `hypervisor`: state changes touch libvirt and are disruptive; leave
 #     to the next scheduled highstate.
 #   - `sensor`: every field in soc_sensor.yaml is `readonly: True` or
 #     per-minion (`node: True`). Per-minion edits are persisted under
 #     pillar/minions/<id>.sls. Branch A of push_pillar.sls retargets an
 #     app's mapped actions to the one node, so with no `sensor` entry in
 #     this map those edits also wait for the next scheduled highstate.
 #
 # The role sets here were verified line-by-line against salt/top.sls. If
 # salt/top.sls changes how an app is targeted, update the corresponding
 # compound here.
 # firewall: the one pillar everyone touches. Applied everywhere intentionally
 # because every host's iptables needs to know about every other host in the
 # grid. Salt's firewall state is idempotent (file.managed + iptables-restore
 # onchanges in salt/firewall/init.sls), so hosts whose rendered firewall is
 # unchanged do a file comparison and no-op without touching iptables -- actual
 # reload happens only on the hosts whose rules actually changed. Fleetwide
 # blast radius is intentional and matches the pre-plan behavior via highstate.
 # Adding N sensors in a burst coalesces into one dispatch via the drainer.
 firewall:
  - state: firewall
    tgt: '*'
 # backup: backup.config_backup runs on eval, standalone, manager, managerhype,
 # managersearch (NOT import -- the backup pillar is included on import per
 # pillar/top.sls but the backup state is not run there per salt/top.sls).
 backup:
  - state: backup.config_backup
    tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # bpf is pillar-only (no state); consumed by both zeek and suricata as macros.
 # Both states run on sensor_roles + so-import per salt/top.sls.
 bpf:
  - state: zeek
    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
  - state: suricata
    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
 # ca is applied universally.
 ca:
  - state: ca
    tgt: '*'
 # docker: universal. The docker state is in both the all-non-managers and
 # all-managers branches of salt/top.sls.
 docker:
  - state: docker
    tgt: '*'
 # elastalert: eval, standalone, manager, managerhype, managersearch (NOT import).
 elastalert:
  - state: elastalert
    tgt: 'G@role:so-eval or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # elastic-fleet-package-registry: manager_roles exactly.
 elastic-fleet-package-registry:
  - state: elastic-fleet-package-registry
    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # elasticsearch: 8 roles.
 elasticsearch:
  - state: elasticsearch
    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-standalone'
 # elasticagent: so-heavynode only.
 elasticagent:
  - state: elasticagent
    tgt: 'G@role:so-heavynode'
 # elasticfleet: base state only on pillar change. elasticfleet.install_agent_grid
 # is a deploy/enrollment step, not a config reload; leave it to the next highstate.
 elasticfleet:
  - state: elasticfleet
    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # global: fanout to a fleetwide highstate. The global pillar (soc_global.sls)
 # carries cross-cutting settings (pipeline, url_base, imagerepo, mdengine, ...)
 # that are consumed by virtually every state, so a targeted re-apply isn't
 # meaningful. The drainer's batch/batch_wait throttling controls blast radius.
 global:
  - highstate: True
    tgt: '*'
 # healthcheck: eval, sensor, standalone only.
 healthcheck:
  - state: healthcheck
    tgt: 'G@role:so-eval or G@role:so-sensor or G@role:so-standalone'
 # hydra: manager_roles exactly.
 hydra:
  - state: hydra
    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # idh: so-idh only.
 idh:
  - state: idh
    tgt: 'G@role:so-idh'
 # influxdb: manager_roles exactly.
 influxdb:
  - state: influxdb
    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # kafka: standalone, manager, managerhype, managersearch, searchnode, receiver.
 kafka:
  - state: kafka
    tgt: 'G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
 # kibana: manager_roles exactly.
 kibana:
  - state: kibana
    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # kratos: manager_roles exactly.
 kratos:
  - state: kratos
    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # logrotate: universal (top-of-file '*' branch in salt/top.sls).
 logrotate:
  - state: logrotate
    tgt: '*'
 # logstash: 8 roles, no eval/import.
 logstash:
  - state: logstash
    tgt: 'G@role:so-fleet or G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-standalone'
 # manager: manager_roles exactly. The manager state is also referenced under
 # *_sensor / *_heavynode top.sls blocks via `sensor`, but the standalone
 # `manager` state itself runs only on manager_roles.
 manager:
  - state: manager
    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # nginx: 10 specific roles. NOT receiver, idh, hypervisor, desktop.
 nginx:
  - state: nginx
    tgt: 'G@role:so-eval or G@role:so-fleet or G@role:so-heavynode or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
 # ntp: universal (top-of-file '*' branch in salt/top.sls).
 ntp:
  - state: ntp
    tgt: '*'
 # patch: universal. soc_patch carries the OS update schedule, applied via
 # patch.os.schedule on every node (it's in both the all-non-managers and
 # all-managers branches of salt/top.sls).
 patch:
  - state: patch.os.schedule
    tgt: '*'
 # postgres: manager_roles exactly.
 postgres:
  - state: postgres
    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # redis: 6 roles. standalone, manager, managerhype, managersearch, heavynode, receiver.
 # (NOT eval, NOT import, NOT searchnode.)
 redis:
  - state: redis
    tgt: 'G@role:so-heavynode or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-standalone'
 # registry: manager_roles exactly.
 registry:
  - state: registry
    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # sensoroni: universal.
 sensoroni:
  - state: sensoroni
    tgt: '*'
 # soc: manager_roles exactly.
 soc:
  - state: soc
    tgt: 'G@role:so-eval or G@role:so-import or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-standalone'
 # stig: broad. Runs on standalone, manager, managerhype, managersearch,
 # searchnode, sensor, receiver, fleet, hypervisor, desktop.
 # NOT eval, NOT import, NOT heavynode, NOT idh (the *_idh block in
 # salt/top.sls intentionally omits stig).
 stig:
  - state: stig
    tgt: 'G@role:so-desktop or G@role:so-fleet or G@role:so-hypervisor or G@role:so-manager or G@role:so-managerhype or G@role:so-managersearch or G@role:so-receiver or G@role:so-searchnode or G@role:so-sensor or G@role:so-standalone'
 # strelka: sensor-side only on pillar change (sensor_roles). strelka.manager is
 # intentionally NOT fired on pillar changes -- YARA rule and strelka config
 # pillar changes are consumed by the sensor-side strelka backend, and re-running
 # strelka.manager on managers is both unnecessary and disruptive. strelka.manager
 # is left to the 2-hour highstate.
 strelka:
  - state: strelka
    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-sensor or G@role:so-standalone'
 # suricata: sensor_roles + so-import (5 roles).
 suricata:
  - state: suricata
    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
 # telegraf: universal.
 telegraf:
  - state: telegraf
    tgt: '*'
 # versionlock: universal (top-of-file '*' branch in salt/top.sls).
 versionlock:
  - state: versionlock
    tgt: '*'
 # vm: libvirt-driver hypervisors only. Matched by the salt-cloud:driver:libvirt
 # grain (compound supports nested grain matching via G@<key>:<subkey>:<value>).
 # pillar/vm/soc_vm.sls write path is referenced at salt/_runners/setup_hypervisor.py:856.
 vm:
  - state: vm
    tgt: 'G@salt-cloud:driver:libvirt'
 # zeek: sensor_roles + so-import (5 roles).
 zeek:
  - state: zeek
    tgt: 'G@role:so-eval or G@role:so-heavynode or G@role:so-import or G@role:so-sensor or G@role:so-standalone'
@@ -0,0 +1,176 @@
 #!py
 # Reactor invoked by the pillar_db beacon when SOC records settings changes in
 # the so_soc.audit_settings table (see salt/_beacons/pillar_db.py). The beacon
 # emits one event per new row carrying setting_id and node_id.
 #
 # Two branches, keyed on node_id:
 #   A) node_id populated -> the change is scoped to that one minion. Look up the
 #      app in pillar_push_map.yaml and write an intent that runs the app's mapped
 #      state(s) targeted to just that node.
 #   B) node_id empty -> grid-wide app change. Look up the app in
 #      pillar_push_map.yaml and write an intent with the entry's actions as-is.
 #
 # The app name is the first dotted segment of setting_id (e.g. "telegraf.output"
 # -> "telegraf"), which matches the pillar_push_map.yaml keys 1:1.
 #
 # Reactors never dispatch directly. The so-push-drainer schedule picks up
 # ready intents, dedupes across pending files, and dispatches orch.push_batch.
 import fcntl
 import json
 import logging
 import os
 import time
 from salt.client import Caller
 import yaml
 LOG = logging.getLogger(__name__)
 PENDING_DIR = '/opt/so/state/push_pending'
 LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
 MAX_PATHS = 20
 # The pillar_push_map.yaml is shipped via salt:// but the reactor runs on the
 # master, which mounts the default saltstack tree at this path.
 PUSH_MAP_PATH = '/opt/so/saltstack/default/salt/reactor/pillar_push_map.yaml'
 _PUSH_MAP_CACHE = {'mtime': 0, 'data': None}
 def _load_push_map():
    """Return the parsed pillar_push_map.yaml as a dict, reloading on mtime change.

    The parsed document is kept in _PUSH_MAP_CACHE and re-read only when the
    file's mtime differs from the cached one. Returns an empty dict when the
    file is missing, cannot be parsed, or does not hold a mapping at the top
    level, so callers can always use ``.get()`` on the result.
    """
    try:
        st = os.stat(PUSH_MAP_PATH)
    except OSError:
        LOG.warning('push_pillar: %s not found', PUSH_MAP_PATH)
        return {}
    if _PUSH_MAP_CACHE['mtime'] != st.st_mtime:
        loaded = {}
        try:
            with open(PUSH_MAP_PATH, 'r') as f:
                loaded = yaml.safe_load(f) or {}
        except Exception:
            LOG.exception('push_pillar: failed to load %s', PUSH_MAP_PATH)
            loaded = {}
        # A syntactically valid file whose top level is a list or scalar would
        # otherwise be handed to callers that expect a mapping.
        if not isinstance(loaded, dict):
            LOG.error(
                'push_pillar: %s must hold a mapping at the top level, got %s; ignoring it',
                PUSH_MAP_PATH, type(loaded).__name__,
            )
            loaded = {}
        _PUSH_MAP_CACHE['data'] = loaded
        # Record the mtime even after a failed load so a broken file is not
        # re-parsed on every event until it is edited again.
        _PUSH_MAP_CACHE['mtime'] = st.st_mtime
    return _PUSH_MAP_CACHE['data'] or {}
 def _push_enabled():
    """Return the truthiness of pillar global:push:enabled (default True).

    Any failure while querying the pillar is logged and treated as enabled.
    """
    try:
        value = Caller().cmd('pillar.get', 'global:push:enabled', True)
        enabled = bool(value)
    except Exception:
        LOG.exception('push_pillar: pillar.get global:push:enabled failed, assuming enabled')
        enabled = True
    return enabled
 def _write_intent(key, actions, path):
    """Create or update the pending intent file ``<PENDING_DIR>/<key>.json``.

    The update runs under an exclusive flock on LOCK_FILE and is published by
    writing a ``.tmp`` sibling and renaming it over the intent file.

    key     -- intent file name without the ``.json`` suffix.
    actions -- list of action dicts; replaces any actions already stored.
    path    -- label of what triggered the intent; appended to the intent's
               ``paths`` list (deduplicated, last MAX_PATHS kept) when non-empty.

    ``first_touch`` is kept from an existing intent and ``last_touch`` is set
    to the current time. Every failure is logged and swallowed; nothing is
    returned.
    """
    now = time.time()
    try:
        os.makedirs(PENDING_DIR, exist_ok=True)
    except OSError:
        LOG.exception('push_pillar: cannot create %s', PENDING_DIR)
        return
    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
    # Opening the lock file can fail too (permissions, full disk); log it like
    # every other failure here instead of raising out of the reactor.
    try:
        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
    except OSError:
        LOG.exception('push_pillar: cannot open lock file %s', LOCK_FILE)
        return
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
        intent = {}
        if os.path.exists(intent_path):
            try:
                with open(intent_path, 'r') as f:
                    intent = json.load(f)
            except (IOError, ValueError):
                intent = {}
        # Valid JSON that is not an object (e.g. a list or null) is treated
        # like a corrupt file; otherwise every later update would fail.
        if not isinstance(intent, dict):
            intent = {}
        intent.setdefault('first_touch', now)
        intent['last_touch'] = now
        intent['actions'] = actions
        paths = intent.get('paths')
        if not isinstance(paths, list):
            paths = []
        if path and path not in paths:
            paths.append(path)
            paths = paths[-MAX_PATHS:]
        intent['paths'] = paths
        tmp_path = intent_path + '.tmp'
        with open(tmp_path, 'w') as f:
            json.dump(intent, f)
        os.rename(tmp_path, intent_path)
    except Exception:
        LOG.exception('push_pillar: failed to write intent %s', intent_path)
    finally:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
        finally:
            os.close(lock_fd)
 def _app_from_setting(setting_id):
    # setting_id is e.g. 'telegraf.output' -> 'telegraf', 'ntp.config.servers' -> 'ntp'
    if not setting_id:
        return None
    return setting_id.split('.', 1)[0] or None
 def _node_actions(entry, node_id):
    # Copy the app's mapped actions but retarget each one to the single node.
    # Preserves the state/highstate selection and any batch/batch_wait overrides.
    actions = []
    for action in entry:
        if not isinstance(action, dict):
            continue
        node_action = dict(action)
        node_action['tgt'] = node_id
        node_action['tgt_type'] = 'glob'
        actions.append(node_action)
    return actions
 def run():
    """Reactor entry point: turn one audit_settings event into a push intent.

    Reads setting_id and node_id from the event, resolves the app (first
    dotted segment of setting_id) in pillar_push_map.yaml and writes either a
    per-node intent (node_id set) or a grid-wide app intent (node_id empty).
    Always returns {}; nothing is dispatched from here.
    """
    if not _push_enabled():
        LOG.info('push_pillar: push disabled, skipping')
        return {}
    # The payload may be nested under data['data'] or sit at the top level;
    # accept either shape.
    payload = data.get('data', data)  # noqa: F821 -- data provided by reactor
    setting_id = payload.get('setting_id', '')
    node_id = (payload.get('node_id') or '').strip()
    app = _app_from_setting(setting_id)
    if not app:
        LOG.debug('push_pillar: ignoring event with no app segment: setting_id=%s', setting_id)
        return {}
    entry = _load_push_map().get(app)
    if not entry:
        LOG.warning(
            'push_pillar: app "%s" is not in pillar_push_map.yaml; change will be '
            'picked up at the next scheduled highstate (setting_id=%s)',
            app, setting_id,
        )
        return {}
    if not node_id:
        # Grid-wide change: hand over a copy of the mapped actions so the
        # cached map entry is never mutated.
        _write_intent('pillar_{}'.format(app), list(entry), 'audit:{}'.format(setting_id))
        LOG.info('push_pillar: app intent updated for %s (setting_id=%s)', app, setting_id)
        return {}
    # Per-node change: same mapped actions, retargeted to the single node.
    node_scoped = _node_actions(entry, node_id)
    if node_scoped:
        _write_intent(
            'node_{}_{}'.format(node_id, app), node_scoped,
            'audit:{}@{}'.format(setting_id, node_id),
        )
        LOG.info('push_pillar: per-node intent updated for %s on %s (setting_id=%s)',
                 app, node_id, setting_id)
    else:
        LOG.warning('push_pillar: no usable actions for app "%s" (setting_id=%s)', app, setting_id)
    return {}
@@ -0,0 +1,96 @@
 #!py
 # Reactor invoked by the inotify beacon on rule file changes under
 # /opt/so/saltstack/local/salt/strelka/rules/compiled/.
 #
 # Writes (or updates) a push intent at /opt/so/state/push_pending/rules_strelka.json
 # and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
 # across pending files, and dispatches orch.push_batch. Reactors never dispatch
 # directly; dispatching is left to the drainer.
 import fcntl
 import json
 import logging
 import os
 import time
 from salt.client import Caller
 LOG = logging.getLogger(__name__)
 PENDING_DIR = '/opt/so/state/push_pending'
 LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
 MAX_PATHS = 20
 # Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Sensor-side
 # strelka runs on exactly these four roles; so-import gets strelka.manager
 # instead, which is not fired on pillar changes.
 SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
 def _sensor_compound():
    """Return a compound target matching any role listed in SENSOR_ROLES."""
    terms = ['G@role:{}'.format(role) for role in SENSOR_ROLES]
    return ' or '.join(terms)
 def _push_enabled():
    """Return the truthiness of pillar global:push:enabled (default True).

    Any failure while querying the pillar is logged and treated as enabled.
    """
    try:
        value = Caller().cmd('pillar.get', 'global:push:enabled', True)
        enabled = bool(value)
    except Exception:
        LOG.exception('push_strelka: pillar.get global:push:enabled failed, assuming enabled')
        enabled = True
    return enabled
 def _write_intent(key, actions, path):
    """Create or update the pending intent file ``<PENDING_DIR>/<key>.json``.

    The update runs under an exclusive flock on LOCK_FILE and is published by
    writing a ``.tmp`` sibling and renaming it over the intent file.

    key     -- intent file name without the ``.json`` suffix.
    actions -- list of action dicts; replaces any actions already stored.
    path    -- changed path that triggered the intent; appended to the
               intent's ``paths`` list (deduplicated, last MAX_PATHS kept)
               when non-empty.

    ``first_touch`` is kept from an existing intent and ``last_touch`` is set
    to the current time. Every failure is logged and swallowed; nothing is
    returned.
    """
    now = time.time()
    try:
        os.makedirs(PENDING_DIR, exist_ok=True)
    except OSError:
        LOG.exception('push_strelka: cannot create %s', PENDING_DIR)
        return
    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
    # Opening the lock file can fail too (permissions, full disk); log it like
    # every other failure here instead of raising out of the reactor.
    try:
        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
    except OSError:
        LOG.exception('push_strelka: cannot open lock file %s', LOCK_FILE)
        return
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
        intent = {}
        if os.path.exists(intent_path):
            try:
                with open(intent_path, 'r') as f:
                    intent = json.load(f)
            except (IOError, ValueError):
                intent = {}
        # Valid JSON that is not an object (e.g. a list or null) is treated
        # like a corrupt file; otherwise every later update would fail.
        if not isinstance(intent, dict):
            intent = {}
        intent.setdefault('first_touch', now)
        intent['last_touch'] = now
        intent['actions'] = actions
        paths = intent.get('paths')
        if not isinstance(paths, list):
            paths = []
        if path and path not in paths:
            paths.append(path)
            paths = paths[-MAX_PATHS:]
        intent['paths'] = paths
        tmp_path = intent_path + '.tmp'
        with open(tmp_path, 'w') as f:
            json.dump(intent, f)
        os.rename(tmp_path, intent_path)
    except Exception:
        LOG.exception('push_strelka: failed to write intent %s', intent_path)
    finally:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
        finally:
            os.close(lock_fd)
 def run():
    """Reactor entry point: record a strelka rules push intent.

    Writes/updates the 'rules_strelka' intent targeting the sensor roles with
    the 'strelka' state, unless pushing is disabled. Always returns {}.
    """
    if _push_enabled():
        changed = data.get('path', '')  # noqa: F821 -- data provided by reactor
        _write_intent(
            'rules_strelka',
            [{'state': 'strelka', 'tgt': _sensor_compound()}],
            changed,
        )
        LOG.info('push_strelka: intent updated for path=%s', changed)
    else:
        LOG.info('push_strelka: push disabled, skipping')
    return {}
@@ -0,0 +1,95 @@
 #!py
 # Reactor invoked by the inotify beacon on rule file changes under
 # /opt/so/saltstack/local/salt/suricata/rules/.
 #
 # Writes (or updates) a push intent at /opt/so/state/push_pending/rules_suricata.json
 # and returns {}. The so-push-drainer schedule picks up ready intents, dedupes
 # across pending files, and dispatches orch.push_batch. Reactors never dispatch
 # directly; dispatching is left to the drainer.
 import fcntl
 import json
 import logging
 import os
 import time
 from salt.client import Caller
 LOG = logging.getLogger(__name__)
 PENDING_DIR = '/opt/so/state/push_pending'
 LOCK_FILE = os.path.join(PENDING_DIR, '.lock')
 MAX_PATHS = 20
 # Mirrors GLOBALS.sensor_roles in salt/vars/globals.map.jinja. Suricata also
 # runs on so-import per salt/top.sls, so that role is appended below.
 SENSOR_ROLES = ['so-eval', 'so-heavynode', 'so-sensor', 'so-standalone']
 def _sensor_compound_plus_import():
    """Return a compound target matching any SENSOR_ROLES role or so-import."""
    terms = ['G@role:{}'.format(role) for role in SENSOR_ROLES]
    return ' or '.join(terms) + ' or G@role:so-import'
 def _push_enabled():
    """Return the truthiness of pillar global:push:enabled (default True).

    Any failure while querying the pillar is logged and treated as enabled.
    """
    try:
        value = Caller().cmd('pillar.get', 'global:push:enabled', True)
        enabled = bool(value)
    except Exception:
        LOG.exception('push_suricata: pillar.get global:push:enabled failed, assuming enabled')
        enabled = True
    return enabled
 def _write_intent(key, actions, path):
    """Create or update the pending intent file ``<PENDING_DIR>/<key>.json``.

    The update runs under an exclusive flock on LOCK_FILE and is published by
    writing a ``.tmp`` sibling and renaming it over the intent file.

    key     -- intent file name without the ``.json`` suffix.
    actions -- list of action dicts; replaces any actions already stored.
    path    -- changed path that triggered the intent; appended to the
               intent's ``paths`` list (deduplicated, last MAX_PATHS kept)
               when non-empty.

    ``first_touch`` is kept from an existing intent and ``last_touch`` is set
    to the current time. Every failure is logged and swallowed; nothing is
    returned.
    """
    now = time.time()
    try:
        os.makedirs(PENDING_DIR, exist_ok=True)
    except OSError:
        LOG.exception('push_suricata: cannot create %s', PENDING_DIR)
        return
    intent_path = os.path.join(PENDING_DIR, '{}.json'.format(key))
    # Opening the lock file can fail too (permissions, full disk); log it like
    # every other failure here instead of raising out of the reactor.
    try:
        lock_fd = os.open(LOCK_FILE, os.O_CREAT | os.O_RDWR, 0o644)
    except OSError:
        LOG.exception('push_suricata: cannot open lock file %s', LOCK_FILE)
        return
    try:
        fcntl.flock(lock_fd, fcntl.LOCK_EX)
        intent = {}
        if os.path.exists(intent_path):
            try:
                with open(intent_path, 'r') as f:
                    intent = json.load(f)
            except (IOError, ValueError):
                intent = {}
        # Valid JSON that is not an object (e.g. a list or null) is treated
        # like a corrupt file; otherwise every later update would fail.
        if not isinstance(intent, dict):
            intent = {}
        intent.setdefault('first_touch', now)
        intent['last_touch'] = now
        intent['actions'] = actions
        paths = intent.get('paths')
        if not isinstance(paths, list):
            paths = []
        if path and path not in paths:
            paths.append(path)
            paths = paths[-MAX_PATHS:]
        intent['paths'] = paths
        tmp_path = intent_path + '.tmp'
        with open(tmp_path, 'w') as f:
            json.dump(intent, f)
        os.rename(tmp_path, intent_path)
    except Exception:
        LOG.exception('push_suricata: failed to write intent %s', intent_path)
    finally:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_UN)
        finally:
            os.close(lock_fd)
 def run():
    """Reactor entry point: record a suricata rules push intent.

    Writes/updates the 'rules_suricata' intent targeting the sensor roles plus
    so-import with the 'suricata' state, unless pushing is disabled. Always
    returns {}.
    """
    if _push_enabled():
        changed = data.get('path', '')  # noqa: F821 -- data provided by reactor
        _write_intent(
            'rules_suricata',
            [{'state': 'suricata', 'tgt': _sensor_compound_plus_import()}],
            changed,
        )
        LOG.info('push_suricata: intent updated for path=%s', changed)
    else:
        LOG.info('push_suricata: push disabled, skipping')
    return {}
@@ -17,6 +17,7 @@ include:
 so-redis:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - hostname: so-redis
    - user: socore
    - networks:
@@ -21,6 +21,9 @@ so-dockerregistry:
    - networks:
      - sobridge:
        - ipv4_address: {{ DOCKERMERGED.containers['so-dockerregistry'].ip }}
    # Intentionally `always` (not unless-stopped) -- registry is critical infra
    # and must come back up even if it was manually stopped. Do not homogenize
    # to unless-stopped; see the container auto-restart section of the plan.
    - restart_policy: always
    - port_bindings:
      {% for BINDING in DOCKERMERGED.containers['so-dockerregistry'].port_bindings %}
@@ -3,7 +3,7 @@
 {% set SCHEDULE = salt['pillar.get']('healthcheck:schedule', 30) %}
 include:
-  - salt
+  - salt.minion
 {% if CHECKS and ENABLED %}
 salt_beacons:
@@ -23,3 +23,4 @@ salt_beacons:
    - watch_in:
      - service: salt_minion_service
 {% endif %}
@@ -0,0 +1,11 @@
 # Maps beacon event tags to the reactor SLS files that handle them.
 reactor:
  # inotify events for the suricata rules directory itself and for entries
  # beneath it (the trailing /* tag) both go to the suricata push reactor.
  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/suricata/rules':
    - salt://reactor/push_suricata.sls
  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/suricata/rules/*':
    - salt://reactor/push_suricata.sls
  # Same pairing for the compiled strelka rules directory.
  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/strelka/rules/compiled':
    - salt://reactor/push_strelka.sls
  - 'salt/beacon/*/inotify//opt/so/saltstack/local/salt/strelka/rules/compiled/*':
    - salt://reactor/push_strelka.sls
  # audit_settings events from the pillar_db beacon go to the pillar push reactor.
  - 'salt/beacon/*/pillar_db/audit_settings':
    - salt://reactor/push_pillar.sls
@@ -5,3 +5,11 @@ salt_bootstrap:
    - source: salt://salt/scripts/bootstrap-salt.sh
    - mode: 755
    - show_changes: False
 salt_sbin:
  file.recurse:
    - name: /usr/sbin
    - source: salt://salt/tools/sbin
    - user: 939
    - group: 939
    - file_mode: 755
@@ -1,4 +1,4 @@
 lasthighstate:
  file.touch:
    - name: /opt/so/log/salt/lasthighstate
-    - order: last
+    - order: 9001
@@ -10,10 +10,12 @@
 #    software that is protected by the license key."
 {% from 'allowed_states.map.jinja' import allowed_states %}
 {% from 'global/map.jinja' import GLOBALMERGED %}
 {% if sls in allowed_states %}
 include:
  - salt.minion
  - salt.master.pyinotify
  - salt.master.boot_mine_update
 {%   if 'vrt' in salt['pillar.get']('features', []) %}
  - salt.cloud
@@ -63,6 +65,21 @@ engines_config:
    - name: /etc/salt/master.d/engines.conf
    - source: salt://salt/files/engines.conf
 {% if GLOBALMERGED.push.enabled %}
 reactor_pushstate_config:
  file.managed:
    - name: /etc/salt/master.d/reactor_pushstate.conf
    - source: salt://salt/files/reactor_pushstate.conf
    - watch_in:
      - service: salt_master_service
 {% else %}
 reactor_pushstate_config:
  file.absent:
    - name: /etc/salt/master.d/reactor_pushstate.conf
    - watch_in:
      - service: salt_master_service
 {% endif %}
 # update the bootstrap script when used for salt-cloud
 salt_bootstrap_cloud:
  file.managed:
@@ -78,7 +95,7 @@ salt_master_service:
      - file: checkmine_engine
      - file: pillarWatch_engine
      - file: engines_config
-    - order: last
+    - order: 9002
 {% else %}
@@ -0,0 +1,20 @@
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
 # or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.
 # Mirror the bundled pyinotify package files onto the host. clean: True removes
 # anything in the destination that is not present in the source.
 pyinotify_module_package:
  file.recurse:
    - name: /opt/so/conf/salt/module_packages/pyinotify
    - source: salt://salt/module_packages/pyinotify
    - clean: True
    - makedirs: True
 # Install pyinotify into salt's bundled python from the local package directory
 # only (--no-index / --find-links, so no package index is contacted). Runs only
 # when the package files changed, stops the run on failure (failhard), and
 # triggers salt_minion_service through watch_in when it reports changes.
 pyinotify_python_module_install:
  cmd.run:
    - name: /opt/saltstack/salt/bin/python3.10 -m pip install pyinotify --no-index --find-links=/opt/so/conf/salt/module_packages/pyinotify/ --upgrade
    - onchanges:
      - file: pyinotify_module_package
    - failhard: True
    - watch_in:
      - service: salt_minion_service
@@ -2,4 +2,3 @@
 salt:
  minion:
    version: '3006.19'
    check_threshold: 3600 # in seconds, threshold used for so-salt-minion-check. any value less than 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default
@@ -111,13 +111,17 @@ mark_setup_complete_for_upgrades:
 {% endif %}
-# this has to be outside the if statement above since there are <requisite>_in calls to this state
+# this has to be outside the if statement above since there are <requisite>_in calls to this state.
 # uses watch (not listen) so the restart fires in-state and its result lands on this state's
 # running entry; that is what lets wait_for_salt_minion_ready below detect any restart
 # uniformly via onchanges, regardless of whether the trigger came from these files or from
 # external watch_in's (e.g. beacons, master/pyinotify).
 salt_minion_service:
  service.running:
    - name: salt-minion
    - enable: True
    - onlyif: test "{{INSTALLEDSALTVERSION}}" == "{{SALTVERSION}}"
-    - listen:
+    - watch:
      - file: mine_functions
 {% if INSTALLEDSALTVERSION|string == SALTVERSION|string %}
      - file: set_log_levels
@@ -126,3 +130,17 @@ salt_minion_service:
      - file: signing_policy
 {% endif %}
    - order: last
 # block until the just-restarted salt-minion is back and can execute modules locally, so
 # follow-on jobs and the next highstate iteration do not race the restart. onchanges +
 # require on salt_minion_service catches every restart trigger uniformly because watch
 # mod_watch results replace the service state's running entry. wait logic lives in
 # /usr/sbin/so-salt-minion-wait (deployed by common_sbin from common/tools/sbin/).
 wait_for_salt_minion_ready:
  cmd.run:
    - name: /usr/sbin/so-salt-minion-wait
    - onchanges:
      - service: salt_minion_service
    - require:
      - service: salt_minion_service
    - order: last
@@ -0,0 +1,35 @@
 #!/bin/bash
 #
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
 # or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.
 # Block until the local salt-minion service is back up and can execute modules locally.
 # Invoked from the wait_for_salt_minion_ready state in salt/minion/init.sls after
 # salt_minion_service fires its watch-driven mod_watch (a non-blocking systemctl restart),
 # so follow-on jobs and the next highstate iteration do not race the in-flight restart.
 . /usr/sbin/so-common
 # Initial sleep gives the systemctl restart (--no-block by default for salt-minion on
 # >=3006.15) time to begin tearing down the old process before we probe for readiness.
 INITIAL_SLEEP=3
 TIMEOUT=120
 PING_TIMEOUT=5
 # Measure the wait with bash's SECONDS counter (wall-clock seconds since it was last
 # assigned). Each loop pass also spends time inside the salt-call probe, so counting
 # one second per pass would let the real wait run well past TIMEOUT and would
 # under-report the elapsed time in the messages below.
 SECONDS=0
 sleep "$INITIAL_SLEEP"
 while [ "$SECONDS" -lt "$TIMEOUT" ]; do
  # Ready means: the unit is active AND a local test.ping succeeds.
  if systemctl is-active --quiet salt-minion \
     && salt-call --local --timeout="$PING_TIMEOUT" --out=quiet test.ping >/dev/null 2>&1; then
    echo "salt-minion ready after ${SECONDS}s"
    exit 0
  fi
  sleep 1
 done
 echo "salt-minion did not become ready within ${TIMEOUT}s" >&2
 exit 1
@@ -1,10 +1,26 @@
 {% from 'vars/globals.map.jinja' import GLOBALS %}
 {% from 'global/map.jinja' import GLOBALMERGED %}
 highstate_schedule:
  schedule.present:
    - function: state.highstate
-    - minutes: 15
+    - hours: {{ GLOBALMERGED.push.highstate_interval_hours }}
    - maxrunning: 1
 {% if not GLOBALS.is_manager %}
-    - splay: 120
+    - splay: 1800
 {% endif %}
 {% if GLOBALS.is_manager and GLOBALMERGED.push.enabled %}
 push_drain_schedule:
  schedule.present:
    - function: cmd.run
    - job_args:
      - /usr/sbin/so-push-drainer
    - seconds: {{ GLOBALMERGED.push.drain_interval }}
    - maxrunning: 1
    - return_job: False
 {% elif GLOBALS.is_manager %}
 push_drain_schedule:
  schedule.absent:
    - name: push_drain_schedule
 {% endif %}
@@ -14,6 +14,7 @@ include:
 so-sensoroni:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-soc:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - network_mode: host
    - binds:
      - /nsm/import:/nsm/import:rw
@@ -18,6 +18,7 @@ include:
 so-soc:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-soc:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - hostname: soc
    - name: so-soc
    - networks:
@@ -47,6 +47,10 @@ strelka_backend:
      - {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
    {%   endfor %}
    {% endif %}
    # Intentionally `on-failure` (not unless-stopped) -- strelka backend shuts
    # down cleanly during rule reloads and we do not want those clean exits to
    # trigger an auto-restart. Do not homogenize; see the container
    # auto-restart section of the plan.
    - restart_policy: on-failure
    - watch:
      - file: strelkasensorcompiledrules
@@ -15,6 +15,7 @@ include:
 strelka_coordinator:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - name: so-strelka-coordinator
    - networks:
      - sobridge:
@@ -15,6 +15,7 @@ include:
 strelka_filestream:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - binds:
      - /opt/so/conf/strelka/filestream/:/etc/strelka/:ro
      - /nsm/strelka:/nsm/strelka
@@ -15,6 +15,7 @@ include:
 strelka_frontend:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - binds:
      - /opt/so/conf/strelka/frontend/:/etc/strelka/:ro
      - /nsm/strelka/log/:/var/log/strelka/:rw
@@ -15,6 +15,7 @@ include:
 strelka_gatekeeper:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-redis:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - name: so-strelka-gatekeeper
    - networks:
      - sobridge:
@@ -15,6 +15,7 @@ include:
 strelka_manager:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-strelka-manager:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - binds:
      - /opt/so/conf/strelka/manager/:/etc/strelka/:ro
      {% if DOCKERMERGED.containers['so-strelka-manager'].custom_bind_mounts %}
@@ -18,6 +18,7 @@ so-suricata:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-suricata:{{ GLOBALS.so_version }}
    - privileged: True
    - restart_policy: unless-stopped
    - environment:
      - INTERFACE={{ GLOBALS.sensor.interface }}
      {% if DOCKERMERGED.containers['so-suricata'].extra_env %}
@@ -7,6 +7,7 @@ so-tcpreplay:
  docker_container.running:
    - network_mode: "host"
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-tcpreplay:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - name: so-tcpreplay
    - user: root
    - interactive: True
@@ -18,6 +18,7 @@ include:
 so-telegraf:
  docker_container.running:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-telegraf:{{ GLOBALS.so_version }}
    - restart_policy: unless-stopped
    - user: 939
    - group_add: 939,920
    - environment:
@@ -18,6 +18,7 @@ so-zeek:
    - image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-zeek:{{ GLOBALS.so_version }}
    - start: True
    - privileged: True
    - restart_policy: unless-stopped
    {% if DOCKERMERGED.containers['so-zeek'].ulimits %}
    - ulimits:
    {%   for ULIMIT in DOCKERMERGED.containers['so-zeek'].ulimits %}
@@ -223,8 +223,6 @@ if [ -n "$test_profile" ]; then
 	WEBPASSWD1=0n10nus3r
 	WEBPASSWD2=0n10nus3r
 	NODE_DESCRIPTION="${HOSTNAME} - ${install_type} - ${MSRVIP_OFFSET}"
 	# opt out of telemetry for automated testing
 	telemetry=1
 	update_sudoers_for_testing
 fi
Author	SHA1	Message	Date
Josh Patterson	33a116357d	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-06-10 08:56:17 -04:00
Josh Patterson	8c17ae0f66	move so-salt-minion-wait	2026-06-01 14:48:54 -04:00
Josh Patterson	f54939b444	Replace inotify pillar watch with postgres audit_settings beacon The active-push feature detected pillar/settings changes via an inotify beacon on the manager watching /opt/so/saltstack/local/pillar. Replace that pillar watch with a custom salt beacon (pillar_db) that polls the SOC so_soc.audit_settings table on a monotonic id watermark, so changes made through SOC drive immediate pushes from the database instead of the files. The suricata/strelka rule inotify watches (and pyinotify) are kept unchanged, since rule-file edits are not recorded in audit_settings. - salt/_beacons/pillar_db.py: new beacon. Polls audit_settings via `docker exec so-postgres psql` (unix-socket trust auth), tracks the last processed id in /opt/so/state/pillar_db_watch.id, seeds to MAX(id) on first run (no history replay), and emits one event per new row. - salt/reactor/push_pillar.sls: consume setting_id/node_id from the beacon event instead of a file path. App = first dotted segment of setting_id, looked up in pillar_push_map.yaml. Empty node_id -> grid-wide actions as is; populated node_id -> the app's state(s) retargeted to that one node. - salt/manager/files/beacons_pushstate.conf.jinja: drop the pillar inotify block, add the pillar_db beacon (interval = push.drain_interval); keep the suricata/strelka inotify watches. - salt/salt/files/reactor_pushstate.conf: map salt/beacon/*/pillar_db/ audit_settings to push_pillar.sls; remove the pillar inotify reactor lines; keep suricata/strelka. The intent -> so-push-drainer -> orch.push_batch pipeline is unchanged. Verified end-to-end on a standalone: a grid-wide telegraf.output change re-applied telegraf fleetwide (container replaced), and a per-host ntp.config.servers change applied ntp to only that node.	2026-05-29 14:55:13 -04:00
Josh Patterson	d48a22e37e	Merge pull request #15944 from Security-Onion-Solutions/jertel/wip Jertel/wip	2026-05-28 14:01:42 -04:00
Josh Patterson	6393d08e86	merge	2026-05-27 08:59:28 -04:00
Josh Patterson	730c828bec	Merge remote-tracking branch 'origin/jertel/wip' into saltthangs	2026-05-19 10:23:45 -04:00
Josh Patterson	b4e5171415	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-14 08:03:45 -04:00
Josh Patterson	84decc1db6	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-13 14:09:15 -04:00
Josh Patterson	7d4d6a0756	prune images if so-docker-prune exists	2026-05-08 10:13:15 -04:00
Josh Patterson	66c0a662fc	convert wait to script	2026-05-08 09:26:42 -04:00
Josh Patterson	778cc055ea	wait for salt-minion service to be ready before finishing state run	2026-05-07 17:01:20 -04:00
Josh Patterson	932deab751	update the push map	2026-05-07 10:51:53 -04:00
Josh Patterson	1281f0ee37	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-06 09:46:12 -04:00
Josh Patterson	f774334b6c	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-05-06 08:16:41 -04:00
Josh Patterson	7fcace34c4	add sensoroni to push map	2026-04-30 16:09:08 -04:00
Josh Patterson	9541024eb7	fix broken things	2026-04-30 15:35:24 -04:00
Josh Patterson	0d166ef732	remove trailing slashes	2026-04-30 09:53:00 -04:00
Josh Patterson	f7d2994f8b	filter temp files	2026-04-30 09:16:22 -04:00
Josh Patterson	8f0757606d	include salt..minion	2026-04-29 16:42:19 -04:00
Josh Patterson	0a8f2e01a0	install pyinotify	2026-04-29 16:41:56 -04:00
Josh Patterson	4546d7bc52	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-29 14:28:19 -04:00
Josh Patterson	17849d8758	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-28 15:49:22 -04:00
Josh Patterson	d3d30a587c	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-28 15:30:31 -04:00
Josh Patterson	034711d148	Merge remote-tracking branch 'origin/3/dev' into saltthangs	2026-04-28 10:47:29 -04:00
Mike Reeves	a0cf0489d6	reduce highstate frequency with active push for rules and pillars - schedule highstate every 2 hours (was 15 minutes); interval lives in global:push:highstate_interval_hours so the SOC admin UI can tune it and so-salt-minion-check derives its threshold as (interval + 1) * 3600 - add inotify beacon on the manager + master reactor + orch.push_batch that writes per-app intent files, with a so-push-drainer schedule on the manager that debounces, dedupes, and dispatches a single orchestration - pillar_push_map.yaml allowlists the apps whose pillar changes trigger an immediate targeted state.apply (targets verified against salt/top.sls); edits under pillar/minions/ trigger a state.highstate on that one minion - host-batch every push orchestration (batch: 25%, batch_wait: 15) so rule changes don't thundering-herd large fleets - new global:push:enabled kill-switch tears down the beacon, reactor config, and drainer schedule on the next highstate for operators who want to keep highstate-only behavior - set restart_policy: unless-stopped on 23 container states so docker recovers crashes without waiting for the next highstate; leave registry (always), strelka/backend (on-failure), kratos, and hydra alone with inline comments explaining why	2026-04-10 15:43:16 -04:00
Jason Ertel	613d31c8a6	merge	2026-03-05 11:52:09 -05:00