Pin NIC names by MAC via udev (run-once) from the common state

Add so-nic-pin, which writes by-MAC persistent-net udev rules pinning each physical NIC to its current name so a kernel upgrade can't renumber the interfaces Security Onion binds by name (host:mainint, sensor:mainint, bond0). Gated by the drop file /opt/so/state/nic_names_pinned: run-once on highstate, and an admin can pre-create the marker to opt out. Wired into common/init.sls as pin_nic_names, guarded by a matching unless.
Merge pull request #15966 from Security-Onion-Solutions/reyesj2-patch-8
2026-06-12 21:29:16 +02:00 · 2026-06-11 18:40:43 -04:00 · 2026-06-11 14:36:03 -05:00 · 2026-06-11 08:22:14 -04:00 · 2026-06-10 17:52:08 -05:00 · 2026-06-10 17:27:23 -05:00
20 changed files with 458 additions and 34 deletions
@@ -130,6 +130,17 @@ common_sbin:
      - so-pcap-import
 {% endif %}
 # Pin physical NIC names by MAC (run-once) so a kernel upgrade can't renumber the
 # interfaces SO binds by name. The marker keeps it a one-time setup; an admin can
 # pre-create the marker to opt out.
 pin_nic_names:
  cmd.run:
    - name: /usr/sbin/so-nic-pin
    - unless: 'test -e /opt/so/state/nic_names_pinned'
    - require:
      - file: common_sbin
      - file: statedir
 common_sbin_jinja:
  file.recurse:
    - name: /usr/sbin
@@ -0,0 +1,76 @@
 #!/bin/bash
 #
 # so-nic-pin — pin physical NIC names by permanent MAC via classic by-MAC udev
 #              rules, so a kernel upgrade can't renumber them.
 #
 # Security Onion binds its management and monitor interfaces BY NAME in pillar
 # (host:mainint, sensor:mainint, and bond0 is built on a specific physical NIC).
 # A kernel upgrade can change the kernel/systemd-udevd predictable-naming output
 # and renumber those NICs (e.g. enp1s0 -> enp2s0), which breaks the grid: the
 # pillar references a name that no longer exists and bond/bridge bring-up fails.
 #
 # This writes /etc/udev/rules.d/70-persistent-net.rules pinning each PHYSICAL NIC
 # to its CURRENT name by its PERMANENT MAC, freezing the names across future kernel
 # changes. It only writes the rules file; it does NOT live-trigger a rename (the
 # rules apply on the next boot/kernel, and a live rename would be disruptive).
 #
 # Run-once: gated by the drop file /opt/so/state/nic_names_pinned. If the marker is
 # present the script does nothing, so an admin can pre-create it to opt out. Invoked
 # from the common state on every highstate; the marker keeps it a one-time setup.
 NET_RULES_FILE="/etc/udev/rules.d/70-persistent-net.rules"
 MARKER="/opt/so/state/nic_names_pinned"
 # Print a message on stdout, prefixed with the script tag. All arguments are
 # joined into one message; echo -e is used, so backslash escapes are interpreted.
 log() {
     local text="$*"
     echo -e "[so-nic-pin] ${text}"
 }
 # List every PHYSICAL NIC on stdout, one "<name> <permanent-mac>" pair per line.
 #
 # An interface counts as physical when sysfs shows a driver bound to its backing
 # device (device/driver exists). That test leaves out bond0/sobridge/docker0/veth*/lo,
 # whose MACs are dynamic and must never be pinned.
 #
 # The MAC reported is the PERMANENT one, looked up in this order and stopping at the
 # first usable (non-empty, non-zero) value:
 #   1. ethtool -P <name>
 #   2. <sysfs>/bonding_slave/perm_hwaddr
 #   3. <sysfs>/address (the current MAC, last resort)
 # The current MAC is not preferred because an enslaved bond member's current MAC is
 # rewritten to the bond's, so matching on it would be wrong/ambiguous.
 # Interfaces for which no usable MAC is found are silently skipped.
 physical_nics() {
     local dev ifname hwaddr origin
     for dev in /sys/class/net/*; do
         ifname="${dev##*/}"
         if [ "$ifname" = "lo" ]; then
             continue
         fi
         # No bound driver: not a real device, skip it.
         if [ ! -e "${dev}/device/driver" ]; then
             continue
         fi
         hwaddr=""
         for origin in ethtool bonding address; do
             case "$origin" in
                 ethtool) hwaddr="$(ethtool -P "$ifname" 2>/dev/null | awk '/Permanent address/{print $NF}')" ;;
                 bonding) hwaddr="$(cat "${dev}/bonding_slave/perm_hwaddr" 2>/dev/null)" ;;
                 address) hwaddr="$(cat "${dev}/address" 2>/dev/null)" ;;
             esac
             # Stop at the first source that yields a usable MAC.
             if [ -n "$hwaddr" ] && [ "$hwaddr" != "00:00:00:00:00:00" ]; then
                 break
             fi
         done
         if [ -z "$hwaddr" ] || [ "$hwaddr" = "00:00:00:00:00:00" ]; then
             continue
         fi
         echo "$ifname $hwaddr"
     done
 }
 # Turn "<name> <mac>" lines on stdin into classic by-MAC persistent-net udev rules
 # on stdout.
 #
 # Input:  one "<name> <mac>" pair per line. Blank lines and lines that carry a
 #         name but no MAC are skipped, so no rule with an empty address is emitted.
 #         A final line without a trailing newline is still processed.
 # Output: two header comment lines, then one rule per accepted pair. The MAC is
 #         lowercased: udev compares ATTR{address} as a case-sensitive string against
 #         the sysfs value, which is lowercase hex.
 render_net_rules() {
     echo "# Generated by so-nic-pin: pin NIC names by MAC so kernel upgrades can't renumber them."
     echo "# Security Onion binds its management/monitor interfaces by name; do not hand-edit."
     local n mac
     # `|| [ -n "$n" ]` keeps the last line when input does not end in a newline.
     while read -r n mac || [ -n "$n" ]; do
         if [ -z "$n" ] || [ -z "$mac" ]; then
             continue
         fi
         printf 'SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="%s", NAME="%s"\n' \
             "${mac,,}" "$n"
     done
 }
 # ---- main flow -------------------------------------------------------------------
 [ "$(id -u)" -eq 0 ] || exit 0                   # salt runs us as root; bail quietly otherwise
 [ -e "${MARKER}" ] && exit 0                      # run-once guard (mirrors the state's unless)
 nics="$(physical_nics)"
 if [ -z "${nics}" ]; then
     log "no physical NICs detected — nothing to pin (will retry on next highstate)"
     exit 0                                         # do NOT drop the marker; let it retry later
 fi
 log "pinning physical NICs by permanent MAC:"
 echo "${nics}" | sed 's/^/    /'
 # Back up a pre-existing rules file, but never replace the backup with a file this
 # script generated itself: a re-run (marker missing) would otherwise overwrite the
 # admin's original rules kept in .bak.
 if [ -f "${NET_RULES_FILE}" ] && ! grep -q '^# Generated by so-nic-pin' "${NET_RULES_FILE}"; then
     cp -f "${NET_RULES_FILE}" "${NET_RULES_FILE}.bak"
 fi
 # Render into a temp file next to the target and rename it into place, so a failed
 # or interrupted render never leaves a truncated rules file behind. The temp name
 # does not end in .rules, so udev ignores it.
 tmp_rules="$(mktemp "${NET_RULES_FILE}.XXXXXX")" || {
     log "ERROR: failed to write ${NET_RULES_FILE}"
     exit 1
 }
 if ! { echo "${nics}" | render_net_rules > "${tmp_rules}" \
         && chmod 0644 "${tmp_rules}" \
         && mv -f "${tmp_rules}" "${NET_RULES_FILE}"; }; then
     rm -f "${tmp_rules}"
     log "ERROR: failed to write ${NET_RULES_FILE}"
     exit 1
 fi
 # Without the marker the script would run again on every highstate, so report a
 # marker failure instead of logging success.
 if ! { mkdir -p "$(dirname "${MARKER}")" && touch "${MARKER}"; }; then
     log "ERROR: wrote ${NET_RULES_FILE} but failed to create ${MARKER}"
     exit 1
 fi
 log "wrote ${NET_RULES_FILE} ($(grep -c '^SUBSYSTEM' "${NET_RULES_FILE}") NIC(s) pinned); dropped ${MARKER}"
@@ -101,6 +101,17 @@ so-elastic-fleet:
      - file: trusttheca
      - x509: etc_elasticfleet_key
      - x509: etc_elasticfleet_crt
 wait_for_so-elastic-fleet:
  http.wait_for_successful_query:
    - name: "https://localhost:8220/api/status"
    - ssl: True
    - verify_ssl: False
    - status: 200
    - wait_for: 300
    - request_interval: 15
    - require:
      - docker_container: so-elastic-fleet
 {%   endif %}
 delete_so-elastic-fleet_so-status.disabled:
@@ -9,16 +9,20 @@
 include:
  - elasticfleet.config
  - kibana.enabled
 # If enabled, automatically update Fleet Logstash Outputs
-{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-import', 'so-eval'] %}
+{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration %}
 {%   if grains.role not in ['so-import', 'so-eval']%}
 so-elastic-fleet-auto-configure-logstash-outputs:
  cmd.run:
    - name: /usr/sbin/so-elastic-fleet-outputs-update
    - retry:
        attempts: 4
        interval: 30
-{% endif %}
+    - require:
      - http: wait_for_so-kibana
 {%   endif %}
 # If enabled, automatically update Fleet Server URLs & ES Connection
 so-elastic-fleet-auto-configure-server-urls:
@@ -27,6 +31,9 @@ so-elastic-fleet-auto-configure-server-urls:
    - retry:
        attempts: 4
        interval: 30
    - require:
      - http: wait_for_so-kibana
 {% endif %}
 # Automatically update Fleet Server Elasticsearch URLs & Agent Artifact URLs
 so-elastic-fleet-auto-configure-elasticsearch-urls:
@@ -35,6 +42,8 @@ so-elastic-fleet-auto-configure-elasticsearch-urls:
    - retry:
        attempts: 4
        interval: 30
    - require:
      - http: wait_for_so-kibana
 so-elastic-fleet-auto-configure-artifact-urls:
  cmd.run:
@@ -42,6 +51,8 @@ so-elastic-fleet-auto-configure-artifact-urls:
    - retry:
        attempts: 4
        interval: 30
    - require:
      - http: wait_for_so-kibana
 so-elastic-fleet-package-statefile:
  file.managed:
@@ -53,7 +64,9 @@ so-elastic-fleet-package-upgrade:
    - name: /usr/sbin/so-elastic-fleet-package-upgrade
    - retry:
        attempts: 3
-        interval: 10
+        interval: 30
    - require:
      - http: wait_for_so-kibana
    - onchanges:
      - file: /opt/so/state/elastic_fleet_packages.txt
@@ -63,6 +76,8 @@ so-elastic-fleet-integrations:
    - retry:
        attempts: 3
        interval: 10
    - require:
      - http: wait_for_so-kibana
 so-elastic-agent-grid-upgrade:
  cmd.run:
@@ -70,6 +85,8 @@ so-elastic-agent-grid-upgrade:
    - retry:
        attempts: 12
        interval: 5
    - require:
      - http: wait_for_so-kibana
 so-elastic-fleet-integration-upgrade:
  cmd.run:
@@ -77,16 +94,22 @@ so-elastic-fleet-integration-upgrade:
    - retry:
        attempts: 3
        interval: 10
    - require:
      - http: wait_for_so-kibana
 {# Optional integrations script doesn't need the retries like so-elastic-fleet-integration-upgrade which loads the default integrations #}
 so-elastic-fleet-addon-integrations:
  cmd.run:
    - name: /usr/sbin/so-elastic-fleet-optional-integrations-load
    - require:
      - http: wait_for_so-kibana
 {% if ELASTICFLEETMERGED.config.defend_filters.enable_auto_configuration %}
 so-elastic-defend-manage-filters-file-watch:
  cmd.run:
    - name: python3 /sbin/so-elastic-defend-manage-filters.py -c /opt/so/conf/elasticsearch/curl.config -d /opt/so/conf/elastic-fleet/defend-exclusions/disabled-filters.yaml -i /nsm/securityonion-resources/event_filters/ -i /opt/so/conf/elastic-fleet/defend-exclusions/rulesets/custom-filters/ &>> /opt/so/log/elasticfleet/elastic-defend-manage-filters.log
    - require:
      - http: wait_for_so-kibana
    - onchanges:
      - file: elasticdefendcustom
      - file: elasticdefenddisabled
@@ -108,9 +108,12 @@ if [ ! -f /opt/so/state/eaintegrations.txt ]; then
  done
  # Only create the state file if all policies were created/updated successfully
-  if [[ "$RETURN_CODE" != "1" ]]; then
+  if [[ $RETURN_CODE -eq 0 ]]; then
    touch /opt/so/state/eaintegrations.txt
  else
    exit 1
  fi
 else
-  exit $RETURN_CODE
+  echo "Fleet integration policies already loaded."
  exit 0
 fi
@@ -8,18 +8,33 @@
 . /usr/sbin/so-elastic-fleet-common
 PKG_LOAD_FAILURES=0
 PKG_LOAD_FAILURES_NAMES=()
 {%- for PACKAGE in SUPPORTED_PACKAGES %}
 echo "Upgrading {{ PACKAGE }} package..."
 if VERSION=$(elastic_fleet_package_latest_version_check "{{ PACKAGE }}"); then
    if ! elastic_fleet_package_install "{{ PACKAGE }}" "$VERSION"; then
-        # exit 1 on failure to upgrade a default package, allow salt to handle retries
+        PKG_LOAD_FAILURES=$((PKG_LOAD_FAILURES + 1))
-        echo -e "\nERROR: Failed to upgrade $PACKAGE to version: $VERSION"
+        PKG_LOAD_FAILURES_NAMES+=("{{ PACKAGE }}")
        exit 1
    fi
 else
-    echo -e "\nERROR: Failed to get version information for integration $PACKAGE"
+    PKG_LOAD_FAILURES=$((PKG_LOAD_FAILURES + 1))
    PKG_LOAD_FAILURES_NAMES+=("{{ PACKAGE }}")
 fi
 echo
 {%- endfor %}
 if [ $PKG_LOAD_FAILURES -gt 0 ]; then
    echo "ERROR: Failed to upgrade $PKG_LOAD_FAILURES package(s):"
    for PKG in "${PKG_LOAD_FAILURES_NAMES[@]}"; do
        echo " - $PKG"
    done
    # exit 1 on failure to upgrade a default package, allow salt to handle retries
    exit 1
 else
    echo "Successfully upgraded all packages."
 fi
 echo
 /usr/sbin/so-elasticsearch-templates-load
@@ -9,9 +9,12 @@
 {%   from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
 {%   from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS, SO_MANAGED_INDICES %}
 {%   if GLOBALS.role != 'so-heavynode' %}
-{%     from 'elasticsearch/template.map.jinja' import ALL_ADDON_SETTINGS %}
+{%     from 'elasticsearch/template.map.jinja' import ALL_ADDON_SETTINGS, ADDON_INDICES %}
 {%   endif %}
 include:
  - elasticsearch.enabled
 escomponenttemplates:
  file.recurse:
    - name: /opt/so/conf/elasticsearch/templates/component
@@ -35,6 +38,20 @@ so_index_template_dir:
      {%- endfor %}
    {%- endif %}
 {%  if GLOBALS.role != "so-heavynode" %}
 # Clean up legacy and non-SO managed templates from the elasticsearch/templates/addon-index/ directory
 addon_index_template_dir:
  file.directory:
    - name: /opt/so/conf/elasticsearch/templates/addon-index
    - clean: True
    {%- if ADDON_INDICES %}
    - require:
      {%- for index in ADDON_INDICES %}
      - file: addon_index_template_{{index}}
      {%- endfor %}
    {%- endif %}
 {%  endif %}
 # Auto-generate index templates for SO managed indices (directly defined in elasticsearch/defaults.yaml)
 #   These index templates are for the core SO datasets and are always required
 {%  for index, settings in ES_INDEX_SETTINGS.items() %}
@@ -61,15 +61,25 @@
 {% if ALL_ADDON_SETTINGS_ORIG.keys() | length > 0 %}
 {%   for index in ALL_ADDON_SETTINGS_ORIG.keys() %}
 {%     do ALL_ADDON_SETTINGS_GLOBAL_OVERRIDES.update({index: salt['defaults.merge'](ALL_ADDON_SETTINGS_ORIG[index], PILLAR_GLOBAL_OVERRIDES, in_place=False)}) %}
 {#     Explicitly excluding addon indices from ES_INDEX_SETTINGS_ORIG
         When manager.soc_managed_annotations runs, new entries are added to the salt/elasticsearch/defaults.yaml file to support 'revert to default' functionality.
         Subsequent map renders will then incorrectly include 'integration X' in 'ES_INDEX_SETTINGS_ORIG' due to being in the defaults.yaml file. #}
 {%     if index in ES_INDEX_SETTINGS_ORIG.keys() %}
 {%       do ES_INDEX_SETTINGS_ORIG.pop(index) %}
 {%     endif %}
 {%   endfor %}
 {% endif %}
 {% set ES_INDEX_SETTINGS = {} %}
-{% macro create_final_index_template(DEFINED_SETTINGS, GLOBAL_OVERRIDES, FINAL_INDEX_SETTINGS) %}
+{% macro create_final_index_template(DEFINED_SETTINGS, GLOBAL_OVERRIDES, FINAL_INDEX_SETTINGS, EXCLUDE_INDICES=[]) %}
 {% do GLOBAL_OVERRIDES.update(salt['defaults.merge'](GLOBAL_OVERRIDES, ES_INDEX_PILLAR, in_place=False)) %}
 {% for index, settings in GLOBAL_OVERRIDES.items() %}
 {%   if index in EXCLUDE_INDICES %}
 {%     continue %}
 {%   endif %}
 {#   prevent this action from being performed on custom defined indices. #}
 {#   the custom defined index is not present in either of the dictionaries and fails to render. #}
 {%   if index in DEFINED_SETTINGS and index in GLOBAL_OVERRIDES %}
@@ -150,10 +160,19 @@
 {% endfor %}
 {% endmacro %}
-{{ create_final_index_template(ES_INDEX_SETTINGS_ORIG, ES_INDEX_SETTINGS_GLOBAL_OVERRIDES, ES_INDEX_SETTINGS) }}
+{# Exclude addon integrations from final ES_INDEX_SETTINGS #}
-{{ create_final_index_template(ALL_ADDON_SETTINGS_ORIG, ALL_ADDON_SETTINGS_GLOBAL_OVERRIDES, ALL_ADDON_SETTINGS) }}
+{{ create_final_index_template(ES_INDEX_SETTINGS_ORIG, ES_INDEX_SETTINGS_GLOBAL_OVERRIDES, ES_INDEX_SETTINGS, ALL_ADDON_SETTINGS_ORIG.keys() | list ) }}
 {# Exclude SO managed indices, otherwise ALL_ADDON_SETTINGS will include pillar values
  of core integrations without merging defaults, resulting in an overlapping, but bad index template being generated. #}
 {{ create_final_index_template(ALL_ADDON_SETTINGS_ORIG, ALL_ADDON_SETTINGS_GLOBAL_OVERRIDES, ALL_ADDON_SETTINGS, ES_INDEX_SETTINGS_ORIG.keys() | list ) }}
 {% set SO_MANAGED_INDICES = [] %}
 {% for index, settings in ES_INDEX_SETTINGS.items() %}
 {%   do SO_MANAGED_INDICES.append(index) %}
 {% endfor %}
 {% set ADDON_INDICES = [] %}
 {% for index, settings in ALL_ADDON_SETTINGS.items() %}
 {%   do ADDON_INDICES.append(index) %}
 {% endfor %}
@@ -6,6 +6,7 @@
 {% from 'allowed_states.map.jinja' import allowed_states %}
 {% if sls.split('.')[0] in allowed_states %}
 {%   from 'docker/docker.map.jinja' import DOCKERMERGED %}
 {%   from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
 {%   from 'vars/globals.map.jinja' import GLOBALS %}
 include:
@@ -60,6 +61,19 @@ so-kibana:
    - watch:
      - file: kibanaconfig
 wait_for_so-kibana:
  http.wait_for_successful_query:
    - name: "http://localhost:5601/api/status"
    - username: 'so_elastic'
    - password: '{{ ELASTICSEARCHMERGED.auth.users.so_elastic_user.pass }}'
    - ssl: True
    - verify_ssl: False
    - status: 200
    - wait_for: 300
    - request_interval: 15
    - require:
      - docker_container: so-kibana
 delete_so-kibana_so-status.disabled:
  file.uncomment:
    - name: /opt/so/conf/so-status/so-status.conf
@@ -31,11 +31,13 @@ sync_es_users:
      - http: wait_for_kratos
      - file: so-user.lock # require so-user.lock file to be missing
-# we dont want this added too early in setup, so we add the onlyif to verify 'startup_states: highstate'
+# we dont want this added too early in setup, so the onlyif gates on the
-# is in the minion config. That line is added before the final highstate during setup
+# /opt/so/state/setup-complete marker. The marker is written by
 # mark_setup_complete in setup/so-functions just before the final setup
 # highstate (and by an upgrade-path state for systems set up under the old gate).
 so-user_sync:
  cron.present:
    - user: root
    - name: 'PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/root/bin /usr/sbin/so-user sync &>> /opt/so/log/soc/sync.log'
    - identifier: so-user_sync
-    - onlyif: "grep -x 'startup_states: highstate' /etc/salt/minion"
+    - onlyif: "test -e /opt/so/state/setup-complete"
@@ -0,0 +1,117 @@
 #!/bin/bash
 #
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
 # or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.
 # Runs once per boot on managers (via so-boot-mine-update.service), before
 # so-boot-highstate.service. Waits for the responsive minion set to settle, pushes
 # mine.update, waits until every up minion has actually reported to the mine, then
 # warms the master's per-minion pillar cache so the mine-backed node pillars (node
 # IPs, ES/Redis/Logstash/hypervisor discovery -- some glob- and some pillar/grain-
 # targeted) are complete before the boot highstate renders them. Otherwise a node
 # that is up but not yet fully reported gets dropped from those pillars and torn
 # out of the configs they build (e.g. so-elasticsearch ExtraHosts -> container recreate).
 # Tunables. MAX_WAIT is a hard backstop only and can be overridden through the
 # MINE_UPDATE_MAX_WAIT environment variable. elapsed counts the seconds spent
 # sleeping between polls and is shared with the later wait loops in this script.
 MAX_WAIT=${MINE_UPDATE_MAX_WAIT:-180}
 INTERVAL=10
 STABLE_CHECKS=3                          # up-count must hold steady this many polls
 elapsed=0
 up=0
 last_count=-1
 steady_polls=0
 # Phase 1: wait for the *reachable* minion set to settle. We deliberately do not
 # wait for every accepted key to be up: an operator may accept a key and then
 # power that host off on purpose, in which case up >= accepted would never hold
 # and the full MAX_WAIT would always be burned. As soon as the responsive count
 # has been non-zero and unchanged for STABLE_CHECKS consecutive polls, stop
 # waiting and run mine.update against whoever is up.
 while [ "$elapsed" -lt "$MAX_WAIT" ]; do
   up=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null \
     | python3 -c 'import sys,json; print(len(json.load(sys.stdin)))' 2>/dev/null)
   : "${up:=0}"                           # no/invalid output counts as zero minions
   if [ "$up" -le 0 ] || [ "$up" -ne "$last_count" ]; then
     steady_polls=0
   else
     steady_polls=$((steady_polls + 1))
     if [ "$steady_polls" -ge "$STABLE_CHECKS" ]; then
       break
     fi
   fi
   last_count=$up
   sleep "$INTERVAL"
   elapsed=$((elapsed + INTERVAL))
 done
 echo "so-boot-mine-update: ${up} minions up (settled after ${elapsed}s); running mine.update"
 /usr/bin/salt '*' mine.update --out=txt
 # A node that is up but has not yet re-reported network.ip_addrs to the mine is
 # silently dropped from mine-backed pillars (elasticsearch:nodes, node_data, ...)
 # when highstate recompiles them -- which e.g. removes it from so-elasticsearch
 # ExtraHosts and forces a container recreate. After the broad mine.update above,
 # wait until every up minion actually has network.ip_addrs in the mine, re-pushing
 # mine.update to stragglers, before releasing the boot highstate. Bounded by the
 # same MAX_WAIT backstop so a slow/down node never blocks boot indefinitely.
 # Phase 2: loop until every minion reported by manage.up has a non-empty
 # network.ip_addrs entry in the mine, re-pushing mine.update to the stragglers.
 # Shares the elapsed/MAX_WAIT budget with the other phases.
 missing=""
 while [ "$elapsed" -lt "$MAX_WAIT" ]; do
   up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null)
   mine_json=$(/usr/bin/salt-run mine.get '*' network.ip_addrs tgt_type=glob --out=json 2>/dev/null)
   # The python helper prints the up minions that are absent from the mine. If either
   # runner call produced no parseable JSON the helper exits non-zero; treat that as
   # "state unknown" and retry rather than reporting the mine as complete.
   if ! missing=$(printf '%s' "$up_json" | python3 -c 'import sys, json; up = set(json.load(sys.stdin) or []); mine = {k for k, v in (json.loads(sys.argv[1]) or {}).items() if v}; print("\n".join(sorted(up - mine)))' "$mine_json" 2>/dev/null); then
     missing="<unknown>"
     echo "so-boot-mine-update: could not determine mine state (no usable salt-run output); retrying"
     sleep "$INTERVAL"
     elapsed=$((elapsed + INTERVAL))
     continue
   fi
   if [ -z "$missing" ]; then
     echo "so-boot-mine-update: mine complete for all up minions after ${elapsed}s"
     break
   fi
   # $(echo $missing) is intentionally unquoted: it joins the newline-separated ids.
   echo "so-boot-mine-update: mine missing up minion(s): $(echo $missing); re-running mine.update"
   for m in $missing; do /usr/bin/salt "$m" mine.update --out=txt; done
   sleep "$INTERVAL"
   elapsed=$((elapsed + INTERVAL))
 done
 [ -n "$missing" ] && echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; up minion(s) still absent from mine: $(echo $missing); highstate may drop them from configs"
 # The pillar/compound-targeted node pillars (elasticsearch:nodes, redis:nodes,
 # logstash:nodes, hypervisor:nodes) resolve their target against the master's
 # per-minion data cache (grains+pillar in .../minions/<id>/data.p), populated only
 # when a minion's pillar is (re)compiled -- separately from the mine. A freshly
 # booted node can be in the mine (glob/node_data sees it) yet absent from that
 # cache, so it is dropped from those pillars and from the configs they build (e.g.
 # so-elasticsearch ExtraHosts). Force a synchronous pillar refresh so the master
 # caches every up node's pillar; refresh_pillar wait=True returns only once the
 # pillar is recompiled (and thus cached for matching). Retry stragglers <= MAX_WAIT.
 # Phase 3: force a pillar refresh on every minion, then loop until the master's
 # pillar cache has a non-empty entry for every minion reported by manage.up,
 # refreshing stragglers individually. Shares the elapsed/MAX_WAIT budget.
 echo "so-boot-mine-update: warming master pillar cache for pillar/grain-targeted node pillars"
 /usr/bin/salt '*' saltutil.refresh_pillar wait=True --out=txt
 missing=""
 while [ "$elapsed" -lt "$MAX_WAIT" ]; do
   up_json=$(/usr/bin/salt-run manage.up --out=json 2>/dev/null)
   cached_json=$(/usr/bin/salt-run cache.pillar tgt='*' --out=json 2>/dev/null)
   # The python helper prints the up minions with no cached pillar. If either runner
   # call produced no parseable JSON the helper exits non-zero; treat that as "state
   # unknown" and retry rather than reporting the cache as warm.
   if ! missing=$(printf '%s' "$up_json" | python3 -c 'import sys, json; up = set(json.load(sys.stdin) or []); cached = {k for k, v in (json.loads(sys.argv[1]) or {}).items() if v}; print("\n".join(sorted(up - cached)))' "$cached_json" 2>/dev/null); then
     missing="<unknown>"
     echo "so-boot-mine-update: could not determine pillar cache state (no usable salt-run output); retrying"
     sleep "$INTERVAL"
     elapsed=$((elapsed + INTERVAL))
     continue
   fi
   if [ -z "$missing" ]; then
     echo "so-boot-mine-update: pillar cache warm for all up minions after ${elapsed}s"
     break
   fi
   # $(echo $missing) is intentionally unquoted: it joins the newline-separated ids.
   echo "so-boot-mine-update: pillar not yet cached for: $(echo $missing); refreshing"
   for m in $missing; do /usr/bin/salt "$m" saltutil.refresh_pillar wait=True --out=txt; done
   sleep "$INTERVAL"
   elapsed=$((elapsed + INTERVAL))
 done
 [ -n "$missing" ] && echo "so-boot-mine-update: WARNING ${MAX_WAIT}s backstop hit; pillar not cached for: $(echo $missing); pillar-targeted pillars may drop them"
 # Log what the mine-backed pillars render so the boot-time state is inspectable.
 # Refresh the local minion's pillar, give it a moment, then print how the
 # node_data and elasticsearch:nodes pillars render so the boot-time state can be
 # inspected in the unit's log. Always exits 0.
 /usr/bin/salt-call saltutil.refresh_pillar >/dev/null 2>&1
 sleep 2
 for pillar_key in node_data elasticsearch:nodes; do
   pillar_json=$(/usr/bin/salt-call --out=json pillar.get "$pillar_key" 2>/dev/null \
     | python3 -c 'import sys,json; print(json.dumps(json.load(sys.stdin).get("local"), indent=2, sort_keys=True))' 2>/dev/null)
   # Nothing rendered (salt-call or the JSON parse failed): print a literal null.
   if [ -z "$pillar_json" ]; then
     pillar_json=null
   fi
   echo "so-boot-mine-update: ${pillar_key} rendered as:"
   echo "$pillar_json"
 done
 exit 0
@@ -14,6 +14,7 @@
 include:
  - salt.minion
  - salt.master.boot_mine_update
 {%   if 'vrt' in salt['pillar.get']('features', []) %}
  - salt.cloud
  - salt.cloud.reactor_config_hypervisor
@@ -0,0 +1,29 @@
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
 # or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.
 # Manages /etc/systemd/system/so-boot-mine-update.service, a manager-only
 # Type=oneshot unit that pushes `salt '*' mine.update` once per boot, ordered
 # before so-boot-highstate.service so mine-backed pillars (node IPs, ES/Redis/
 # Logstash discovery) are fresh before the boot highstate renders them.
 include:
  - systemd.reload
 so_boot_mine_update_unit_file:
  file.managed:
    - name: /etc/systemd/system/so-boot-mine-update.service
    - source: salt://salt/service/so-boot-mine-update.service
    - onchanges_in:
      - module: systemd_reload
 # Only enable once setup is complete. Until then the gate file is missing and
 # the unit's own ConditionPathExists would no-op it anyway.
 so_boot_mine_update_service:
  service.enabled:
    - name: so-boot-mine-update.service
    - onlyif: test -e /opt/so/state/setup-complete
    - require:
      - file: so_boot_mine_update_unit_file
      - module: systemd_reload
@@ -0,0 +1,31 @@
 # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
 # or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
 # https://securityonion.net/license; you may not use this file except in compliance with the
 # Elastic License 2.0.
 # Manages /etc/systemd/system/so-boot-highstate.service, a Type=oneshot
 # RemainAfterExit=yes unit that runs `salt-call state.highstate` exactly once
 # per system boot. Replaces the legacy `startup_states: highstate` minion
 # config, which fired on every salt-minion service restart (causing a redundant
 # highstate whenever a highstate itself restarted salt-minion).
 include:
  - systemd.reload
 so_boot_highstate_unit_file:
  file.managed:
    - name: /etc/systemd/system/so-boot-highstate.service
    - source: salt://salt/service/so-boot-highstate.service
    - onchanges_in:
      - module: systemd_reload
 # Only enable once setup is complete. Until then the gate file is missing and
 # the unit's own ConditionPathExists would no-op it anyway -- this just keeps
 # the unit's enabled state in step with the /opt/so/state/setup-complete marker,
 # the same marker the so-user_sync cron in sync_es_users.sls gates on.
 so_boot_highstate_service:
  service.enabled:
    - name: so-boot-highstate.service
    - onlyif: test -e /opt/so/state/setup-complete
    - require:
      - file: so_boot_highstate_unit_file
      - module: systemd_reload
@@ -17,6 +17,7 @@ include:
  - repo.client
  - salt.mine_functions
  - salt.minion.service_file
  - salt.minion.boot_highstate
 {% if GLOBALS.is_manager %}
  - ca.signing_policy
 {% endif %}
@@ -80,11 +81,33 @@ set_log_levels:
      - "log_level: info"
      - "log_level_logfile: info"
-enable_startup_states:
+# startup_states: highstate caused a full highstate to run on every
-  file.uncomment:
+# salt-minion service start, including the restart triggered when a highstate
 # itself modified the minion config (beacons, mine, unit file). Replaced by
 # so-boot-highstate.service (managed in salt.minion.boot_highstate), which
 # runs once per system boot only. Strip the line from /etc/salt/minion on
 # upgrade; both the commented and uncommented forms historically existed.
 remove_startup_states:
  file.line:
    - name: /etc/salt/minion
-    - regex: '^startup_states: highstate$'
+    - match: 'startup_states: highstate'
-    - unless: pgrep so-setup
+    - mode: delete
 # Upgrade-path bridge: systems that already passed setup under the old gate
 # (`grep -x 'startup_states: highstate' /etc/salt/minion`) get a /opt/so/state/setup-complete
 # marker so so-boot-highstate.service can be enabled and the so-user_sync cron
 # in sync_es_users.sls keeps installing. Setup-in-progress systems instead get
 # the marker from `mark_setup_complete` in setup/so-functions at the right
 # moment. `replace: false` means we never overwrite a marker once written.
 mark_setup_complete_for_upgrades:
  file.managed:
    - name: /opt/so/state/setup-complete
    - replace: false
    - makedirs: True
    - onlyif: "grep -qx 'startup_states: highstate' /etc/salt/minion"
    - require_in:
      - file: remove_startup_states
      - service: so_boot_highstate_service
 {% endif %}
@@ -0,0 +1,14 @@
 [Unit]
 Description=Security Onion boot-time highstate (runs once per boot)
 # Start after salt-minion, networking and docker are up.
 After=salt-minion.service network-online.target docker.service
 # Wants= rather than Requires= for salt-minion: Requires= propagates an explicit
 # stop/restart of salt-minion to this unit, so a highstate that restarts
 # salt-minion would stop or re-run this oneshot. Wants= still pulls salt-minion
 # in at boot and After= still orders this unit behind it.
 Wants=network-online.target docker.service salt-minion.service
 # No-op until setup has written the completion marker.
 ConditionPathExists=/opt/so/state/setup-complete
 [Service]
 Type=oneshot
 # Stay "active" after the run so the unit is not started again during this boot.
 RemainAfterExit=yes
 ExecStart=/usr/bin/salt-call state.highstate -l info queue=True
 [Install]
 WantedBy=multi-user.target
@@ -0,0 +1,15 @@
 [Unit]
 Description=Security Onion boot-time grid mine.update (managers, runs once per boot before highstate)
 # Start after the master, the local minion and networking are up.
 After=salt-master.service salt-minion.service network-online.target
 # Wants= rather than Requires= for the salt services: Requires= propagates an
 # explicit stop/restart of salt-master or salt-minion to this unit, which would
 # re-run the mine update whenever either service is restarted. Wants= still
 # pulls them in at boot and After= still orders this unit behind them.
 Wants=network-online.target salt-master.service salt-minion.service
 # Must finish before the boot highstate renders the mine-backed pillars.
 Before=so-boot-highstate.service
 # No-op until setup has written the completion marker.
 ConditionPathExists=/opt/so/state/setup-complete
 [Service]
 Type=oneshot
 RemainAfterExit=yes
 ExecStart=/usr/sbin/so-boot-mine-update
 [Install]
 WantedBy=multi-user.target
@@ -8,11 +8,6 @@ set_role_grain:
    - name: role
    - value: so-{{ grains.id.split("_") | last }}
 set_highstate:
  file.append:
    - name: /etc/salt/minion
    - text: 'startup_states: highstate'
 enable_salt_minion:
  service.enabled:
    - name: salt-minion
@@ -539,16 +539,19 @@ configure_minion() {
 		"  x509_v2: true"\
 		"log_level: info"\
 		"log_level_logfile: info"\
-		"log_file: /opt/so/log/salt/minion"\
+		"log_file: /opt/so/log/salt/minion" >> "$minion_config"
 		"#startup_states: highstate" >> "$minion_config"
 }
-checkin_at_boot() {
+mark_setup_complete() {
-	local minion_config=/etc/salt/minion
+	# Writes the setup-complete marker. Salt's so-boot-highstate.service
 	# (boot-time oneshot) and the so-user_sync cron gate in
 	# salt/manager/sync_es_users.sls both key off this file.
 	local marker=/opt/so/state/setup-complete
-	info "Enabling checkin at boot"
+	info "Marking setup as complete"
-	sed -i 's/#startup_states: highstate/startup_states: highstate/' "$minion_config"
+	mkdir -p "$(dirname "$marker")"
 	touch "$marker"
 }
 check_requirements() {
@@ -977,6 +980,8 @@ docker_seed_registry() {
 		docker_seed_update_percent=25
 		update_docker_containers 'netinstall' '' 'docker_seed_update' '/dev/stdout' 2>&1 | tee -a "$setup_log"
        # Use pipe exit status of 'update_docker_containers' for return code
 		return ${PIPESTATUS[0]}
 	fi
 }
@@ -769,7 +769,10 @@ if ! [[ -f $install_opt_file ]]; then
 		title "Applying the registry state"
 		logCmd "salt-call state.apply -l info registry"
 		title "Seeding the docker registry"
-		docker_seed_registry
+		if ! docker_seed_registry; then
 			error "Failed to seed the docker registry"
 			fail_setup
 		fi
 		title "Applying the manager state"
 		logCmd "salt-call state.apply -l info manager"
 		logCmd "salt-call state.apply influxdb -l info"
@@ -794,7 +797,7 @@ if ! [[ -f $install_opt_file ]]; then
 			error "Failed to run so-elastic-fleet-setup"
 			fail_setup
 		fi
-		checkin_at_boot
+		mark_setup_complete
 		set_initial_firewall_access
        initialize_elasticsearch_indices "so-case so-casehistory so-assistant-session so-assistant-chat"
 		# run a final highstate before enabling scheduled highstates.
Author	SHA1	Message	Date
Mike Reeves	80c39d612c	Pin NIC names by MAC via udev (run-once) from the common state Add so-nic-pin, which writes by-MAC persistent-net udev rules pinning each physical NIC to its current name so a kernel upgrade can't renumber the interfaces Security Onion binds by name (host:mainint, sensor:mainint, bond0). Gated by the drop file /opt/so/state/nic_names_pinned: run-once on highstate, and an admin can pre-create the marker to opt out. Wired into common/init.sls as pin_nic_names, guarded by a matching unless.	2026-06-11 18:40:43 -04:00
Jorge Reyes	f03f0155f4	Merge pull request #15966 from Security-Onion-Solutions/reyesj2-patch-8 update so-elastic-fleet-package-upgrade script	2026-06-11 14:36:03 -05:00
Jason Ertel	0cc94980af	Merge pull request #15967 from Security-Onion-Solutions/jertel/wip Jertel/wip	2026-06-11 08:22:14 -04:00
reyesj2	4741cc92bd	fleet manager start kibana if it isn't already running and wait for healthy status	2026-06-10 17:52:08 -05:00
reyesj2	46655860e9	http	2026-06-10 17:27:23 -05:00
reyesj2	289ddda5e8	kibana health check for fleet scripts	2026-06-10 17:06:22 -05:00
reyesj2	f905afbc6f	logging	2026-06-10 15:01:22 -05:00
reyesj2	bd5e77afc5	increase delay in so-elastic-fleet-package-upgrade attempts	2026-06-10 14:59:29 -05:00
reyesj2	944e773759	save exit until all packages have been attempted	2026-06-10 14:58:49 -05:00
Josh Patterson	3ba96da3b7	Merge pull request #15965 from Security-Onion-Solutions/nostartupstates remove startup states from salt config	2026-06-09 16:26:47 -04:00
Jorge Reyes	f0712bd780	Merge pull request #15964 from Security-Onion-Solutions/reyesj2-patch-8 use pipe exit status for update_docker_containers	2026-06-09 13:49:24 -05:00
Josh Patterson	448668a72e	Merge remote-tracking branch 'origin/3/dev' into nostartupstates	2026-06-09 14:02:00 -04:00
Josh Patterson	f088a27159	so-boot-mine-update: warm master pillar cache before highstate A complete mine is not enough: elasticsearch:nodes, redis:nodes, logstash:nodes (tgt_type=pillar) and hypervisor:nodes (tgt_type=compound) resolve their target against the master's per-minion data cache (grains+pillar in data.p), which is populated only when a minion's pillar is recompiled -- separately from the mine. After a reboot a node can be in the mine (so node_data/glob sees it) yet absent from that cache, so it fails the elasticsearch:enabled:true pillar match and is dropped from elasticsearch:nodes -> so-elasticsearch ExtraHosts -> container recreate. After the mine-completeness wait, run salt '*' saltutil.refresh_pillar wait=True to synchronously cache every up node's pillar (the same lever deploy_newnode.sls uses), then verify with salt-run cache.pillar and retry stragglers, bounded by MINE_UPDATE_MAX_WAIT. Also log elasticsearch:nodes alongside node_data for inspection.	2026-06-09 13:52:19 -04:00
reyesj2	9f5a9616a5	use pipe exit status for update_docker_containers	2026-06-09 12:51:58 -05:00
Josh Patterson	27c7702325	so-boot-mine-update: wait for a complete mine before highstate Mine-backed pillars (node_data, elasticsearch:nodes, redis:nodes, logstash:nodes, hypervisor:nodes) include a node only if it returned an IP from the mine, and the configs they build are rebuilt fresh every highstate. After a manager reboot with a flushed mine, the first boot highstate could run before an up node re-reported network.ip_addrs, dropping it from e.g. so-elasticsearch ExtraHosts and forcing a container recreate. After the initial broad mine.update, poll until every currently-up minion actually has network.ip_addrs in the mine, re-pushing mine.update to stragglers, before releasing the boot highstate. Shares the existing MINE_UPDATE_MAX_WAIT backstop so a slow/down node never blocks boot, and still logs the rendered node_data for inspection.	2026-06-09 10:10:32 -04:00
Josh Patterson	8c306eb37d	so-boot-mine-update: log the rendered node_data content Dump the actual rendered node_data pillar (pretty-printed JSON) to the journal instead of just a rendered/empty verdict, so the boot-time render attempt is fully inspectable. Empty renders print false/null and still emit the WARNING.	2026-06-09 09:49:19 -04:00
Josh Patterson	e536ffa363	so-boot-mine-update: render node_data after mine.update before highstate After the boot-time mine.update, have the manager actually render the node_data pillar and log whether it came back populated. node_data: False makes salt/top.sls apply the bootstrap recovery branch instead of the manager's real config, so surfacing this in the journal makes the condition visible before so-boot-highstate runs. Best-effort and non-blocking: always exits 0 so highstate proceeds regardless.	2026-06-09 09:35:24 -04:00
Jorge Reyes	d7aa7ab228	Merge pull request #15961 from Security-Onion-Solutions/reyesj2/fleet-autoconfigure respect elasticfleet enable_auto_configuration setting for so-elastic…	2026-06-08 15:09:58 -05:00
Jorge Reyes	fe0b68d24c	Merge pull request #15958 from Security-Onion-Solutions/reyesj2-patch-template fix elasticsearch template generation issue	2026-06-08 15:07:49 -05:00
reyesj2	6ad345730b	respect elasticfleet enable_auto_configuration setting for so-elastic-fleet-urls-update	2026-06-08 15:02:57 -05:00
Josh Patterson	9580976ba2	Add manager boot-time grid mine.update oneshot before highstate so-boot-mine-update.service is a manager-only Type=oneshot unit that runs once per boot after salt-master/salt-minion start and before so-boot-highstate.service. It pushes mine.update to all reachable minions so mine-backed pillars (node IPs, ES/Redis/Logstash discovery) are fresh before the boot highstate renders them. The helper waits for the responsive minion set to settle (plateau) rather than for every accepted key to report up, so an intentionally powered-off minion doesn't block the update; MAX_WAIT remains as a backstop.	2026-06-08 11:05:13 -04:00
reyesj2	ac907ba45f	fix elasticsearch template generation issue	2026-06-05 16:42:08 -05:00
Josh Patterson	f957954abf	Merge pull request #15956 from Security-Onion-Solutions/nostartupstates highstate on host start, not salt-minion start	2026-06-04 16:51:10 -04:00
Josh Patterson	cb3631da81	Move setup-complete marker from /opt/so/conf to /opt/so/state The setup-complete marker is a runtime-state file, not config, so move it to /opt/so/state/setup-complete. Updates both writers (mark_setup_complete in setup/so-functions and the upgrade-path state in minion/init.sls) and the three readers (so-boot-highstate.service ConditionPathExists, boot_highstate.sls enable gate, and the so-user_sync cron gate).	2026-06-04 15:07:27 -04:00
Josh Patterson	f5d63f585e	Merge remote-tracking branch 'origin/3/dev' into nostartupstates	2026-06-04 09:19:01 -04:00
Josh Patterson	13f8be40b5	so-boot-highstate: wait for docker before running highstate Add docker.service to After= and Wants= so the boot-time highstate starts after docker is up. Uses Wants (soft) so highstate still runs if docker fails to start.	2026-06-04 08:46:35 -04:00
Jason Ertel	9ee90a5bc0	Merge pull request #15955 from Security-Onion-Solutions/jertel/wip config updates	2026-06-03 17:26:51 -04:00
Josh Patterson	2d653b6f1b	does not need to be jinja template	2026-06-03 15:46:58 -04:00
Josh Patterson	34fee25b0c	Merge remote-tracking branch 'origin/3/dev' into nostartupstates	2026-06-03 15:44:41 -04:00
Josh Patterson	fabecb8288	remove highstate from startup_states. highstate on system start	2026-05-14 13:57:40 -04:00