Harden reinstall_init and add post-saltify readiness gate

- setup/so-functions: dump pre-reinstall salt state (systemctl / journalctl / ls /etc/salt / .rpmnew diff) to the setup log so a failed reinstall leaves a usable post-mortem; swap the manual rm -rf of /etc/salt/* for `dnf -y remove salt` so package configs get cleaned up properly.
- setup/so-setup: replace the `sleep 2 / state.show_top / sleep 2` dance after saltify with a readiness gate that waits for /etc/salt/pki/master/master.pub, runs check_salt_master_status, and then calls wait_for_minion_key_pending before salt-key -ya. Fixes reinstalls on 3.x timing out on "Unable to sign_in to master".
- salt/common/tools/sbin/so-common: add wait_for_minion_key_pending helper, which polls `salt-key -l pre` until the minion appears.
uninstall salt during reinstall_init
2026-05-09 04:42:40 +02:00 · 2026-04-23 17:43:39 -04:00 · 2026-04-22 16:40:47 -04:00
10 changed files with 205 additions and 210 deletions
@@ -33,8 +33,6 @@
    'kratos',
    'hydra',
    'elasticfleet',
-    'elasticfleet.manager',
-    'elasticsearch.cluster',
    'elastic-fleet-package-registry',
    'utility'
 ] %}
@@ -162,6 +162,29 @@ check_salt_master_status() {
    return 0
 }

+# Wait until $minion shows up in the salt master's unaccepted-keys list.
+# Used after saltify on a reinstall to replace the old `sleep 2 / state.show_top /
+# sleep 2` dance — the new minion's key takes longer to appear than 2s on
+# salt 3006.x and the subsequent salt-key -ya needs something to accept.
+# Returns 0 as soon as the key is pending, 1 after attempts*delay seconds.
+wait_for_minion_key_pending() {
+	local minion="$1"
+	local attempts="${2:-30}"
+	local delay="${3:-2}"
+	local count=0
+	while ! salt-key -l pre --out=json 2>/dev/null \
+		| python3 -c "import json,sys; d=json.load(sys.stdin); sys.exit(0 if '$minion' in d.get('minions_pre', []) else 1)" 2>/dev/null; do
+		((count+=1))
+		if [[ $count -ge $attempts ]]; then
+			echo "Gave up waiting for $minion to appear in salt-master's pending keys"
+			return 1
+		fi
+		sleep "$delay"
+	done
+	echo "Minion $minion is pending acceptance after $((count * delay))s"
+	return 0
+}
+
 # this is only intended to be used to check the status of the minion from a salt master
 check_salt_minion_status() {
 	local minion="$1"
@@ -17,17 +17,65 @@ include:
  - logstash.ssl
  - elasticfleet.config
  - elasticfleet.sostatus
-{%- if GLOBALS.role != "so-fleet" %}
-  - elasticfleet.manager
-{%- endif %}

-{% if GLOBALS.role != "so-fleet" %}
+{% if grains.role not in ['so-fleet'] %}
 # Wait for Elasticsearch to be ready - no reason to try running Elastic Fleet server if ES is not ready
 wait_for_elasticsearch_elasticfleet:
  cmd.run:
    - name: so-elasticsearch-wait
+{% endif %}
+
+# If enabled, automatically update Fleet Logstash Outputs
+{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-import', 'so-eval', 'so-fleet'] %}
+so-elastic-fleet-auto-configure-logstash-outputs:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-fleet-outputs-update
+    - retry:
+        attempts: 4
+        interval: 30
+
+{# Separate from above in order to catch elasticfleet-logstash.crt changes and force update to fleet output policy #}
+so-elastic-fleet-auto-configure-logstash-outputs-force:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-fleet-outputs-update --certs
+    - retry:
+        attempts: 4
+        interval: 30
+    - onchanges:
+        - x509: etc_elasticfleet_logstash_crt
+        - x509: elasticfleet_kafka_crt
+{% endif %}
+
+# If enabled, automatically update Fleet Server URLs & ES Connection
+{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-fleet'] %}
+so-elastic-fleet-auto-configure-server-urls:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-fleet-urls-update
+    - retry:
+        attempts: 4
+        interval: 30
+{% endif %}
+
+# Automatically update Fleet Server Elasticsearch URLs & Agent Artifact URLs
+{% if grains.role not in ['so-fleet'] %}
+so-elastic-fleet-auto-configure-elasticsearch-urls:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-fleet-es-url-update
+    - retry:
+        attempts: 4
+        interval: 30
+
+so-elastic-fleet-auto-configure-artifact-urls:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-fleet-artifacts-url-update
+    - retry:
+        attempts: 4
+        interval: 30
+
+{% endif %}

 # Sync Elastic Agent artifacts to Fleet Node
+{% if grains.role in ['so-fleet'] %}
 elasticagent_syncartifacts:
  file.recurse:
    - name: /nsm/elastic-fleet/artifacts/beats
@@ -101,6 +149,57 @@ so-elastic-fleet:
      - x509: etc_elasticfleet_crt
 {%   endif %}

+{%  if GLOBALS.role != "so-fleet" %}
+so-elastic-fleet-package-statefile:
+  file.managed:
+    - name: /opt/so/state/elastic_fleet_packages.txt
+    - contents: {{ELASTICFLEETMERGED.packages}}
+
+so-elastic-fleet-package-upgrade:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-fleet-package-upgrade
+    - retry:
+        attempts: 3
+        interval: 10
+    - onchanges:
+      - file: /opt/so/state/elastic_fleet_packages.txt
+
+so-elastic-fleet-integrations:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-fleet-integration-policy-load
+    - retry:
+        attempts: 3
+        interval: 10
+
+so-elastic-agent-grid-upgrade:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-agent-grid-upgrade
+    - retry:
+        attempts: 12
+        interval: 5
+
+so-elastic-fleet-integration-upgrade:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-fleet-integration-upgrade
+    - retry:
+        attempts: 3
+        interval: 10
+
+{# Optional integrations script doesn't need the retries like so-elastic-fleet-integration-upgrade which loads the default integrations #}
+so-elastic-fleet-addon-integrations:
+  cmd.run:
+    - name: /usr/sbin/so-elastic-fleet-optional-integrations-load
+
+{%   if ELASTICFLEETMERGED.config.defend_filters.enable_auto_configuration %}
+so-elastic-defend-manage-filters-file-watch:
+  cmd.run:
+    - name: python3 /sbin/so-elastic-defend-manage-filters.py -c /opt/so/conf/elasticsearch/curl.config -d /opt/so/conf/elastic-fleet/defend-exclusions/disabled-filters.yaml -i /nsm/securityonion-resources/event_filters/ -i /opt/so/conf/elastic-fleet/defend-exclusions/rulesets/custom-filters/ &>> /opt/so/log/elasticfleet/elastic-defend-manage-filters.log
+    - onchanges:
+      - file: elasticdefendcustom
+      - file: elasticdefenddisabled
+{%    endif %}
+{%  endif %}
+
 delete_so-elastic-fleet_so-status.disabled:
  file.uncomment:
    - name: /opt/so/conf/so-status/so-status.conf
@@ -1,112 +0,0 @@
-# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
-# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at 
-# https://securityonion.net/license; you may not use this file except in compliance with the
-# Elastic License 2.0.
-
-{% from 'allowed_states.map.jinja' import allowed_states %}
-{% if sls in allowed_states %}
-{%   from 'elasticfleet/map.jinja' import ELASTICFLEETMERGED %}
-
-include:
-  - elasticfleet.config
-
-# If enabled, automatically update Fleet Logstash Outputs
-{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-import', 'so-eval'] %}
-so-elastic-fleet-auto-configure-logstash-outputs:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-fleet-outputs-update
-    - retry:
-        attempts: 4
-        interval: 30
-
-{# Separate from above in order to catch elasticfleet-logstash.crt changes and force update to fleet output policy #}
-so-elastic-fleet-auto-configure-logstash-outputs-force:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-fleet-outputs-update --certs
-    - retry:
-        attempts: 4
-        interval: 30
-    - onchanges:
-        - x509: etc_elasticfleet_logstash_crt
-        - x509: elasticfleet_kafka_crt
-{% endif %}
-
-# If enabled, automatically update Fleet Server URLs & ES Connection
-so-elastic-fleet-auto-configure-server-urls:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-fleet-urls-update
-    - retry:
-        attempts: 4
-        interval: 30
-
-# Automatically update Fleet Server Elasticsearch URLs & Agent Artifact URLs
-so-elastic-fleet-auto-configure-elasticsearch-urls:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-fleet-es-url-update
-    - retry:
-        attempts: 4
-        interval: 30
-
-so-elastic-fleet-auto-configure-artifact-urls:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-fleet-artifacts-url-update
-    - retry:
-        attempts: 4
-        interval: 30
-
-so-elastic-fleet-package-statefile:
-  file.managed:
-    - name: /opt/so/state/elastic_fleet_packages.txt
-    - contents: {{ELASTICFLEETMERGED.packages}}
-
-so-elastic-fleet-package-upgrade:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-fleet-package-upgrade
-    - retry:
-        attempts: 3
-        interval: 10
-    - onchanges:
-      - file: /opt/so/state/elastic_fleet_packages.txt
-
-so-elastic-fleet-integrations:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-fleet-integration-policy-load
-    - retry:
-        attempts: 3
-        interval: 10
-
-so-elastic-agent-grid-upgrade:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-agent-grid-upgrade
-    - retry:
-        attempts: 12
-        interval: 5
-
-so-elastic-fleet-integration-upgrade:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-fleet-integration-upgrade
-    - retry:
-        attempts: 3
-        interval: 10
-
-{# Optional integrations script doesn't need the retries like so-elastic-fleet-integration-upgrade which loads the default integrations #}
-so-elastic-fleet-addon-integrations:
-  cmd.run:
-    - name: /usr/sbin/so-elastic-fleet-optional-integrations-load
-
-{% if ELASTICFLEETMERGED.config.defend_filters.enable_auto_configuration %}
-so-elastic-defend-manage-filters-file-watch:
-  cmd.run:
-    - name: python3 /sbin/so-elastic-defend-manage-filters.py -c /opt/so/conf/elasticsearch/curl.config -d /opt/so/conf/elastic-fleet/defend-exclusions/disabled-filters.yaml -i /nsm/securityonion-resources/event_filters/ -i /opt/so/conf/elastic-fleet/defend-exclusions/rulesets/custom-filters/ &>> /opt/so/log/elasticfleet/elastic-defend-manage-filters.log
-    - onchanges:
-      - file: elasticdefendcustom
-      - file: elasticdefenddisabled
-{% endif %}
-
-{% else %}
-
-{{sls}}_state_not_allowed:
-  test.fail_without_changes:
-    - name: {{sls}}_state_not_allowed
-
-{% endif %}
@@ -4,7 +4,7 @@
 # Elastic License 2.0.

 {% from 'allowed_states.map.jinja' import allowed_states %}
-{% if sls in allowed_states %}
+{% if sls.split('.')[0] in allowed_states %}
 {%   from 'vars/globals.map.jinja' import GLOBALS %}
 {%   from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
 {%   from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS, SO_MANAGED_INDICES %}
@@ -17,7 +17,7 @@ include:
  - elasticsearch.ssl
  - elasticsearch.config
  - elasticsearch.sostatus
-{%- if GLOBALS.role != "so-searchode" %}
+{%- if GLOBALS.role != 'so-searchode' %}
  - elasticsearch.cluster
 {%- endif%}

@@ -102,6 +102,11 @@ so-elasticsearch:
      - cmd: auth_users_roles_inode
      - cmd: auth_users_inode

+delete_so-elasticsearch_so-status.disabled:
+  file.uncomment:
+    - name: /opt/so/conf/so-status/so-status.conf
+    - regex: ^so-elasticsearch$
+
 wait_for_so-elasticsearch:
  http.wait_for_successful_query:
    - name: "https://localhost:9200/"
@@ -112,14 +117,10 @@ wait_for_so-elasticsearch:
    - status: 200
    - wait_for: 300
    - request_interval: 15
+    - backend: requests
    - require:
      - docker_container: so-elasticsearch

-delete_so-elasticsearch_so-status.disabled:
-  file.uncomment:
-    - name: /opt/so/conf/so-status/so-status.conf
-    - regex: ^so-elasticsearch$
-
 {% else %}

 {{sls}}_state_not_allowed:
@@ -103,13 +103,11 @@ load_component_templates() {
    local pattern="${ELASTICSEARCH_TEMPLATES_DIR}/component/$2"
    local append_mappings="${3:-"false"}"

+    # current state of nullglob shell option
+    shopt -q nullglob && nullglob_set=1 || nullglob_set=0
+
+    shopt -s nullglob
    echo -e "\nLoading $printed_name component templates...\n"
-
-    if ! compgen -G "${pattern}/*.json" > /dev/null; then
-        echo "No $printed_name component templates found in ${pattern}, skipping."
-        return
-    fi
-
    for component in "$pattern"/*.json; do
        tmpl_name=$(basename "${component%.json}")

@@ -123,6 +121,11 @@ load_component_templates() {
            SO_LOAD_FAILURES_NAMES+=("$component")
        fi
    done
+
+    # restore nullglob shell option if needed
+    if [[ $nullglob_set -eq 1 ]]; then
+        shopt -u nullglob
+    fi
 }

 check_elasticsearch_responsive() {
@@ -133,32 +136,7 @@ check_elasticsearch_responsive() {
        fail "Elasticsearch is not responding. Please review Elasticsearch logs /opt/so/log/elasticsearch/securityonion.log for more details. Additionally, consider running so-elasticsearch-troubleshoot."
 }

-index_templates_exist() {
-    local templates_dir="$1"
-
-    if [[ ! -d "$templates_dir" ]]; then
-        return 1
-    fi
-
-    compgen -G "${templates_dir}/*.json" > /dev/null
-}
-
-should_load_addon_templates() {
-    if [[ "$IS_HEAVYNODE" == "true" ]]; then
-        return 1
-    fi
-
-    # Skip statefile checks when forcing template load
-    if [[ "$FORCE" != "true" ]]; then
-        if [[ ! -f "$SO_STATEFILE_SUCCESS" || -f "$ADDON_STATEFILE_SUCCESS" ]]; then
-            return 1
-        fi
-    fi
-
-    index_templates_exist "$ADDON_TEMPLATES_DIR"
-}
-
-if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_exist "$SO_TEMPLATES_DIR"; then
+if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]]; then
    check_elasticsearch_responsive

    if [[ "$IS_HEAVYNODE" == "false" ]]; then
@@ -223,14 +201,13 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
            fail "Failed to load all Security Onion core templates successfully."
        fi
    fi
-elif ! index_templates_exist "$SO_TEMPLATES_DIR"; then
-    echo "No Security Onion core index templates found in ${SO_TEMPLATES_DIR}, skipping."
-elif [[ -f "$SO_STATEFILE_SUCCESS" ]]; then
+else
+
    echo "Security Onion core templates already loaded"
 fi

 # Start loading addon templates
-if should_load_addon_templates; then
+if [[ (-d "$ADDON_TEMPLATES_DIR" && -f "$SO_STATEFILE_SUCCESS" && "$IS_HEAVYNODE" == "false" && ! -f "$ADDON_STATEFILE_SUCCESS") || (-d "$ADDON_TEMPLATES_DIR" && "$IS_HEAVYNODE" == "false" && "$FORCE" == "true") ]]; then

    check_elasticsearch_responsive

@@ -22,7 +22,7 @@ kibana:
          - default
          - file
    migrations:
-      discardCorruptObjects: "9.3.3"
+      discardCorruptObjects: "8.18.8"
    telemetry:
      enabled: False
    xpack:
@@ -202,10 +202,10 @@ check_service_status() {
 	systemctl status $service_name > /dev/null 2>&1
 	local status=$?
 	if [ $status -gt 0 ]; then
-		info "$service_name is not running" 
+		info "  $service_name is not running" 
 		return 1;
 	else
-		info "$service_name is running"
+		info "  $service_name is running"
 		return 0;
 	fi

@@ -1541,10 +1541,28 @@ clear_previous_setup_results() {
 reinstall_init() {
 	info "Putting system in state to run setup again"

-	# Always include both services. check_service_status skips units that aren't present.
-	local salt_services=( "salt-master" "salt-minion" )
+	if [[ $install_type =~ ^(MANAGER|EVAL|MANAGERSEARCH|MANAGERHYPE|STANDALONE|FLEET|IMPORT)$ ]]; then
+		local salt_services=( "salt-master" "salt-minion" )
+	else
+		local salt_services=( "salt-minion" )
+	fi
+
+	local service_retry_count=20

 	{
+		# Snapshot pre-reinstall salt state before any destructive step so a
+		# failed reinstall leaves a usable post-mortem in the setup log.
+		echo "=== pre-reinstall salt diagnostic $(date -Iseconds) ==="
+		systemctl status salt-master --no-pager 2>&1 | head -40 || true
+		systemctl status salt-minion --no-pager 2>&1 | head -40 || true
+		journalctl -u salt-master --no-pager --since "-10 minutes" 2>&1 | tail -80 || true
+		journalctl -u salt-minion --no-pager --since "-10 minutes" 2>&1 | tail -80 || true
+		ls -laR /etc/salt 2>&1 | head -60 || true
+		ls -la /var/cache/salt 2>&1 | head -40 || true
+		[[ -f /etc/salt/master.rpmnew ]] && diff -u /etc/salt/master /etc/salt/master.rpmnew 2>&1 | head -80 || true
+		[[ -f /etc/salt/minion.rpmnew ]] && diff -u /etc/salt/minion /etc/salt/minion.rpmnew 2>&1 | head -40 || true
+		echo "=== end diagnostic ==="
+
 		# remove all of root's cronjobs
 		crontab -r -u root

@@ -1558,51 +1576,31 @@ reinstall_init() {

 		salt-call state.apply ca.remove -linfo --local --file-root=../salt

-		# Stop salt services and force-kill any lingering salt processes (including orphans
-		# from an earlier reinstall attempt where the unit file is gone but processes survive)
-		# so dnf remove salt can run cleanly
+		# Kill any salt processes (safely)
 		for service in "${salt_services[@]}"; do
+			# Stop the service in the background so we can exit after a certain amount of time
 			if check_service_status "$service"; then
-				info "Stopping $service via systemctl"
-				systemctl stop "$service"
+				systemctl stop "$service" &
 			fi
+			local pid=$!
+
+			local count=0
+			while check_service_status "$service"; do
+				if [[ $count -gt $service_retry_count ]]; then
+					echo "Could not stop $service after 1 minute, exiting setup."
+
+					# Stop the systemctl process trying to kill the service, show user a message, then exit setup
+					kill -9 $pid
+					fail_setup
+				fi
+
+				sleep 5
+				((count++))
+			done
 		done

-		# Unconditionally force-kill any remaining salt binaries — these may be orphaned
-		# from a prior aborted reinstall (no unit file, so systemctl can't see them).
-		for salt_bin in salt-master salt-minion salt-call salt-cloud; do
-			if pgrep -f "/usr/bin/${salt_bin}" > /dev/null 2>&1; then
-				info "Force-killing lingering $salt_bin processes"
-				pkill -9 -ef "/usr/bin/${salt_bin}" 2>/dev/null
-			fi
-		done
-		# Catch stray `salt` CLI children from saltutil.kill_all_jobs / state.apply invocations
-		pkill -9 -ef "/usr/bin/python3 /bin/salt" 2>/dev/null
-
-		# Give the kernel a moment to reap the killed processes before dnf removes the binaries
-		local kill_wait=0
-		while pgrep -f "/usr/bin/salt-" > /dev/null 2>&1; do
-			if [[ $kill_wait -gt 10 ]]; then
-				info "Salt processes still present after SIGKILL + 10s wait; proceeding anyway"
-				pgrep -af "/usr/bin/salt-" | while read -r line; do info "  lingering: $line"; done
-				break
-			fi
-			sleep 1
-			((kill_wait++))
-		done
-
-		# Clear the 'failed' state SIGKILL left on the units before removing the package
-		systemctl reset-failed salt-master.service salt-minion.service 2>/dev/null || true
-
-		# Remove all salt configs
+		# Uninstall salt so configs and directories are removed and reinstall reconfigures directories
 		dnf -y remove salt
-		rm -rf /etc/salt/ /var/cache/salt/
-
-		# Drop systemd's in-memory references to the now-removed units
-		systemctl daemon-reload
-
-		# Uninstall local Elastic Agent, if installed
-		elastic-agent uninstall -f

 		if command -v docker &> /dev/null; then
 			# Stop and remove all so-* containers so files can be changed with more safety
@@ -1626,7 +1624,10 @@ reinstall_init() {
 		backup_dir /nsm/hydra "$date_string"
 		backup_dir /nsm/influxdb "$date_string"

-	} 2>&1 | tee -a "$setup_log"
+		# Uninstall local Elastic Agent, if installed
+		elastic-agent uninstall -f
+
+	} >> "$setup_log" 2>&1

 	info "System reinstall init has been completed."
 }
@@ -219,7 +219,7 @@ if [ -n "$test_profile" ]; then
 	WEBUSER=onionuser@somewhere.invalid
 	WEBPASSWD1=0n10nus3r
 	WEBPASSWD2=0n10nus3r
-	NODE_DESCRIPTION="${HOSTNAME} - ${install_type} - ${MSRVIP_OFFSET}"
+	NODE_DESCRIPTION="${HOSTNAME} - ${install_type} - ${MAINIP}"

 	update_sudoers_for_testing
 fi
@@ -724,10 +724,18 @@ if ! [[ -f $install_opt_file ]]; then
 		# Install salt
 		saltify
 		check_sos_appliance
+		# Wait for salt-master to be actually running and have its PKI
+		# ready after a fresh saltify. Without this, salt-key operations
+		# silently race the daemon and the key accept no-ops, which is
+		# what was causing reinstalls on 3.x to hang on state.show_top.
+		retry 30 2 "test -f /etc/salt/pki/master/master.pub" \
+			|| fail "salt-master did not initialize PKI after saltify"
+		check_salt_master_status \
+			|| fail "salt-master not accepting calls after saltify"
+
 		logCmd "salt-key -yd $MINION_ID"
-		sleep 2 # Debug RSA Key format errors
-		logCmd "salt-call state.show_top"
-		sleep 2 # Debug RSA Key format errors
+		wait_for_minion_key_pending "$MINION_ID" 30 2 \
+			|| fail "salt-minion never presented its key to salt-master"
 		logCmd "salt-key -ya $MINION_ID"
 		logCmd "salt-call saltutil.sync_all"
 		# we need to sync the runner and generate the soqemussh user keys so that first highstate after license created