Compare commits


20 Commits

Author SHA1 Message Date
Josh Patterson 199c2746f1 stop salt-minion and salt-master regardless of install type. display reinstall on console and save to logfile 2026-04-24 15:24:11 -04:00
Josh Patterson 8eca465ef6 uninstall elastic-agent before stopping dockers on reinstall 2026-04-24 14:35:11 -04:00
Josh Patterson 02381fbbe9 stop salt-cloud, belt-and-suspenders against a broken/incomplete salt RPM 2026-04-24 11:33:21 -04:00
Josh Patterson 0722b681b1 redo service stop on reinstall 2026-04-24 11:04:46 -04:00
Josh Patterson 564815e836 redo how services are stopped during reinstall 2026-04-24 10:46:29 -04:00
Jason Ertel ba55468da8 Merge pull request #15822 from Security-Onion-Solutions/jertel/wip
numeric test description
2026-04-24 08:26:55 -04:00
Jason Ertel cdd217283d numeric test description 2026-04-24 08:13:36 -04:00
Jorge Reyes 810a582717 Merge pull request #15813 from Security-Onion-Solutions/reyesj2-es933
split up Elastic Fleet state
2026-04-23 14:51:32 -05:00
Mike Reeves 5f35554fdc Merge pull request #15712 from Security-Onion-Solutions/soupfix
Fix soup
2026-04-23 12:39:50 -04:00
reyesj2 fdfca469cc prevent non-manager nodes from running elasticsearch.cluster state manually 2026-04-23 09:53:07 -05:00
reyesj2 5f2ec76ba8 prevent fleetnode from being able to run elasticfleet.manager state manually 2026-04-23 09:50:45 -05:00
reyesj2 b015c8ff14 remove docker import 2026-04-23 09:31:30 -05:00
reyesj2 7e70870a9e remove globals import 2026-04-23 09:25:36 -05:00
reyesj2 22b32a16dd include elasticfleet.config 2026-04-23 08:30:47 -05:00
Josh Patterson cd6707a566 Merge pull request #15800 from Security-Onion-Solutions/feature/vm-raid-status
monitor raid for vms
2026-04-22 09:42:44 -04:00
Josh Patterson edd207a9d5 soup update socloud.conf 2026-04-22 09:20:53 -04:00
Jorge Reyes 01bd3b6e06 Merge pull request #15807 from Security-Onion-Solutions/reyesj2-es933
urlencode elasticsearch version
2026-04-21 14:11:04 -05:00
Josh Patterson 7f93110d68 Merge remote-tracking branch 'origin/3/dev' into feature/vm-raid-status 2026-04-21 10:10:38 -04:00
Josh Patterson ee437265fc monitor raid for vms 2026-04-20 12:00:02 -04:00
Mike Reeves 664f3fd18a Fix soup 2026-04-01 14:47:05 -04:00
9 changed files with 75 additions and 49 deletions
+2
@@ -33,6 +33,8 @@
   'kratos',
   'hydra',
   'elasticfleet',
+  'elasticfleet.manager',
+  'elasticsearch.cluster',
   'elastic-fleet-package-registry',
   'utility'
 ] %}
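
These two additions pair with the gating change in the elasticfleet and elasticsearch states later in this diff: once a state checks the full sls name against allowed_states instead of only its first segment, each sub-state has to be listed here explicitly before it can be applied by hand. A minimal sketch of checking the effect from a minion, using stock salt-call commands (the role behavior is per the commit messages, not shown in this hunk):

    # Renders on a manager; renders empty on excluded roles, so a manual
    # run becomes a no-op instead of a misconfiguration.
    salt-call state.show_sls elasticsearch.cluster
    salt-call state.apply elasticfleet.manager test=True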
+9 -2
@@ -9,7 +9,7 @@
 . /usr/sbin/so-common
-software_raid=("SOSMN" "SOSMN-DE02" "SOSSNNV" "SOSSNNV-DE02" "SOS10k-DE02" "SOS10KNV" "SOS10KNV-DE02" "SOS10KNV-DE02" "SOS2000-DE02" "SOS-GOFAST-LT-DE02" "SOS-GOFAST-MD-DE02" "SOS-GOFAST-HV-DE02")
+software_raid=("SOSMN" "SOSMN-DE02" "SOSSNNV" "SOSSNNV-DE02" "SOS10k-DE02" "SOS10KNV" "SOS10KNV-DE02" "SOS10KNV-DE02" "SOS2000-DE02" "SOS-GOFAST-LT-DE02" "SOS-GOFAST-MD-DE02" "SOS-GOFAST-HV-DE02" "HVGUEST")
 hardware_raid=("SOS1000" "SOS1000F" "SOSSN7200" "SOS5000" "SOS4000")
 {%- if salt['grains.get']('sosmodel', '') %}
@@ -87,6 +87,11 @@ check_boss_raid() {
 }
 check_software_raid() {
+  if [[ ! -f /proc/mdstat ]]; then
+    SWRAID=0
+    return
+  fi
   SWRC=$(grep "_" /proc/mdstat)
   if [[ -n $SWRC ]]; then
     # RAID is failed in some way
@@ -107,7 +112,9 @@ if [[ "$is_hwraid" == "true" ]]; then
 fi
 if [[ "$is_softwareraid" == "true" ]]; then
   check_software_raid
-  check_boss_raid
+  if [ "$model" != "HVGUEST" ]; then
+    check_boss_raid
+  fi
 fi
 sum=$(($SWRAID + $BOSSRAID + $HWRAID))
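
check_software_raid reads /proc/mdstat, where a degraded md array marks a failed or missing member with an underscore in its status column; the new early return covers hosts such as HVGUEST guests that may have no md devices (and thus no /proc/mdstat) at all. A quick illustration of what the grep keys on, with made-up example output:

    cat /proc/mdstat
    # md0 : active raid1 sdb1[1] sda1[0]
    #       1953382464 blocks [2/1] [U_]   <- "_" means a failed/missing member
    grep "_" /proc/mdstat && echo "software RAID degraded"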
+1 -1
@@ -21,7 +21,7 @@ include:
   - elasticfleet.manager
 {%- endif %}
-{% if GLOBALS.role not in ['so-fleet'] %}
+{% if GLOBALS.role != "so-fleet" %}
 # Wait for Elasticsearch to be ready - no reason to try running Elastic Fleet server if ES is not ready
 wait_for_elasticsearch_elasticfleet:
   cmd.run:
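
The cmd.run block is truncated in this view, but its stated purpose is a bounded wait for Elasticsearch before the Fleet server starts. As a rough standalone sketch of that idea (endpoint, flags, and retry budget are assumptions, not taken from the diff):

    # Hypothetical readiness poll: give up after ~5 minutes.
    for i in $(seq 1 30); do
      curl -sk --output /dev/null "https://localhost:9200" && break
      sleep 10
    done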
+4 -3
@@ -4,11 +4,12 @@
 # Elastic License 2.0.
 {% from 'allowed_states.map.jinja' import allowed_states %}
-{% if sls.split('.')[0] in allowed_states %}
-{% from 'vars/globals.map.jinja' import GLOBALS %}
-{% from 'docker/docker.map.jinja' import DOCKERMERGED %}
+{% if sls in allowed_states %}
 {% from 'elasticfleet/map.jinja' import ELASTICFLEETMERGED %}
+include:
+  - elasticfleet.config
 # If enabled, automatically update Fleet Logstash Outputs
 {% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-import', 'so-eval'] %}
 so-elastic-fleet-auto-configure-logstash-outputs:
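
Because elasticfleet.manager now includes elasticfleet.config, applying the manager state by hand can no longer skip the configuration prerequisites. One way to confirm the rendered include chain on a manager, as a sketch:

    # show_sls renders the state (followed includes and all) without applying it
    salt-call state.show_sls elasticfleet.manager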
+1 -1
@@ -4,7 +4,7 @@
 # Elastic License 2.0.
 {% from 'allowed_states.map.jinja' import allowed_states %}
-{% if sls.split('.')[0] in allowed_states %}
+{% if sls in allowed_states %}
 {% from 'vars/globals.map.jinja' import GLOBALS %}
 {% from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
 {% from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS, SO_MANAGED_INDICES %}
+12 -9
@@ -24,6 +24,14 @@ BACKUPTOPFILE=/opt/so/saltstack/default/salt/top.sls.backup
 SALTUPGRADED=false
 SALT_CLOUD_INSTALLED=false
 SALT_CLOUD_CONFIGURED=false
+# Check if salt-cloud is installed
+if rpm -q salt-cloud &>/dev/null; then
+  SALT_CLOUD_INSTALLED=true
+fi
+# Check if salt-cloud is configured
+if [[ -f /etc/salt/cloud.profiles.d/socloud.conf ]]; then
+  SALT_CLOUD_CONFIGURED=true
+fi
 # used to display messages to the user at the end of soup
 declare -a FINAL_MESSAGE_QUEUE=()
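
These checks previously lived inside upgrade_salt (removed in the third hunk below); hoisting them to the top of soup means SALT_CLOUD_INSTALLED and SALT_CLOUD_CONFIGURED are populated even on runs that never upgrade Salt, which post_to_3.1.0 depends on. The probe works because rpm -q exits nonzero when the package is absent:

    rpm -q salt-cloud; echo "exit=$?"   # 0 when installed, nonzero otherwise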
@@ -489,6 +497,10 @@ up_to_3.1.0() {
 post_to_3.1.0() {
   /usr/sbin/so-kibana-space-defaults
+  # ensure manager has new version of socloud.conf
+  if [[ $SALT_CLOUD_CONFIGURED == true ]]; then
+    salt-call state.apply salt.cloud.config concurrent=True
+  fi
   POSTVERSION=3.1.0
 }
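
The concurrent=True flag matters here because soup commonly runs while another Salt job holds the state-run lock; without it, salt-call would refuse to start a second state run. Illustrative shape of the failure it sidesteps (PID and wording vary by Salt version):

    salt-call state.apply salt.cloud.config
    # local: The function "state.apply" is running as PID 1234 ...
    salt-call state.apply salt.cloud.config concurrent=True   # bypasses the lock check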
@@ -663,15 +675,6 @@ upgrade_check_salt() {
 upgrade_salt() {
   echo "Performing upgrade of Salt from $INSTALLEDSALTVERSION to $NEWSALTVERSION."
   echo ""
-  # Check if salt-cloud is installed
-  if rpm -q salt-cloud &>/dev/null; then
-    SALT_CLOUD_INSTALLED=true
-  fi
-  # Check if salt-cloud is configured
-  if [[ -f /etc/salt/cloud.profiles.d/socloud.conf ]]; then
-    SALT_CLOUD_CONFIGURED=true
-  fi
   echo "Removing yum versionlock for Salt."
   echo ""
   yum versionlock delete "salt"
+1
@@ -27,6 +27,7 @@ sool9_{{host}}:
     log_file: /opt/so/log/salt/minion
     grains:
       hypervisor_host: {{host ~ "_" ~ role}}
+      sosmodel: HVGUEST
   preflight_cmds:
     - |
 {%- set hostnames = [MANAGERHOSTNAME] %}
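
This new grain is what the so-raid-status changes above key off: HVGUEST lands in the software_raid array (where the new /proc/mdstat guard exits cleanly if no md devices exist) and is excluded from check_boss_raid, since a VM guest has no BOSS controller to query. Verifying on a guest, as a sketch:

    salt-call grains.get sosmodel   # expected: HVGUEST on hypervisor guests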
+44 -32
@@ -202,10 +202,10 @@ check_service_status() {
   systemctl status $service_name > /dev/null 2>&1
   local status=$?
   if [ $status -gt 0 ]; then
-    info " $service_name is not running"
+    info "$service_name is not running"
     return 1;
   else
-    info " $service_name is running"
+    info "$service_name is running"
     return 0;
   fi
@@ -1541,13 +1541,8 @@ clear_previous_setup_results() {
 reinstall_init() {
   info "Putting system in state to run setup again"
-  if [[ $install_type =~ ^(MANAGER|EVAL|MANAGERSEARCH|MANAGERHYPE|STANDALONE|FLEET|IMPORT)$ ]]; then
-    local salt_services=( "salt-master" "salt-minion" )
-  else
-    local salt_services=( "salt-minion" )
-  fi
-  local service_retry_count=20
+  # Always include both services. check_service_status skips units that aren't present.
+  local salt_services=( "salt-master" "salt-minion" )
   {
     # remove all of root's cronjobs
@@ -1563,31 +1558,51 @@
     salt-call state.apply ca.remove -linfo --local --file-root=../salt
-    # Kill any salt processes (safely)
+    # Stop salt services and force-kill any lingering salt processes (including orphans
+    # from an earlier reinstall attempt where the unit file is gone but processes survive)
+    # so dnf remove salt can run cleanly
     for service in "${salt_services[@]}"; do
-      # Stop the service in the background so we can exit after a certain amount of time
       if check_service_status "$service"; then
-        systemctl stop "$service" &
+        info "Stopping $service via systemctl"
+        systemctl stop "$service"
       fi
-      local pid=$!
-      local count=0
-      while check_service_status "$service"; do
-        if [[ $count -gt $service_retry_count ]]; then
-          echo "Could not stop $service after 1 minute, exiting setup."
-          # Stop the systemctl process trying to kill the service, show user a message, then exit setup
-          kill -9 $pid
-          fail_setup
-        fi
-        sleep 5
-        ((count++))
-      done
     done
+    # Unconditionally force-kill any remaining salt binaries — these may be orphaned
+    # from a prior aborted reinstall (no unit file, so systemctl can't see them).
+    for salt_bin in salt-master salt-minion salt-call salt-cloud; do
+      if pgrep -f "/usr/bin/${salt_bin}" > /dev/null 2>&1; then
+        info "Force-killing lingering $salt_bin processes"
+        pkill -9 -ef "/usr/bin/${salt_bin}" 2>/dev/null
+      fi
+    done
+    # Catch stray `salt` CLI children from saltutil.kill_all_jobs / state.apply invocations
+    pkill -9 -ef "/usr/bin/python3 /bin/salt" 2>/dev/null
+    # Give the kernel a moment to reap the killed processes before dnf removes the binaries
+    local kill_wait=0
+    while pgrep -f "/usr/bin/salt-" > /dev/null 2>&1; do
+      if [[ $kill_wait -gt 10 ]]; then
+        info "Salt processes still present after SIGKILL + 10s wait; proceeding anyway"
+        pgrep -af "/usr/bin/salt-" | while read -r line; do info "  lingering: $line"; done
+        break
+      fi
+      sleep 1
+      ((kill_wait++))
+    done
+    # Clear the 'failed' state SIGKILL left on the units before removing the package
+    systemctl reset-failed salt-master.service salt-minion.service 2>/dev/null || true
     # Remove all salt configs
-    rm -rf /etc/salt/engines/* /etc/salt/grains /etc/salt/master /etc/salt/master.d/* /etc/salt/minion /etc/salt/minion.d/* /etc/salt/pki/* /etc/salt/proxy /etc/salt/proxy.d/* /var/cache/salt/
     dnf -y remove salt
+    rm -rf /etc/salt/ /var/cache/salt/
+    # Drop systemd's in-memory references to the now-removed units
+    systemctl daemon-reload
+    # Uninstall local Elastic Agent, if installed
+    elastic-agent uninstall -f
     if command -v docker &> /dev/null; then
       # Stop and remove all so-* containers so files can be changed with more safety
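
A note on the process-cleanup idiom in the hunk above: -f makes pgrep and pkill match against the full command line rather than the truncated comm name, which is how daemons with no surviving unit file are still found, and pkill -e echoes each process it kills into the (now tee'd) setup log. A small sketch of the pattern:

    pgrep -af "/usr/bin/salt-"            # -a prints pid plus full command line
    pkill -9 -ef "/usr/bin/salt-minion"   # -e reports each process it kills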
@@ -1611,10 +1626,7 @@ reinstall_init() {
     backup_dir /nsm/hydra "$date_string"
     backup_dir /nsm/influxdb "$date_string"
-    # Uninstall local Elastic Agent, if installed
-    elastic-agent uninstall -f
-  } >> "$setup_log" 2>&1
+  } 2>&1 | tee -a "$setup_log"
   info "System reinstall init has been completed."
 }
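
The redirection swap at the close of the block is what puts reinstall output on the console: the old form wrote stdout and stderr to the logfile only, while "2>&1 | tee -a" duplicates the combined stream to the terminal and appends it to the log. A minimal sketch of the construct:

    {
      echo "visible on the console and in the log"
    } 2>&1 | tee -a "$setup_log"   # -a appends rather than truncating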
+1 -1
@@ -219,7 +219,7 @@ if [ -n "$test_profile" ]; then
   WEBUSER=onionuser@somewhere.invalid
   WEBPASSWD1=0n10nus3r
   WEBPASSWD2=0n10nus3r
-  NODE_DESCRIPTION="${HOSTNAME} - ${install_type} - ${MAINIP}"
+  NODE_DESCRIPTION="${HOSTNAME} - ${install_type} - ${MSRVIP_OFFSET}"
   update_sudoers_for_testing
 fi