Compare commits

...

12 Commits

Author SHA1 Message Date
Mike Reeves cab57edf7c Harden reinstall_init and add post-saltify readiness gate
- setup/so-functions: dump pre-reinstall salt state (systemctl /
  journalctl / ls /etc/salt / .rpmnew diff) to the setup log so a
  failed reinstall leaves a usable post-mortem; swap the manual
  rm -rf of /etc/salt/* for `dnf -y remove salt` so package configs
  get cleaned up properly.
- setup/so-setup: replace the `sleep 2 / state.show_top / sleep 2`
  dance after saltify with a readiness gate that waits for
  /etc/salt/pki/master/master.pub, runs check_salt_master_status,
  and then wait_for_minion_key_pending before salt-key -ya. Fixes
  reinstalls on 3.x timing out on "Unable to sign_in to master".
- salt/common/tools/sbin/so-common: add wait_for_minion_key_pending
  helper, polls `salt-key -l pre` until the minion appears.
2026-04-23 17:43:39 -04:00
Josh Patterson 8b2064f1c1 uninstall salt during reinstall_init 2026-04-22 16:40:47 -04:00
Josh Patterson cd6707a566 Merge pull request #15800 from Security-Onion-Solutions/feature/vm-raid-status
monitor raid for vms
2026-04-22 09:42:44 -04:00
Josh Patterson edd207a9d5 soup update socloud.conf 2026-04-22 09:20:53 -04:00
Jorge Reyes 01bd3b6e06 Merge pull request #15807 from Security-Onion-Solutions/reyesj2-es933
urlencode elasticsearch version
2026-04-21 14:11:04 -05:00
reyesj2 06a555fafb urlencode elasticsearch version 2026-04-21 14:01:31 -05:00
Jason Ertel 7411031e11 Merge pull request #15803 from Security-Onion-Solutions/jertel/wip
more error handling during image updates
2026-04-21 10:21:56 -04:00
Jason Ertel 247091766c more error handling during image updates 2026-04-21 10:18:05 -04:00
Josh Patterson 7f93110d68 Merge remote-tracking branch 'origin/3/dev' into feature/vm-raid-status 2026-04-21 10:10:38 -04:00
Jason Ertel 33ef138866 Merge pull request #15797 from Security-Onion-Solutions/jertel/wip
fix template annotation
2026-04-20 17:14:53 -04:00
Jason Ertel 71da27dc8e fix template annotation 2026-04-20 17:02:25 -04:00
Josh Patterson ee437265fc monitor raid for vms 2026-04-20 12:00:02 -04:00
9 changed files with 93 additions and 28 deletions
+23
View File
@@ -162,6 +162,29 @@ check_salt_master_status() {
return 0
}
# Wait until $minion shows up in the salt master's unaccepted-keys list.
# Used after saltify on a reinstall to replace the old `sleep 2 / state.show_top /
# sleep 2` dance — the new minion's key takes longer to appear than 2s on
# salt 3006.x and the subsequent salt-key -ya needs something to accept.
# Returns 0 as soon as the key is pending, 1 after attempts*delay seconds.
wait_for_minion_key_pending() {
local minion="$1"
local attempts="${2:-30}"
local delay="${3:-2}"
local count=0
while ! salt-key -l pre --out=json 2>/dev/null \
| python3 -c "import json,sys; d=json.load(sys.stdin); sys.exit(0 if '$minion' in d.get('minions_pre', []) else 1)" 2>/dev/null; do
((count+=1))
if [[ $count -ge $attempts ]]; then
echo "Gave up waiting for $minion to appear in salt-master's pending keys"
return 1
fi
sleep "$delay"
done
echo "Minion $minion is pending acceptance after $((count * delay))s"
return 0
}
# this is only intended to be used to check the status of the minion from a salt master
check_salt_minion_status() {
local minion="$1"
+8 -2
View File
@@ -186,8 +186,14 @@ update_docker_containers() {
if [ -z "$HOSTNAME" ]; then
HOSTNAME=$(hostname)
fi
docker tag $CONTAINER_REGISTRY/$IMAGEREPO/$image $HOSTNAME:5000/$IMAGEREPO/$image >> "$LOG_FILE" 2>&1
docker push $HOSTNAME:5000/$IMAGEREPO/$image >> "$LOG_FILE" 2>&1
docker tag $CONTAINER_REGISTRY/$IMAGEREPO/$image $HOSTNAME:5000/$IMAGEREPO/$image >> "$LOG_FILE" 2>&1 || {
echo "Unable to tag $image" >> "$LOG_FILE" 2>&1
exit 1
}
docker push $HOSTNAME:5000/$IMAGEREPO/$image >> "$LOG_FILE" 2>&1 || {
echo "Unable to push $image" >> "$LOG_FILE" 2>&1
exit 1
}
fi
else
echo "There is a problem downloading the $image image. Details: " >> "$LOG_FILE" 2>&1
+9 -2
View File
@@ -9,7 +9,7 @@
. /usr/sbin/so-common
software_raid=("SOSMN" "SOSMN-DE02" "SOSSNNV" "SOSSNNV-DE02" "SOS10k-DE02" "SOS10KNV" "SOS10KNV-DE02" "SOS10KNV-DE02" "SOS2000-DE02" "SOS-GOFAST-LT-DE02" "SOS-GOFAST-MD-DE02" "SOS-GOFAST-HV-DE02")
software_raid=("SOSMN" "SOSMN-DE02" "SOSSNNV" "SOSSNNV-DE02" "SOS10k-DE02" "SOS10KNV" "SOS10KNV-DE02" "SOS10KNV-DE02" "SOS2000-DE02" "SOS-GOFAST-LT-DE02" "SOS-GOFAST-MD-DE02" "SOS-GOFAST-HV-DE02" "HVGUEST")
hardware_raid=("SOS1000" "SOS1000F" "SOSSN7200" "SOS5000" "SOS4000")
{%- if salt['grains.get']('sosmodel', '') %}
@@ -87,6 +87,11 @@ check_boss_raid() {
}
check_software_raid() {
if [[ ! -f /proc/mdstat ]]; then
SWRAID=0
return
fi
SWRC=$(grep "_" /proc/mdstat)
if [[ -n $SWRC ]]; then
# RAID is failed in some way
@@ -107,7 +112,9 @@ if [[ "$is_hwraid" == "true" ]]; then
fi
if [[ "$is_softwareraid" == "true" ]]; then
check_software_raid
check_boss_raid
if [ "$model" != "HVGUEST" ]; then
check_boss_raid
fi
fi
sum=$(($SWRAID + $BOSSRAID + $HWRAID))
@@ -5,11 +5,12 @@
# this file except in compliance with the Elastic License 2.0.
. /usr/sbin/so-common
. /usr/sbin/so-elastic-fleet-common
{%- import_yaml 'elasticsearch/defaults.yaml' as ELASTICSEARCHDEFAULTS %}
{%- import_yaml 'elasticfleet/defaults.yaml' as ELASTICFLEETDEFAULTS %}
{# Optionally override Elasticsearch version for Elastic Agent patch releases #}
{%- if ELASTICFLEETDEFAULTS.elasticfleet.patch_version is defined %}
{%- do ELASTICSEARCHDEFAULTS.update({'elasticsearch': {'version': ELASTICFLEETDEFAULTS.elasticfleet.patch_version}}) %}
{%- do ELASTICSEARCHDEFAULTS.elasticsearch.update({'version': ELASTICFLEETDEFAULTS.elasticfleet.patch_version}) %}
{%- endif %}
# Only run on Managers
@@ -19,13 +20,10 @@ if ! is_manager_node; then
fi
# Get current list of Grid Node Agents that need to be upgraded
RAW_JSON=$(curl -K /opt/so/conf/elasticsearch/curl.config -L "http://localhost:5601/api/fleet/agents?perPage=20&page=1&kuery=NOT%20agent.version%3A%20{{ELASTICSEARCHDEFAULTS.elasticsearch.version}}%20AND%20policy_id%3A%20so-grid-nodes_%2A&showInactive=false&getStatusSummary=true" --retry 3 --retry-delay 30 --fail 2>/dev/null)
if ! RAW_JSON=$(fleet_api "agents?perPage=20&page=1&kuery=NOT%20agent.version%3A%20{{ELASTICSEARCHDEFAULTS.elasticsearch.version | urlencode }}%20AND%20policy_id%3A%20so-grid-nodes_%2A&showInactive=false&getStatusSummary=true" -H 'kbn-xsrf: true' -H 'Content-Type: application/json'); then
# Check to make sure that the server responded with good data - else, bail from script
CHECKSUM=$(jq -r '.page' <<< "$RAW_JSON")
if [ "$CHECKSUM" -ne 1 ]; then
printf "Failed to query for current Grid Agents...\n"
exit 1
printf "Failed to query for current Grid Agents...\n"
exit 1
fi
# Generate list of Node Agents that need updates
@@ -36,10 +34,12 @@ if [ "$OUTDATED_LIST" != '[]' ]; then
printf "Initiating upgrades for $AGENTNUMBERS Agents to Elastic {{ELASTICSEARCHDEFAULTS.elasticsearch.version}}...\n\n"
# Generate updated JSON payload
JSON_STRING=$(jq -n --arg ELASTICVERSION {{ELASTICSEARCHDEFAULTS.elasticsearch.version}} --arg UPDATELIST $OUTDATED_LIST '{"version": $ELASTICVERSION,"agents": $UPDATELIST }')
JSON_STRING=$(jq -n --arg ELASTICVERSION "{{ELASTICSEARCHDEFAULTS.elasticsearch.version}}" --argjson UPDATELIST "$OUTDATED_LIST" '{"version": $ELASTICVERSION,"agents": $UPDATELIST }')
# Update Node Agents
curl -K /opt/so/conf/elasticsearch/curl.config -L -X POST "http://localhost:5601/api/fleet/agents/bulk_upgrade" -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -d "$JSON_STRING"
if ! fleet_api "agents/bulk_upgrade" -XPOST -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -d "$JSON_STRING"; then
printf "Failed to initiate Agent upgrades...\n"
fi
else
printf "No Agents need updates... Exiting\n\n"
exit 0
+12 -9
View File
@@ -24,6 +24,14 @@ BACKUPTOPFILE=/opt/so/saltstack/default/salt/top.sls.backup
SALTUPGRADED=false
SALT_CLOUD_INSTALLED=false
SALT_CLOUD_CONFIGURED=false
# Check if salt-cloud is installed
if rpm -q salt-cloud &>/dev/null; then
SALT_CLOUD_INSTALLED=true
fi
# Check if salt-cloud is configured
if [[ -f /etc/salt/cloud.profiles.d/socloud.conf ]]; then
SALT_CLOUD_CONFIGURED=true
fi
# used to display messages to the user at the end of soup
declare -a FINAL_MESSAGE_QUEUE=()
@@ -489,6 +497,10 @@ up_to_3.1.0() {
post_to_3.1.0() {
/usr/sbin/so-kibana-space-defaults
# ensure manager has new version of socloud.conf
if [[ $SALT_CLOUD_CONFIGURED == true ]]; then
salt-call state.apply salt.cloud.config concurrent=True
fi
POSTVERSION=3.1.0
}
@@ -663,15 +675,6 @@ upgrade_check_salt() {
upgrade_salt() {
echo "Performing upgrade of Salt from $INSTALLEDSALTVERSION to $NEWSALTVERSION."
echo ""
# Check if salt-cloud is installed
if rpm -q salt-cloud &>/dev/null; then
SALT_CLOUD_INSTALLED=true
fi
# Check if salt-cloud is configured
if [[ -f /etc/salt/cloud.profiles.d/socloud.conf ]]; then
SALT_CLOUD_CONFIGURED=true
fi
echo "Removing yum versionlock for Salt."
echo ""
yum versionlock delete "salt"
@@ -27,6 +27,7 @@ sool9_{{host}}:
log_file: /opt/so/log/salt/minion
grains:
hypervisor_host: {{host ~ "_" ~ role}}
sosmodel: HVGUEST
preflight_cmds:
- |
{%- set hostnames = [MANAGERHOSTNAME] %}
+4
View File
@@ -890,12 +890,16 @@ soc:
suricata:
description: The template used when creating a new Suricata detection. [publicId] will be replaced with an unused Public Id.
multiline: True
forcedType: string
strelka:
description: The template used when creating a new Strelka detection.
multiline: True
forcedType: string
elastalert:
description: The template used when creating a new ElastAlert detection. [publicId] will be replaced with an unused Public Id.
multiline: True
forcedType: string
grid:
maxUploadSize:
description: The maximum number of bytes for an uploaded PCAP import file.
+16 -3
View File
@@ -1550,6 +1550,19 @@ reinstall_init() {
local service_retry_count=20
{
# Snapshot pre-reinstall salt state before any destructive step so a
# failed reinstall leaves a usable post-mortem in the setup log.
echo "=== pre-reinstall salt diagnostic $(date -Iseconds) ==="
systemctl status salt-master --no-pager 2>&1 | head -40 || true
systemctl status salt-minion --no-pager 2>&1 | head -40 || true
journalctl -u salt-master --no-pager --since "-10 minutes" 2>&1 | tail -80 || true
journalctl -u salt-minion --no-pager --since "-10 minutes" 2>&1 | tail -80 || true
ls -laR /etc/salt 2>&1 | head -60 || true
ls -la /var/cache/salt 2>&1 | head -40 || true
[[ -f /etc/salt/master.rpmnew ]] && diff -u /etc/salt/master /etc/salt/master.rpmnew 2>&1 | head -80 || true
[[ -f /etc/salt/minion.rpmnew ]] && diff -u /etc/salt/minion /etc/salt/minion.rpmnew 2>&1 | head -40 || true
echo "=== end diagnostic ==="
# remove all of root's cronjobs
crontab -r -u root
@@ -1580,14 +1593,14 @@ reinstall_init() {
kill -9 $pid
fail_setup
fi
sleep 5
((count++))
done
done
# Remove all salt configs
rm -rf /etc/salt/engines/* /etc/salt/grains /etc/salt/master /etc/salt/master.d/* /etc/salt/minion /etc/salt/minion.d/* /etc/salt/pki/* /etc/salt/proxy /etc/salt/proxy.d/* /var/cache/salt/
# Uninstall salt so configs and directories are removed and reinstall reconfigures directories
dnf -y remove salt
if command -v docker &> /dev/null; then
# Stop and remove all so-* containers so files can be changed with more safety
+11 -3
View File
@@ -724,10 +724,18 @@ if ! [[ -f $install_opt_file ]]; then
# Install salt
saltify
check_sos_appliance
# Wait for salt-master to be actually running and have its PKI
# ready after a fresh saltify. Without this, salt-key operations
# silently race the daemon and the key accept no-ops, which is
# what was causing reinstalls on 3.x to hang on state.show_top.
retry 30 2 "test -f /etc/salt/pki/master/master.pub" \
|| fail "salt-master did not initialize PKI after saltify"
check_salt_master_status \
|| fail "salt-master not accepting calls after saltify"
logCmd "salt-key -yd $MINION_ID"
sleep 2 # Debug RSA Key format errors
logCmd "salt-call state.show_top"
sleep 2 # Debug RSA Key format errors
wait_for_minion_key_pending "$MINION_ID" 30 2 \
|| fail "salt-minion never presented its key to salt-master"
logCmd "salt-key -ya $MINION_ID"
logCmd "salt-call saltutil.sync_all"
# we need to sync the runner and generate the soqemussh user keys so that first highstate after license created