Compare commits

..

2 Commits

Author SHA1 Message Date
Mike Reeves cab57edf7c Harden reinstall_init and add post-saltify readiness gate
- setup/so-functions: dump pre-reinstall salt state (systemctl /
  journalctl / ls /etc/salt / .rpmnew diff) to the setup log so a
  failed reinstall leaves a usable post-mortem; swap the manual
  rm -rf of /etc/salt/* for `dnf -y remove salt` so package configs
  get cleaned up properly.
- setup/so-setup: replace the `sleep 2 / state.show_top / sleep 2`
  dance after saltify with a readiness gate that waits for
  /etc/salt/pki/master/master.pub, runs check_salt_master_status,
  and then wait_for_minion_key_pending before salt-key -ya. Fixes
  reinstalls on 3.x timing out on "Unable to sign_in to master".
- salt/common/tools/sbin/so-common: add wait_for_minion_key_pending
  helper, polls `salt-key -l pre` until the minion appears.
2026-04-23 17:43:39 -04:00
Josh Patterson 8b2064f1c1 uninstall salt during reinstall_init 2026-04-22 16:40:47 -04:00
12 changed files with 177 additions and 171 deletions
-2
View File
@@ -33,8 +33,6 @@
'kratos',
'hydra',
'elasticfleet',
'elasticfleet.manager',
'elasticsearch.cluster',
'elastic-fleet-package-registry',
'utility'
] %}
+23
View File
@@ -162,6 +162,29 @@ check_salt_master_status() {
return 0
}
# Wait until $minion shows up in the salt master's unaccepted-keys list.
# Used after saltify on a reinstall to replace the old `sleep 2 / state.show_top /
# sleep 2` dance — the new minion's key takes longer to appear than 2s on
# salt 3006.x and the subsequent salt-key -ya needs something to accept.
# Returns 0 as soon as the key is pending, 1 after attempts*delay seconds.
wait_for_minion_key_pending() {
local minion="$1"
local attempts="${2:-30}"
local delay="${3:-2}"
local count=0
while ! salt-key -l pre --out=json 2>/dev/null \
| python3 -c "import json,sys; d=json.load(sys.stdin); sys.exit(0 if '$minion' in d.get('minions_pre', []) else 1)" 2>/dev/null; do
((count+=1))
if [[ $count -ge $attempts ]]; then
echo "Gave up waiting for $minion to appear in salt-master's pending keys"
return 1
fi
sleep "$delay"
done
echo "Minion $minion is pending acceptance after $((count * delay))s"
return 0
}
# this is only intended to be used to check the status of the minion from a salt master
check_salt_minion_status() {
local minion="$1"
+103 -4
View File
@@ -17,17 +17,65 @@ include:
- logstash.ssl
- elasticfleet.config
- elasticfleet.sostatus
{%- if GLOBALS.role != "so-fleet" %}
- elasticfleet.manager
{%- endif %}
{% if GLOBALS.role != "so-fleet" %}
{% if grains.role not in ['so-fleet'] %}
# Wait for Elasticsearch to be ready - no reason to try running Elastic Fleet server if ES is not ready
wait_for_elasticsearch_elasticfleet:
cmd.run:
- name: so-elasticsearch-wait
{% endif %}
# If enabled, automatically update Fleet Logstash Outputs
{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-import', 'so-eval', 'so-fleet'] %}
so-elastic-fleet-auto-configure-logstash-outputs:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-outputs-update
- retry:
attempts: 4
interval: 30
{# Separate from above in order to catch elasticfleet-logstash.crt changes and force update to fleet output policy #}
so-elastic-fleet-auto-configure-logstash-outputs-force:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-outputs-update --certs
- retry:
attempts: 4
interval: 30
- onchanges:
- x509: etc_elasticfleet_logstash_crt
- x509: elasticfleet_kafka_crt
{% endif %}
# If enabled, automatically update Fleet Server URLs & ES Connection
{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-fleet'] %}
so-elastic-fleet-auto-configure-server-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-urls-update
- retry:
attempts: 4
interval: 30
{% endif %}
# Automatically update Fleet Server Elasticsearch URLs & Agent Artifact URLs
{% if grains.role not in ['so-fleet'] %}
so-elastic-fleet-auto-configure-elasticsearch-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-es-url-update
- retry:
attempts: 4
interval: 30
so-elastic-fleet-auto-configure-artifact-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-artifacts-url-update
- retry:
attempts: 4
interval: 30
{% endif %}
# Sync Elastic Agent artifacts to Fleet Node
{% if grains.role in ['so-fleet'] %}
elasticagent_syncartifacts:
file.recurse:
- name: /nsm/elastic-fleet/artifacts/beats
@@ -101,6 +149,57 @@ so-elastic-fleet:
- x509: etc_elasticfleet_crt
{% endif %}
{% if GLOBALS.role != "so-fleet" %}
so-elastic-fleet-package-statefile:
file.managed:
- name: /opt/so/state/elastic_fleet_packages.txt
- contents: {{ELASTICFLEETMERGED.packages}}
so-elastic-fleet-package-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-package-upgrade
- retry:
attempts: 3
interval: 10
- onchanges:
- file: /opt/so/state/elastic_fleet_packages.txt
so-elastic-fleet-integrations:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-integration-policy-load
- retry:
attempts: 3
interval: 10
so-elastic-agent-grid-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-agent-grid-upgrade
- retry:
attempts: 12
interval: 5
so-elastic-fleet-integration-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-integration-upgrade
- retry:
attempts: 3
interval: 10
{# Optional integrations script doesn't need the retries like so-elastic-fleet-integration-upgrade which loads the default integrations #}
so-elastic-fleet-addon-integrations:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-optional-integrations-load
{% if ELASTICFLEETMERGED.config.defend_filters.enable_auto_configuration %}
so-elastic-defend-manage-filters-file-watch:
cmd.run:
- name: python3 /sbin/so-elastic-defend-manage-filters.py -c /opt/so/conf/elasticsearch/curl.config -d /opt/so/conf/elastic-fleet/defend-exclusions/disabled-filters.yaml -i /nsm/securityonion-resources/event_filters/ -i /opt/so/conf/elastic-fleet/defend-exclusions/rulesets/custom-filters/ &>> /opt/so/log/elasticfleet/elastic-defend-manage-filters.log
- onchanges:
- file: elasticdefendcustom
- file: elasticdefenddisabled
{% endif %}
{% endif %}
delete_so-elastic-fleet_so-status.disabled:
file.uncomment:
- name: /opt/so/conf/so-status/so-status.conf
-112
View File
@@ -1,112 +0,0 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls in allowed_states %}
{% from 'elasticfleet/map.jinja' import ELASTICFLEETMERGED %}
include:
- elasticfleet.config
# If enabled, automatically update Fleet Logstash Outputs
{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-import', 'so-eval'] %}
so-elastic-fleet-auto-configure-logstash-outputs:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-outputs-update
- retry:
attempts: 4
interval: 30
{# Separate from above in order to catch elasticfleet-logstash.crt changes and force update to fleet output policy #}
so-elastic-fleet-auto-configure-logstash-outputs-force:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-outputs-update --certs
- retry:
attempts: 4
interval: 30
- onchanges:
- x509: etc_elasticfleet_logstash_crt
- x509: elasticfleet_kafka_crt
{% endif %}
# If enabled, automatically update Fleet Server URLs & ES Connection
so-elastic-fleet-auto-configure-server-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-urls-update
- retry:
attempts: 4
interval: 30
# Automatically update Fleet Server Elasticsearch URLs & Agent Artifact URLs
so-elastic-fleet-auto-configure-elasticsearch-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-es-url-update
- retry:
attempts: 4
interval: 30
so-elastic-fleet-auto-configure-artifact-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-artifacts-url-update
- retry:
attempts: 4
interval: 30
so-elastic-fleet-package-statefile:
file.managed:
- name: /opt/so/state/elastic_fleet_packages.txt
- contents: {{ELASTICFLEETMERGED.packages}}
so-elastic-fleet-package-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-package-upgrade
- retry:
attempts: 3
interval: 10
- onchanges:
- file: /opt/so/state/elastic_fleet_packages.txt
so-elastic-fleet-integrations:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-integration-policy-load
- retry:
attempts: 3
interval: 10
so-elastic-agent-grid-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-agent-grid-upgrade
- retry:
attempts: 12
interval: 5
so-elastic-fleet-integration-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-integration-upgrade
- retry:
attempts: 3
interval: 10
{# Optional integrations script doesn't need the retries like so-elastic-fleet-integration-upgrade which loads the default integrations #}
so-elastic-fleet-addon-integrations:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-optional-integrations-load
{% if ELASTICFLEETMERGED.config.defend_filters.enable_auto_configuration %}
so-elastic-defend-manage-filters-file-watch:
cmd.run:
- name: python3 /sbin/so-elastic-defend-manage-filters.py -c /opt/so/conf/elasticsearch/curl.config -d /opt/so/conf/elastic-fleet/defend-exclusions/disabled-filters.yaml -i /nsm/securityonion-resources/event_filters/ -i /opt/so/conf/elastic-fleet/defend-exclusions/rulesets/custom-filters/ &>> /opt/so/log/elasticfleet/elastic-defend-manage-filters.log
- onchanges:
- file: elasticdefendcustom
- file: elasticdefenddisabled
{% endif %}
{% else %}
{{sls}}_state_not_allowed:
test.fail_without_changes:
- name: {{sls}}_state_not_allowed
{% endif %}
+1 -1
View File
@@ -4,7 +4,7 @@
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls in allowed_states %}
{% if sls.split('.')[0] in allowed_states %}
{% from 'vars/globals.map.jinja' import GLOBALS %}
{% from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
{% from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS, SO_MANAGED_INDICES %}
+7 -6
View File
@@ -17,7 +17,7 @@ include:
- elasticsearch.ssl
- elasticsearch.config
- elasticsearch.sostatus
{%- if GLOBALS.role != "so-searchode" %}
{%- if GLOBALS.role != 'so-searchode' %}
- elasticsearch.cluster
{%- endif%}
@@ -102,6 +102,11 @@ so-elasticsearch:
- cmd: auth_users_roles_inode
- cmd: auth_users_inode
delete_so-elasticsearch_so-status.disabled:
file.uncomment:
- name: /opt/so/conf/so-status/so-status.conf
- regex: ^so-elasticsearch$
wait_for_so-elasticsearch:
http.wait_for_successful_query:
- name: "https://localhost:9200/"
@@ -112,14 +117,10 @@ wait_for_so-elasticsearch:
- status: 200
- wait_for: 300
- request_interval: 15
- backend: requests
- require:
- docker_container: so-elasticsearch
delete_so-elasticsearch_so-status.disabled:
file.uncomment:
- name: /opt/so/conf/so-status/so-status.conf
- regex: ^so-elasticsearch$
{% else %}
{{sls}}_state_not_allowed:
@@ -103,13 +103,11 @@ load_component_templates() {
local pattern="${ELASTICSEARCH_TEMPLATES_DIR}/component/$2"
local append_mappings="${3:-"false"}"
# current state of nullglob shell option
shopt -q nullglob && nullglob_set=1 || nullglob_set=0
shopt -s nullglob
echo -e "\nLoading $printed_name component templates...\n"
if ! compgen -G "${pattern}/*.json" > /dev/null; then
echo "No $printed_name component templates found in ${pattern}, skipping."
return
fi
for component in "$pattern"/*.json; do
tmpl_name=$(basename "${component%.json}")
@@ -123,6 +121,11 @@ load_component_templates() {
SO_LOAD_FAILURES_NAMES+=("$component")
fi
done
# restore nullglob shell option if needed
if [[ $nullglob_set -eq 1 ]]; then
shopt -u nullglob
fi
}
check_elasticsearch_responsive() {
@@ -133,32 +136,7 @@ check_elasticsearch_responsive() {
fail "Elasticsearch is not responding. Please review Elasticsearch logs /opt/so/log/elasticsearch/securityonion.log for more details. Additionally, consider running so-elasticsearch-troubleshoot."
}
index_templates_exist() {
local templates_dir="$1"
if [[ ! -d "$templates_dir" ]]; then
return 1
fi
compgen -G "${templates_dir}/*.json" > /dev/null
}
should_load_addon_templates() {
if [[ "$IS_HEAVYNODE" == "true" ]]; then
return 1
fi
# Skip statefile checks when forcing template load
if [[ "$FORCE" != "true" ]]; then
if [[ ! -f "$SO_STATEFILE_SUCCESS" || -f "$ADDON_STATEFILE_SUCCESS" ]]; then
return 1
fi
fi
index_templates_exist "$ADDON_TEMPLATES_DIR"
}
if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_exist "$SO_TEMPLATES_DIR"; then
if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]]; then
check_elasticsearch_responsive
if [[ "$IS_HEAVYNODE" == "false" ]]; then
@@ -223,14 +201,13 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
fail "Failed to load all Security Onion core templates successfully."
fi
fi
elif ! index_templates_exist "$SO_TEMPLATES_DIR"; then
echo "No Security Onion core index templates found in ${SO_TEMPLATES_DIR}, skipping."
elif [[ -f "$SO_STATEFILE_SUCCESS" ]]; then
else
echo "Security Onion core templates already loaded"
fi
# Start loading addon templates
if should_load_addon_templates; then
if [[ (-d "$ADDON_TEMPLATES_DIR" && -f "$SO_STATEFILE_SUCCESS" && "$IS_HEAVYNODE" == "false" && ! -f "$ADDON_STATEFILE_SUCCESS") || (-d "$ADDON_TEMPLATES_DIR" && "$IS_HEAVYNODE" == "false" && "$FORCE" == "true") ]]; then
check_elasticsearch_responsive
+1 -1
View File
@@ -22,7 +22,7 @@ kibana:
- default
- file
migrations:
discardCorruptObjects: "9.3.3"
discardCorruptObjects: "8.18.8"
telemetry:
enabled: False
xpack:
+1 -1
View File
@@ -3,8 +3,8 @@ kratos:
description: Enables or disables the Kratos authentication system. WARNING - Disabling this process will cause the grid to malfunction. Re-enabling this setting will require manual effort via SSH.
forcedType: bool
advanced: True
readonly: True
helpLink: kratos
oidc:
enabled:
description: Set to True to enable OIDC / Single Sign-On (SSO) to SOC. Requires a valid Security Onion license key.
-1
View File
@@ -3,7 +3,6 @@ soc:
description: Enables or disables SOC. WARNING - Disabling this setting is unsupported and will cause the grid to malfunction. Re-enabling this setting is a manual effort via SSH.
forcedType: bool
advanced: True
readonly: True
telemetryEnabled:
title: SOC Telemetry
description: When this setting is enabled and the grid is not in airgap mode, SOC will provide feature usage data to the Security Onion development team via Google Analytics. This data helps Security Onion developers determine which product features are being used and can also provide insight into improving the user interface. When changing this setting, wait for the grid to fully synchronize and then perform a hard browser refresh on SOC, to force the browser cache to update and reflect the new setting.
+16 -3
View File
@@ -1550,6 +1550,19 @@ reinstall_init() {
local service_retry_count=20
{
# Snapshot pre-reinstall salt state before any destructive step so a
# failed reinstall leaves a usable post-mortem in the setup log.
echo "=== pre-reinstall salt diagnostic $(date -Iseconds) ==="
systemctl status salt-master --no-pager 2>&1 | head -40 || true
systemctl status salt-minion --no-pager 2>&1 | head -40 || true
journalctl -u salt-master --no-pager --since "-10 minutes" 2>&1 | tail -80 || true
journalctl -u salt-minion --no-pager --since "-10 minutes" 2>&1 | tail -80 || true
ls -laR /etc/salt 2>&1 | head -60 || true
ls -la /var/cache/salt 2>&1 | head -40 || true
[[ -f /etc/salt/master.rpmnew ]] && diff -u /etc/salt/master /etc/salt/master.rpmnew 2>&1 | head -80 || true
[[ -f /etc/salt/minion.rpmnew ]] && diff -u /etc/salt/minion /etc/salt/minion.rpmnew 2>&1 | head -40 || true
echo "=== end diagnostic ==="
# remove all of root's cronjobs
crontab -r -u root
@@ -1580,14 +1593,14 @@ reinstall_init() {
kill -9 $pid
fail_setup
fi
sleep 5
((count++))
done
done
# Remove all salt configs
rm -rf /etc/salt/engines/* /etc/salt/grains /etc/salt/master /etc/salt/master.d/* /etc/salt/minion /etc/salt/minion.d/* /etc/salt/pki/* /etc/salt/proxy /etc/salt/proxy.d/* /var/cache/salt/
# Uninstall salt so configs and directories are removed and reinstall reconfigures directories
dnf -y remove salt
if command -v docker &> /dev/null; then
# Stop and remove all so-* containers so files can be changed with more safety
+12 -4
View File
@@ -219,7 +219,7 @@ if [ -n "$test_profile" ]; then
WEBUSER=onionuser@somewhere.invalid
WEBPASSWD1=0n10nus3r
WEBPASSWD2=0n10nus3r
NODE_DESCRIPTION="${HOSTNAME} - ${install_type} - ${MSRVIP_OFFSET}"
NODE_DESCRIPTION="${HOSTNAME} - ${install_type} - ${MAINIP}"
update_sudoers_for_testing
fi
@@ -724,10 +724,18 @@ if ! [[ -f $install_opt_file ]]; then
# Install salt
saltify
check_sos_appliance
# Wait for salt-master to be actually running and have its PKI
# ready after a fresh saltify. Without this, salt-key operations
# silently race the daemon and the key accept no-ops, which is
# what was causing reinstalls on 3.x to hang on state.show_top.
retry 30 2 "test -f /etc/salt/pki/master/master.pub" \
|| fail "salt-master did not initialize PKI after saltify"
check_salt_master_status \
|| fail "salt-master not accepting calls after saltify"
logCmd "salt-key -yd $MINION_ID"
sleep 2 # Debug RSA Key format errors
logCmd "salt-call state.show_top"
sleep 2 # Debug RSA Key format errors
wait_for_minion_key_pending "$MINION_ID" 30 2 \
|| fail "salt-minion never presented its key to salt-master"
logCmd "salt-key -ya $MINION_ID"
logCmd "salt-call saltutil.sync_all"
# we need to sync the runner and generate the soqemussh user keys so that first highstate after license created