Merge pull request #9603 from Security-Onion-Solutions/kilo

Handle setup failures
Jason Ertel
2023-01-19 15:49:41 -05:00
committed by GitHub
4 changed files with 121 additions and 19 deletions

View File

@@ -225,31 +225,17 @@ init_monitor() {
 }
 
 is_manager_node() {
   # Check to see if this is a manager node
-  role=$(lookup_role)
-  is_single_node_grid && return 0
-  [ $role == 'manager' ] && return 0
-  [ $role == 'managersearch' ] && return 0
-  [ $role == 'helix' ] && return 0
-  return 1
+  grep "role: so-" /etc/salt/grains | grep -E "manager|eval|managersearch|standalone|import" &> /dev/null
 }
 
 is_sensor_node() {
   # Check to see if this is a sensor (forward) node
-  role=$(lookup_role)
-  is_single_node_grid && return 0
-  [ $role == 'sensor' ] && return 0
-  [ $role == 'heavynode' ] && return 0
-  [ $role == 'helix' ] && return 0
-  return 1
+  grep "role: so-" /etc/salt/grains | grep -E "sensor|heavynode|helix" &> /dev/null
 }
 
 is_single_node_grid() {
-  role=$(lookup_role)
-  [ $role == 'eval' ] && return 0
-  [ $role == 'standalone' ] && return 0
-  [ $role == 'import' ] && return 0
-  return 1
+  grep "role: so-" /etc/salt/grains | grep -E "eval|standalone|import" &> /dev/null
 }
 
 lookup_bond_interfaces() {
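
The rewritten checks read the minion's role directly from the Salt grains file rather than going through lookup_role. For reference, a minimal sketch of the grains entry these greps match (the exact file contents vary by install, so treat this excerpt as an assumption):

# Hypothetical /etc/salt/grains excerpt
role: so-managersearch

The first grep narrows the file to the "role: so-" line and the second matches the role name against the allowed set; note that is_manager_node now also matches the single-node roles (eval, standalone, import), folding the old is_single_node_grid short-circuit into its pattern.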

View File

@@ -1,5 +1,8 @@
 # this script is used to delete the default Grafana dashboard folders that existed prior to Grafana dashboard and Salt management changes in 2.3.70
 
+# Exit if an error occurs. The next highstate will retry.
+set -e
+
 folders=$(curl -X GET http://admin:{{salt['pillar.get']('secrets:grafana_admin')}}@localhost:3000/api/folders | jq -r '.[] | @base64')
 delfolder=("Manager" "Manager Search" "Sensor Nodes" "Search Nodes" "Standalone" "Eval Mode")
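
This hunk ends before the script's deletion logic, so the loop itself is not shown. As a minimal sketch only, assuming Grafana's documented DELETE /api/folders/:uid endpoint and the variables above, a loop consuming the base64-encoded folder list could look like:

for folder in $folders; do
  # Decode one folder object and extract its title and uid
  title=$(echo "$folder" | base64 --decode | jq -r '.title')
  uid=$(echo "$folder" | base64 --decode | jq -r '.uid')
  for del in "${delfolder[@]}"; do
    if [ "$title" == "$del" ]; then
      # Hypothetical deletion call; the real script's body is not part of this hunk
      curl -X DELETE "http://admin:{{salt['pillar.get']('secrets:grafana_admin')}}@localhost:3000/api/folders/$uid"
    fi
  done
done

With set -e in place, a failed curl aborts the script, and per the new comment the next highstate retries it.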

View File

@@ -585,7 +585,7 @@ if ! [[ -f $install_opt_file ]]; then
   fi
   checkin_at_boot
   set_initial_firewall_access
-  whiptail_setup_complete
+  ./so-verify $setup_type
 else
   touch /root/accept_changes
   mkdir -p /opt/so
@@ -608,7 +608,7 @@ if ! [[ -f $install_opt_file ]]; then
   configure_minion "$minion_type"
   drop_install_options
   checkin_at_boot
-  whiptail_setup_complete
+  ./so-verify $setup_type
 fi
 
 # Need to make sure the latest install is located on the web server of the manager to check the versions and download the code if required

setup/so-verify · 113 lines · Executable file
View File

@@ -0,0 +1,113 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
cd "$(dirname "$0")" || exit 255
source ../salt/common/tools/sbin/so-common
source ./so-functions
source ./so-whiptail
source ./so-variables
setup_type=$1
setup_in_progress() {
  ps -ef | grep so-setup | grep -v grep &> /dev/null
}
using_iso() {
  if [ "$setup_type" == "iso" ]; then
    return 0
  fi
  return 1
}
whipit() {
  if [[ $1 -eq 0 ]]; then
    whiptail_setup_complete
  else
    whiptail_setup_failed
  fi
}
# Check entire setup log for errors or unexpected salt states
log_has_errors() {
  # Ignore salt master cached public key and minion failed to auth because this is a test
  # to see if the salt key had already been accepted.
  # Ignore failed to connect to ::1 since we have most curls wrapped in a retry.
  # Ignore perl-Error- since that is the name of a Perl package SO installs.
  # Ignore Failed: 0 since that is the salt state output, and we detect state failures
  # via Result: False already.
  grep -E "FAILED|Failed|failed|ERROR|Error|Result: False" "$setup_log" | \
    grep -vE "The Salt Master has cached the public key for this node" | \
    grep -vE "Minion failed to authenticate with the master" | \
    grep -vE "Failed to connect to ::1" | \
    grep -vE "perl-Error-" | \
    grep -vE "Failed:\s*?[0-9]+" | \
    grep -vE "Status .* was not found" | \
    grep -vE "Uncaught exception, closing connection" | \
    grep -vE "Exception in callback None" | \
    grep -vE "deprecation: ERROR" | \
    grep -vE "code: 100" | \
    grep -vE "Running scope as unit" &> "$error_log"
  if [[ $? -eq 0 ]]; then
    return 0
  fi
  return 1
}
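# Illustrative examples of how the filters behave (assumed log lines, not from a real run):
#   "local: Result: False"                          -> survives all filters, so setup is flagged
#   "Summary for local ... Failed:    0"            -> dropped by the Failed:\s*?[0-9]+ filter
#   "The Salt Master has cached the public key..."  -> dropped as an expected key-exchange message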
# For ISO installs, we know nothing else can be running on this server, so there should be
# nothing in any mail spool dir.
cron_error_in_mail_spool() {
  # find exits 0 whenever the directory is readable, so test for non-empty output instead
  if [[ -n $(find /var/spool/mail/ -type f -size +0 2>/dev/null) ]]; then
    return 0
  fi
  return 1
}
# so-status must return a 0 exit code, indicating all containers are up and healthy. Will retry
# for a limited time (120 retries x 10s = up to 20 minutes) before giving up.
status_failed() {
  max_retries=120
  wait_secs=10
  retry_attempts=0
  while ! so-status -q; do
    if [[ $retry_attempts -eq $max_retries ]]; then
      return 0
    fi
    retry_attempts=$((retry_attempts+1))
    echo "INFO: so-status returned non-zero exit code; will retry in $wait_secs seconds ($retry_attempts/$max_retries)"
    sleep $wait_secs
  done
  return 1
}
main() {
  exit_code=0

  if log_has_errors; then
    echo "WARNING: Errors detected during setup"
    exit_code=1
  elif using_iso && cron_error_in_mail_spool; then
    echo "WARNING: Unexpected cron job output in mail spool"
    exit_code=1
  elif is_manager_node && status_failed; then
    echo "WARNING: Containers are not in a healthy state"
    exit_code=1
  else
    echo "Successfully completed setup!"
  fi

  setup_in_progress && whipit $exit_code

  exit $exit_code
}

main
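
With so-setup now calling ./so-verify $setup_type in place of whiptail_setup_complete, the completion dialog reflects the verified outcome instead of unconditionally reporting success. A minimal sketch of running the script by hand after an install, assuming a checkout at /root/securityonion (the path is an assumption; the "iso" argument enables the mail-spool check):

cd /root/securityonion/setup   # hypothetical checkout location
./so-verify iso
echo $?   # 0 = setup verified; 1 = problems found in $setup_log, the mail spool, or so-status

On failure, the warning printed by main identifies which check tripped, and the filtered log matches are left in $error_log for review.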