Merge remote-tracking branch 'origin/3/dev' into jertel/wip

2026-06-22 10:18:09 +02:00 · 2026-05-27 09:18:33 -04:00
parent e45ad45d73 ffd34d4e0e
commit 16055c4d88
8 changed files with 246 additions and 24 deletions
@@ -166,6 +166,7 @@ if [[ $EXCLUDE_FALSE_POSITIVE_ERRORS == 'Y' ]]; then
    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|upgrading composable template" # false positive (elasticsearch composable template names contain 'error')
    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Error while parsing document for index \[.ds-logs-kratos-so-.*object mapping for \[file\]" # false positive (mapping error occuring BEFORE kratos index has rolled over in 2.4.210)
    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|No such container"            # false positive (telegraf trying to run stats on an old container)
+    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|passwords do not match"       # false positive (automated hydra test)
 fi

 if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then
@@ -26,7 +26,9 @@ include:
 wait_for_elasticsearch_elasticfleet:
  cmd.run:
    - name: so-elasticsearch-wait
+{% endif %}

+{% if GLOBALS.role == "so-fleet" %}
 # Sync Elastic Agent artifacts to Fleet Node
 elasticagent_syncartifacts:
  file.recurse:
@@ -971,6 +971,9 @@ verify_es_version_compatibility() {
    local is_active_intermediate_upgrade=1
    # supported upgrade paths for SO-ES versions
    declare -A es_upgrade_map=(
+        # key: an Elasticsearch version; value: space-separated versions it may upgrade to directly
+        ["8.18.4"]="8.18.6 8.18.8 9.0.8"
+        ["8.18.6"]="8.18.8 9.0.8"
+        ["8.18.8"]="9.0.8"
        ["9.0.8"]="9.3.3"
    )

@@ -994,6 +997,171 @@ verify_es_version_compatibility() {
        exit 160
    fi

+    # Build a space-separated list of versions considered compatible with the
+    # target: the target version itself, plus every key of es_upgrade_map whose
+    # value lists the target version. es_version_can_upgrade_to_target reads it.
+    compatible_es_versions="$target_es_version"
+    for current_version in "${!es_upgrade_map[@]}"; do
+        # The quoted right-hand side makes =~ a literal substring match; the
+        # surrounding spaces restrict the match to a whole list entry.
+        # shellcheck disable=SC2076
+        if [[ " ${es_upgrade_map[$current_version]} " =~ " $target_es_version " ]]; then
+            compatible_es_versions+=" $current_version"
+        fi
+    done
+
+    # Check if the given ES version can directly upgrade to the target ES version. Used to assist with catching lagging nodes during the upgrade process
+    es_version_can_upgrade_to_target() {
+        # $1 - Elasticsearch version reported by a node.
+        # Returns 0 when that version is an entry of the space-separated
+        # $compatible_es_versions list, 1 otherwise (including an empty $1).
+        local candidate="$1"
+
+        # An empty version can never be compatible.
+        [[ -n "$candidate" ]] || return 1
+
+        # Literal, whole-entry match; the test's status is the function's status.
+        # shellcheck disable=SC2076
+        [[ " $compatible_es_versions " =~ " $candidate " ]]
+    }
+
+    # Gather Elasticsearch cluster version info and verify that each node in the cluster is running a version compatible with the target ES version.
+    #
+    # Reads:   MINIONID, target_es_version, es_version_can_upgrade_to_target
+    # Writes:  SEARCHNODE_ES_VERSIONS (raw output of the last version query)
+    # Returns: 0 once every reported node is compatible and every expected node
+    #          has reported; 1 if searchnode discovery fails three times or the
+    #          check still fails after $retries attempts.
+    verify_searchnodes_es_target_compatibility() {
+        local retries=20
+        local retry_count=0
+        local delay=180
+        local expected_es_nodes searchnode_minions attempt
+        local searchnode_discovery_success=false
+        SEARCHNODE_ES_VERSIONS=""
+
+        # List accepted salt minions whose id ends in "searchnode". pipefail is set
+        # inside the command substitution so a salt-key failure is not hidden by jq.
+        for attempt in {1..3}; do
+            if searchnode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("searchnode"))'); then
+                searchnode_discovery_success=true
+                break
+            fi
+
+            echo "Failed to retrieve grid searchnodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3."
+            sleep 30
+        done
+
+        if [[ "$searchnode_discovery_success" != "true" ]]; then
+            echo "Failed to retrieve grid searchnodes via salt-key."
+            return 1
+        fi
+
+        # Always add node running soup to expected es nodes
+        # (MINIONID with its shortest trailing "_<suffix>" removed).
+        expected_es_nodes="${MINIONID%_*}"
+        # Expected node names are the minion ids with the "_searchnode" suffix removed.
+        while IFS= read -r searchnode_minion; do
+            [[ -z "$searchnode_minion" ]] && continue
+            expected_es_nodes+=$'\n'"${searchnode_minion%_searchnode}"
+        done <<< "$searchnode_minions"
+
+        while [[ $retry_count -lt $retries ]]; do
+            # NOTE(review): 2>&1 folds stderr into the captured text; if
+            # so-elasticsearch-query writes to stderr on a successful call, the jq
+            # parsing below would fail and the attempt would be retried - confirm.
+            SEARCHNODE_ES_VERSIONS=$(so-elasticsearch-query _nodes/_all/version --retry 5 --retry-delay 10 --fail 2>&1)
+            local exit_status=$?
+
+            if [[ $exit_status -ne 0 ]]; then
+                echo "Failed to retrieve Elasticsearch versions from searchnodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+                ((retry_count++))
+                sleep $delay
+                continue
+            fi
+
+            # Pass 1: every node present in the response must run a compatible version.
+            local all_searchnodes_compatible=true
+            while IFS=$'\t' read -r node current_version; do
+                [[ -z "$node" ]] && continue
+                if ! es_version_can_upgrade_to_target "$current_version"; then
+                    echo "Searchnode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version."
+                    all_searchnodes_compatible=false
+                fi
+            done < <(echo "$SEARCHNODE_ES_VERSIONS" | jq -r '.nodes | to_entries[] | [.value.name, .value.version] | @tsv')
+
+            # Pass 2: every expected node must appear by name in the response.
+            while IFS= read -r expected_es_node; do
+                [[ -z "$expected_es_node" ]] && continue
+                if ! echo "$SEARCHNODE_ES_VERSIONS" | jq -e --arg node "$expected_es_node" '.nodes | to_entries | any(.value.name == $node)' > /dev/null; then
+                    echo "Searchnode $expected_es_node did not report an Elasticsearch version. It may be offline or still upgrading."
+                    all_searchnodes_compatible=false
+                fi
+            done <<< "$expected_es_nodes"
+
+            if [[ "$all_searchnodes_compatible" == true ]]; then
+                echo "All Searchnodes are upgradable to Elasticsearch $target_es_version."
+                return 0
+            fi
+
+            # NOTE(review): this message and sleep also run after the final attempt,
+            # when no recheck follows.
+            echo "One or more Searchnodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+            ((retry_count++))
+            sleep $delay
+        done
+
+        return 1
+    }
+
+    # Gather heavynode version info and verify that each node is running a version compatible with the target ES version.
+    #
+    # Reads:   target_es_version, es_version_can_upgrade_to_target
+    # Writes:  HEAVYNODE_ES_VERSIONS (raw JSON output of the last salt query)
+    # Returns: 0 when there are no heavynodes, or once every heavynode has
+    #          reported a compatible version; 1 if heavynode discovery fails
+    #          three times or the check still fails after $retries attempts.
+    verify_heavynodes_es_target_compatibility() {
+        local heavynode_minions attempt
+        local retries=20
+        local retry_count=0
+        local delay=180
+        local heavynode_discovery_success=false
+        HEAVYNODE_ES_VERSIONS=""
+
+        # List accepted salt minions whose id ends in "heavynode". pipefail is set
+        # inside the command substitution so a salt-key failure is not hidden by jq.
+        for attempt in {1..3}; do
+            if heavynode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("heavynode"))'); then
+                heavynode_discovery_success=true
+                break
+            fi
+
+            echo "Failed to retrieve grid heavynodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3."
+            sleep 30
+        done
+
+        if [[ "$heavynode_discovery_success" != "true" ]]; then
+            echo "Failed to retrieve grid heavynodes via salt-key."
+            return 1
+        fi
+
+        if [[ -z "$heavynode_minions" ]]; then
+            echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check."
+            return 0
+        fi
+
+        while [[ $retry_count -lt $retries ]]; do
+            # Each heavynode reports the version number of its own Elasticsearch instance.
+            HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'set -o pipefail; so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -er ".version.number"' shell=/bin/bash --out=json 2> /dev/null)
+            local exit_status=$?
+
+            if [[ $exit_status -ne 0 ]]; then
+                echo "Failed to retrieve Elasticsearch version from one or more heavynodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+                ((retry_count++))
+                sleep $delay
+                continue
+            fi
+
+            # Pass 1: every minion that answered must run a compatible version.
+            local all_heavynodes_compatible=true
+            while IFS=$'\t' read -r node current_version; do
+                [[ -z "$node" ]] && continue
+                if ! es_version_can_upgrade_to_target "$current_version"; then
+                    echo "Heavynode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version."
+                    all_heavynodes_compatible=false
+                fi
+            done < <(echo "$HEAVYNODE_ES_VERSIONS" | jq -r 'to_entries[] | [.key, .value] | @tsv')
+
+            # Pass 2: every accepted heavynode minion must be present in the output.
+            # Without --static, salt may print one JSON document per minion, and
+            # jq -e takes its status from the last output only. Slurp all documents
+            # into one array so the minion is found whichever document holds it.
+            while IFS= read -r heavynode_minion; do
+                [[ -z "$heavynode_minion" ]] && continue
+                if ! echo "$HEAVYNODE_ES_VERSIONS" | jq -e -s --arg minion "$heavynode_minion" 'any(.[]; type == "object" and has($minion))' > /dev/null; then
+                    echo "Heavynode $heavynode_minion did not report an Elasticsearch version. It may be offline or still upgrading."
+                    all_heavynodes_compatible=false
+                fi
+            done <<< "$heavynode_minions"
+
+            if [[ "$all_heavynodes_compatible" == true ]]; then
+                echo -e "\nAll heavynodes can upgrade to Elasticsearch $target_es_version."
+                return 0
+            fi
+
+            echo "One or more heavynodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+            ((retry_count++))
+            sleep $delay
+        done
+
+        return 1
+    }
+
+    if [[ ! -f "$es_verification_script" ]]; then
+        create_intermediate_upgrade_verification_script "$es_verification_script"
+    fi
+
    for statefile in "${es_required_version_statefile_base}"-*; do
        [[ -f $statefile ]] || continue

@@ -1012,10 +1180,6 @@ verify_es_version_compatibility() {
            continue
        fi

-        if [[ ! -f "$es_verification_script" ]]; then
-            create_intermediate_upgrade_verification_script "$es_verification_script"
-        fi
-
        echo -e "\n##############################################################################################################################\n"
        echo "A previously required intermediate Elasticsearch upgrade was detected. Verifying that all Searchnodes/Heavynodes have successfully upgraded Elasticsearch to $es_required_version_statefile_value before proceeding with soup to avoid potential data loss! This command can take up to an hour to complete."
        if ! timeout --foreground 4000 bash "$es_verification_script" "$es_required_version_statefile_value" "$statefile"; then
@@ -1037,6 +1201,26 @@ verify_es_version_compatibility() {

    # shellcheck disable=SC2076 # Do not want a regex here eg usage " 8.18.8 9.0.8 " =~ " 9.0.8 "
    if [[ " ${es_upgrade_map[$es_version]} " =~ " $target_es_version " || "$es_version" == "$target_es_version" ]]; then
+        if ! verify_searchnodes_es_target_compatibility || ! verify_heavynodes_es_target_compatibility; then
+            echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
+
+            echo "One or more Searchnode(s)/Heavynode(s) cannot upgrade directly to Elasticsearch $target_es_version. This can happen with soups that include Elasticsearch upgrades being run in quick succession. Typically, this will resolve itself as the grid synchronizes. Please allow time for all Searchnodes/Heavynodes to have upgraded Elasticsearch to a compatible version with $target_es_version before running soup again to avoid potential data loss!"
+
+            if [[ -n "$HEAVYNODE_ES_VERSIONS" ]]; then
+                echo "Current heavynode Elasticsearch versions:"
+                echo "$HEAVYNODE_ES_VERSIONS" | jq '.'
+            fi
+
+            if [[ -n "$SEARCHNODE_ES_VERSIONS" ]]; then
+                echo "Current searchnode Elasticsearch versions:"
+                echo "$SEARCHNODE_ES_VERSIONS" | jq '.nodes | to_entries | map({(.value.name): .value.version}) | sort | add'
+            fi
+
+            echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
+
+            exit 161
+        fi
+
        # supported upgrade
        return 0
    else
@@ -1394,7 +1578,7 @@ main() {
  echo "Verifying we have the latest soup script."
  verify_latest_update_script

-  echo "Verifying Elasticsearch version compatibility before upgrading."
+  echo "Verifying Elasticsearch version compatibility across the grid before upgrading."
  verify_es_version_compatibility

  echo "Let's see if we need to update Security Onion."
@@ -7,15 +7,29 @@

 . /usr/sbin/so-common

+# Without pipefail, a pipeline's exit status is gzip's. A failed pg_dumpall would
+# otherwise be masked by a successful gzip, silently producing a valid .gz that
+# holds a truncated dump.
+set -o pipefail
+
 # Backups contain role password hashes and full chat data; keep them 0600.
 umask 0077

 TODAY=$(date '+%Y_%m_%d')
 BACKUPDIR=/nsm/backup
 BACKUPFILE="$BACKUPDIR/so-postgres-backup-$TODAY.sql.gz"
+TMPFILE="$BACKUPFILE.tmp"
 MAXBACKUPS=7
+LOGFILE=/opt/so/log/postgres/backup.log

-mkdir -p $BACKUPDIR
+log() {
+  # Append one line to $LOGFILE: a "YYYY-MM-DD HH:MM:SS" timestamp, a space,
+  # then all arguments joined into a single string.
+  local stamp
+  stamp=$(date '+%Y-%m-%d %H:%M:%S')
+  printf '%s %s\n' "$stamp" "$*" >> "$LOGFILE"
+}
+
+mkdir -p "$BACKUPDIR"
+
+# Remove any temp files left behind by a previously crashed run
+rm -f "$BACKUPDIR"/so-postgres-backup-*.sql.gz.tmp

 # Skip if already backed up today
 if [ -f "$BACKUPFILE" ]; then
@@ -27,13 +41,33 @@ if ! docker ps --format '{{.Names}}' | grep -q '^so-postgres$'; then
  exit 0
 fi

-# Dump all databases and roles, compress
-docker exec so-postgres pg_dumpall -U postgres | gzip > "$BACKUPFILE"
+# Always clean up the temp file on exit; the success path clears this trap
+# after the atomic rename so the finished backup is not deleted.
+trap 'rm -f "$TMPFILE"' EXIT

-# Retention cleanup
-NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l)
+# Dump all databases and roles, compress. Write to a temp file so the final
+# filename only ever appears for a complete, verified backup.
+if ! docker exec so-postgres pg_dumpall -U postgres | gzip > "$TMPFILE"; then
+  log "ERROR: pg_dumpall/gzip failed; backup aborted"
+  exit 1
+fi
+
+# Verify the compressed stream is intact before publishing it
+if ! gzip -t "$TMPFILE"; then
+  log "ERROR: backup failed gzip integrity check; backup aborted"
+  exit 1
+fi
+
+# Atomically publish the verified backup. If the rename fails, exit while the
+# EXIT trap is still armed so the temp file is removed, no success is logged,
+# and retention cleanup does not run.
+if ! mv "$TMPFILE" "$BACKUPFILE"; then
+  log "ERROR: could not move $TMPFILE to $BACKUPFILE; backup aborted"
+  exit 1
+fi
+trap - EXIT
+log "OK: wrote $BACKUPFILE"
+
+# Retention cleanup (only reached after a successful backup). The glob is
+# restricted to finished backups so an in-progress .tmp can never be counted.
+NUMBACKUPS=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" | wc -l)
 while [ "$NUMBACKUPS" -gt "$MAXBACKUPS" ]; do
-  OLDEST=$(find $BACKUPDIR -type f -name "so-postgres-backup*" -printf '%T+ %p\n' | sort | head -n 1 | awk -F" " '{print $2}')
+  OLDEST=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" -printf '%T+ %p\n' | sort | head -n 1 | awk -F" " '{print $2}')
  rm -f "$OLDEST"
-  NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l)
+  NUMBACKUPS=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" | wc -l)
 done