From 6c8997b28a7b1998a553dc3c87c9168a0a8aab06 Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Tue, 19 May 2026 22:27:31 -0500 Subject: [PATCH 1/4] verify all heavynodes and all searchnodes are at compatible ES version before attempting an elasticsearch upgrade --- salt/manager/tools/sbin/soup | 139 +++++++++++++++++++++++++++++++++-- 1 file changed, 134 insertions(+), 5 deletions(-) diff --git a/salt/manager/tools/sbin/soup b/salt/manager/tools/sbin/soup index bd3048019..e6a14607e 100755 --- a/salt/manager/tools/sbin/soup +++ b/salt/manager/tools/sbin/soup @@ -971,6 +971,9 @@ verify_es_version_compatibility() { local is_active_intermediate_upgrade=1 # supported upgrade paths for SO-ES versions declare -A es_upgrade_map=( + ["8.18.4"]="8.18.6 8.18.8 9.0.8" + ["8.18.6"]="8.18.8 9.0.8" + ["8.18.8"]="9.0.8" ["9.0.8"]="9.3.3" ) @@ -994,6 +997,116 @@ verify_es_version_compatibility() { exit 160 fi + compatible_es_versions="$target_es_version" + for current_version in "${!es_upgrade_map[@]}"; do + # shellcheck disable=SC2076 + if [[ " ${es_upgrade_map[$current_version]} " =~ " $target_es_version " ]]; then + compatible_es_versions+=" $current_version" + fi + done + + # Check if the given ES version can directly upgrade to the target ES version. Used to assist with catching lagging nodes during the upgrade process + es_version_can_upgrade_to_target() { + local current_version="$1" + # shellcheck disable=SC2076 + if [[ -n "$current_version" && " $compatible_es_versions " =~ " $current_version " ]]; then + return 0 + fi + + return 1 + } + + # Gather Elasticsearch cluster version info and verify that each node in the cluster is running a version compatible with the target ES version. 
+ verify_searchnodes_es_target_compatibility() { + local retries=20 + local retry_count=0 + local delay=180 + SEARCHNODE_ES_VERSIONS="" + + while [[ $retry_count -lt $retries ]]; do + SEARCHNODE_ES_VERSIONS=$(so-elasticsearch-query _nodes/_all/version --retry 5 --retry-delay 10 --fail 2>&1) + local exit_status=$? + + if [[ $exit_status -ne 0 ]]; then + echo "Failed to retrieve Elasticsearch versions from searchnodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + continue + fi + + local all_searchnodes_compatible=true + while IFS=$'\t' read -r node current_version; do + [[ -z "$node" ]] && continue + if ! es_version_can_upgrade_to_target "$current_version"; then + echo "Searchnode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version." + all_searchnodes_compatible=false + fi + done < <(echo "$SEARCHNODE_ES_VERSIONS" | jq -r '.nodes | to_entries[] | [.value.name, .value.version] | @tsv') + + if [[ "$all_searchnodes_compatible" == true ]]; then + echo "All Searchnodes are upgradable to Elasticsearch $target_es_version." + return 0 + fi + + echo "One or more Searchnodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + done + + return 1 + } + + # Gather heavynode version info and verify that each node is running a version compatible with the target ES version. + verify_heavynodes_es_target_compatibility() { + if ! salt-key -l accepted | grep -q 'heavynode$'; then + echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check." + return 0 + fi + + echo -e "\nOne or more heavynodes detected. Verifying each is running an Elasticsearch version that is compatible with $target_es_version." 
+ + local retries=20 + local retry_count=0 + local delay=180 + HEAVYNODE_ES_VERSIONS="" + + while [[ $retry_count -lt $retries ]]; do + HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -r ".version.number"' shell=/bin/bash --out=json 2> /dev/null) + local exit_status=$? + + if [[ $exit_status -ne 0 ]]; then + echo "Failed to retrieve Elasticsearch version from one or more heavynodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + continue + fi + + local all_heavynodes_compatible=true + while IFS=$'\t' read -r node current_version; do + [[ -z "$node" ]] && continue + if ! es_version_can_upgrade_to_target "$current_version"; then + echo "Heavynode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version." + all_heavynodes_compatible=false + fi + done < <(echo "$HEAVYNODE_ES_VERSIONS" | jq -r 'to_entries[] | [.key, .value] | @tsv') + + if [[ "$all_heavynodes_compatible" == true ]]; then + echo -e "\nAll heavynodes can upgrade to Elasticsearch $target_es_version." + return 0 + fi + + echo "One or more heavynodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + done + + return 1 + } + + if [[ ! -f "$es_verification_script" ]]; then + create_intermediate_upgrade_verification_script "$es_verification_script" + fi + for statefile in "${es_required_version_statefile_base}"-*; do [[ -f $statefile ]] || continue @@ -1012,10 +1125,6 @@ verify_es_version_compatibility() { continue fi - if [[ ! 
-f "$es_verification_script" ]]; then - create_intermediate_upgrade_verification_script "$es_verification_script" - fi - echo -e "\n##############################################################################################################################\n" echo "A previously required intermediate Elasticsearch upgrade was detected. Verifying that all Searchnodes/Heavynodes have successfully upgraded Elasticsearch to $es_required_version_statefile_value before proceeding with soup to avoid potential data loss! This command can take up to an hour to complete." if ! timeout --foreground 4000 bash "$es_verification_script" "$es_required_version_statefile_value" "$statefile"; then @@ -1037,6 +1146,26 @@ verify_es_version_compatibility() { # shellcheck disable=SC2076 # Do not want a regex here eg usage " 8.18.8 9.0.8 " =~ " 9.0.8 " if [[ " ${es_upgrade_map[$es_version]} " =~ " $target_es_version " || "$es_version" == "$target_es_version" ]]; then + if ! verify_searchnodes_es_target_compatibility || ! verify_heavynodes_es_target_compatibility; then + echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" + + echo "One or more Searchnode(s)/Heavynode(s) cannot upgrade directly to Elasticsearch $target_es_version. This can happen with soups that include Elasticsearch upgrades being run in quick succession. Typically, this will resolve itself as the grid synchronizes. Please allow time for all Searchnodes/Heavynodes to have upgraded Elasticsearch to a compatible version with $target_es_version before running soup again to avoid potential data loss!" + + if [[ -n "$HEAVYNODE_ES_VERSIONS" ]]; then + echo "Current heavynode Elasticsearch versions:" + echo "$HEAVYNODE_ES_VERSIONS" | jq '.' 
+ fi + + if [[ -n "$SEARCHNODE_ES_VERSIONS" ]]; then + echo "Current searchnode Elasticsearch versions:" + echo "$SEARCHNODE_ES_VERSIONS" | jq '.nodes | to_entries | map({(.value.name): .value.version}) | sort | add' + fi + + echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" + + exit 161 + fi + # supported upgrade return 0 else @@ -1394,7 +1523,7 @@ main() { echo "Verifying we have the latest soup script." verify_latest_update_script - echo "Verifying Elasticsearch version compatibility before upgrading." + echo "Verifying Elasticsearch version compatibility across the grid before upgrading." verify_es_version_compatibility echo "Let's see if we need to update Security Onion." From d7a1b67095f630b256b0fc716e24b70ab1d39c13 Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Wed, 20 May 2026 09:16:57 -0500 Subject: [PATCH 2/4] use pipefail on heavynode version command to pass through error --- salt/manager/tools/sbin/soup | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt/manager/tools/sbin/soup b/salt/manager/tools/sbin/soup index e6a14607e..d21599ad8 100755 --- a/salt/manager/tools/sbin/soup +++ b/salt/manager/tools/sbin/soup @@ -1071,7 +1071,7 @@ verify_es_version_compatibility() { HEAVYNODE_ES_VERSIONS="" while [[ $retry_count -lt $retries ]]; do - HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -r ".version.number"' shell=/bin/bash --out=json 2> /dev/null) + HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'set -o pipefail; so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -er ".version.number"' shell=/bin/bash --out=json 2> /dev/null) local exit_status=$? 
if [[ $exit_status -ne 0 ]]; then From 7d13007aa9f364eebece1ecae9c561bda6e3c2ed Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Wed, 20 May 2026 10:03:37 -0500 Subject: [PATCH 3/4] block soup if all ES nodes are not online and reporting their ES version for compatibility check --- salt/manager/tools/sbin/soup | 53 ++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/salt/manager/tools/sbin/soup b/salt/manager/tools/sbin/soup index d21599ad8..8a68e5242 100755 --- a/salt/manager/tools/sbin/soup +++ b/salt/manager/tools/sbin/soup @@ -1021,9 +1021,24 @@ verify_es_version_compatibility() { local retries=20 local retry_count=0 local delay=180 + local expected_es_nodes + local searchnode_minions SEARCHNODE_ES_VERSIONS="" while [[ $retry_count -lt $retries ]]; do + if ! searchnode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("searchnode"))'); then + echo "Failed to retrieve grid searchnodes via salt-key... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + continue + fi + # Always add node running soup to expected es nodes + expected_es_nodes="${MINIONID%_*}" + while IFS= read -r searchnode_minion; do + [[ -z "$searchnode_minion" ]] && continue + expected_es_nodes+=$'\n'"${searchnode_minion%_searchnode}" + done <<< "$searchnode_minions" + SEARCHNODE_ES_VERSIONS=$(so-elasticsearch-query _nodes/_all/version --retry 5 --retry-delay 10 --fail 2>&1) local exit_status=$? @@ -1043,6 +1058,14 @@ verify_es_version_compatibility() { fi done < <(echo "$SEARCHNODE_ES_VERSIONS" | jq -r '.nodes | to_entries[] | [.value.name, .value.version] | @tsv') + while IFS= read -r expected_es_node; do + [[ -z "$expected_es_node" ]] && continue + if ! 
echo "$SEARCHNODE_ES_VERSIONS" | jq -e --arg node "$expected_es_node" '.nodes | to_entries | any(.value.name == $node)' > /dev/null; then + echo "Searchnode $expected_es_node did not report an Elasticsearch version. It may be offline or still upgrading." + all_searchnodes_compatible=false + fi + done <<< "$expected_es_nodes" + if [[ "$all_searchnodes_compatible" == true ]]; then echo "All Searchnodes are upgradable to Elasticsearch $target_es_version." return 0 @@ -1056,21 +1079,27 @@ verify_es_version_compatibility() { return 1 } - # Gather heavynode version info and verify that each node is running a version compatible with the target ES version. + # Gather heavynode version info and verify that each node is running a version compatible with the target ES version. verify_heavynodes_es_target_compatibility() { - if ! salt-key -l accepted | grep -q 'heavynode$'; then - echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check." - return 0 - fi - - echo -e "\nOne or more heavynodes detected. Verifying each is running an Elasticsearch version that is compatible with $target_es_version." - + local heavynode_minions local retries=20 local retry_count=0 local delay=180 HEAVYNODE_ES_VERSIONS="" while [[ $retry_count -lt $retries ]]; do + if ! heavynode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("heavynode"))'); then + echo "Failed to retrieve grid heavynodes via salt-key... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + continue + fi + + if [[ -z "$heavynode_minions" ]]; then + echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check." + return 0 + fi + HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'set -o pipefail; so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -er ".version.number"' shell=/bin/bash --out=json 2> /dev/null) local exit_status=$? 
@@ -1090,6 +1119,14 @@ verify_es_version_compatibility() { fi done < <(echo "$HEAVYNODE_ES_VERSIONS" | jq -r 'to_entries[] | [.key, .value] | @tsv') + while IFS= read -r heavynode_minion; do + [[ -z "$heavynode_minion" ]] && continue + if ! echo "$HEAVYNODE_ES_VERSIONS" | jq -e --arg minion "$heavynode_minion" 'has($minion)' > /dev/null; then + echo "Heavynode $heavynode_minion did not report an Elasticsearch version. It may be offline or still upgrading." + all_heavynodes_compatible=false + fi + done <<< "$heavynode_minions" + if [[ "$all_heavynodes_compatible" == true ]]; then echo -e "\nAll heavynodes can upgrade to Elasticsearch $target_es_version." return 0 From b485be460204202628a5b2e7d349c42c803add5b Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Wed, 20 May 2026 14:12:58 -0500 Subject: [PATCH 4/4] separate salt-key command from main es version compatibility loop --- salt/manager/tools/sbin/soup | 72 ++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/salt/manager/tools/sbin/soup b/salt/manager/tools/sbin/soup index 8a68e5242..3bec13716 100755 --- a/salt/manager/tools/sbin/soup +++ b/salt/manager/tools/sbin/soup @@ -1021,24 +1021,33 @@ verify_es_version_compatibility() { local retries=20 local retry_count=0 local delay=180 - local expected_es_nodes - local searchnode_minions + local expected_es_nodes searchnode_minions attempt + local searchnode_discovery_success=false SEARCHNODE_ES_VERSIONS="" - while [[ $retry_count -lt $retries ]]; do - if ! searchnode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("searchnode"))'); then - echo "Failed to retrieve grid searchnodes via salt-key... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries." 
- ((retry_count++)) - sleep $delay - continue + for attempt in {1..3}; do + if searchnode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("searchnode"))'); then + searchnode_discovery_success=true + break fi - # Always add node running soup to expected es nodes - expected_es_nodes="${MINIONID%_*}" - while IFS= read -r searchnode_minion; do - [[ -z "$searchnode_minion" ]] && continue - expected_es_nodes+=$'\n'"${searchnode_minion%_searchnode}" - done <<< "$searchnode_minions" + echo "Failed to retrieve grid searchnodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3." + sleep 30 + done + + if [[ "$searchnode_discovery_success" != "true" ]]; then + echo "Failed to retrieve grid searchnodes via salt-key." + return 1 + fi + + # Always add node running soup to expected es nodes + expected_es_nodes="${MINIONID%_*}" + while IFS= read -r searchnode_minion; do + [[ -z "$searchnode_minion" ]] && continue + expected_es_nodes+=$'\n'"${searchnode_minion%_searchnode}" + done <<< "$searchnode_minions" + + while [[ $retry_count -lt $retries ]]; do SEARCHNODE_ES_VERSIONS=$(so-elasticsearch-query _nodes/_all/version --retry 5 --retry-delay 10 --fail 2>&1) local exit_status=$? @@ -1081,25 +1090,34 @@ verify_es_version_compatibility() { # Gather heavynode version info and verify that each node is running a version compatible with the target ES version. verify_heavynodes_es_target_compatibility() { - local heavynode_minions + local heavynode_minions attempt local retries=20 local retry_count=0 local delay=180 + local heavynode_discovery_success=false HEAVYNODE_ES_VERSIONS="" + for attempt in {1..3}; do + if heavynode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("heavynode"))'); then + heavynode_discovery_success=true + break + fi + + echo "Failed to retrieve grid heavynodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3." 
+ sleep 30 + done + + if [[ "$heavynode_discovery_success" != "true" ]]; then + echo "Failed to retrieve grid heavynodes via salt-key." + return 1 + fi + + if [[ -z "$heavynode_minions" ]]; then + echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check." + return 0 + fi + while [[ $retry_count -lt $retries ]]; do - if ! heavynode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("heavynode"))'); then - echo "Failed to retrieve grid heavynodes via salt-key... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries." - ((retry_count++)) - sleep $delay - continue - fi - - if [[ -z "$heavynode_minions" ]]; then - echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check." - return 0 - fi - HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'set -o pipefail; so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -er ".version.number"' shell=/bin/bash --out=json 2> /dev/null) local exit_status=$?