diff --git a/salt/manager/tools/sbin/soup b/salt/manager/tools/sbin/soup
index bd3048019..e6a14607e 100755
--- a/salt/manager/tools/sbin/soup
+++ b/salt/manager/tools/sbin/soup
@@ -971,6 +971,9 @@ verify_es_version_compatibility() {
     local is_active_intermediate_upgrade=1
     # supported upgrade paths for SO-ES versions
     declare -A es_upgrade_map=(
+        ["8.18.4"]="8.18.6 8.18.8 9.0.8"
+        ["8.18.6"]="8.18.8 9.0.8"
+        ["8.18.8"]="9.0.8"
         ["9.0.8"]="9.3.3"
     )
 
@@ -994,6 +997,147 @@ verify_es_version_compatibility() {
         exit 160
     fi
 
+    # Space-separated list of ES versions that can move straight to the target:
+    # the target itself plus every es_upgrade_map key whose value lists the target.
+    compatible_es_versions="$target_es_version"
+    for current_version in "${!es_upgrade_map[@]}"; do
+        # shellcheck disable=SC2076
+        if [[ " ${es_upgrade_map[$current_version]} " =~ " $target_es_version " ]]; then
+            compatible_es_versions+=" $current_version"
+        fi
+    done
+
+    # Check if the given ES version can directly upgrade to the target ES version. Used to assist with catching lagging nodes during the upgrade process
+    # Arguments: $1 - Elasticsearch version reported by a node (may be empty)
+    # Returns:   0 if $1 is non-empty and listed in compatible_es_versions, 1 otherwise
+    es_version_can_upgrade_to_target() {
+        local current_version="$1"
+        # shellcheck disable=SC2076
+        if [[ -n "$current_version" && " $compatible_es_versions " =~ " $current_version " ]]; then
+            return 0
+        fi
+
+        return 1
+    }
+
+    # Gather Elasticsearch cluster version info and verify that each node in the cluster is running a version compatible with the target ES version.
+    # Globals:   SEARCHNODE_ES_VERSIONS (written) - raw output of the last version query; printed by the caller's failure path
+    # Returns:   0 once at least one node was parsed and every parsed node is compatible; 1 after $retries unsuccessful attempts
+    verify_searchnodes_es_target_compatibility() {
+        local retries=20
+        local retry_count=0
+        local delay=180
+        SEARCHNODE_ES_VERSIONS=""
+
+        while [[ $retry_count -lt $retries ]]; do
+            SEARCHNODE_ES_VERSIONS=$(so-elasticsearch-query _nodes/_all/version --retry 5 --retry-delay 10 --fail 2>&1)
+            local exit_status=$?
+
+            if [[ $exit_status -ne 0 ]]; then
+                echo "Failed to retrieve Elasticsearch versions from searchnodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+                ((retry_count++))
+                sleep $delay
+                continue
+            fi
+
+            local all_searchnodes_compatible=true
+            local searchnodes_checked=0
+            while IFS=$'\t' read -r node current_version; do
+                [[ -z "$node" ]] && continue
+                searchnodes_checked=$((searchnodes_checked + 1))
+                if ! es_version_can_upgrade_to_target "$current_version"; then
+                    echo "Searchnode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version."
+                    all_searchnodes_compatible=false
+                fi
+            done < <(echo "$SEARCHNODE_ES_VERSIONS" | jq -r '.nodes | to_entries[] | [.value.name, .value.version] | @tsv')
+
+            # If jq could not parse the response, or it listed no nodes, the loop above never ran.
+            # Count that as a failed attempt instead of reporting every node as compatible.
+            if [[ $searchnodes_checked -eq 0 ]]; then
+                echo "Unable to parse any Elasticsearch versions from the searchnode response... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+                ((retry_count++))
+                sleep $delay
+                continue
+            fi
+
+            if [[ "$all_searchnodes_compatible" == true ]]; then
+                echo "All Searchnodes are upgradable to Elasticsearch $target_es_version."
+                return 0
+            fi
+
+            echo "One or more Searchnodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+            ((retry_count++))
+            sleep $delay
+        done
+
+        return 1
+    }
+
+    # Gather heavynode version info and verify that each node is running a version compatible with the target ES version.
+    # Globals:   HEAVYNODE_ES_VERSIONS (written) - raw salt JSON output of the last version query; printed by the caller's failure path
+    # Returns:   0 when no accepted heavynode key exists, or at least one heavynode was parsed and all parsed are compatible; 1 after $retries unsuccessful attempts
+    verify_heavynodes_es_target_compatibility() {
+        if ! salt-key -l accepted | grep -q 'heavynode$'; then
+            echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check."
+            return 0
+        fi
+
+        echo -e "\nOne or more heavynodes detected. Verifying each is running an Elasticsearch version that is compatible with $target_es_version."
+
+        local retries=20
+        local retry_count=0
+        local delay=180
+        HEAVYNODE_ES_VERSIONS=""
+
+        while [[ $retry_count -lt $retries ]]; do
+            HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -r ".version.number"' shell=/bin/bash --out=json 2> /dev/null)
+            local exit_status=$?
+
+            if [[ $exit_status -ne 0 ]]; then
+                echo "Failed to retrieve Elasticsearch version from one or more heavynodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+                ((retry_count++))
+                sleep $delay
+                continue
+            fi
+
+            local all_heavynodes_compatible=true
+            local heavynodes_checked=0
+            while IFS=$'\t' read -r node current_version; do
+                [[ -z "$node" ]] && continue
+                heavynodes_checked=$((heavynodes_checked + 1))
+                if ! es_version_can_upgrade_to_target "$current_version"; then
+                    echo "Heavynode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version."
+                    all_heavynodes_compatible=false
+                fi
+            done < <(echo "$HEAVYNODE_ES_VERSIONS" | jq -r 'to_entries[] | [.key, .value] | @tsv')
+
+            # Heavynode keys were found above, so zero parsed results means the salt output was empty or unusable.
+            # Count that as a failed attempt instead of reporting every heavynode as compatible.
+            if [[ $heavynodes_checked -eq 0 ]]; then
+                echo "Unable to parse any Elasticsearch versions from the heavynode response... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+                ((retry_count++))
+                sleep $delay
+                continue
+            fi
+
+            if [[ "$all_heavynodes_compatible" == true ]]; then
+                echo -e "\nAll heavynodes can upgrade to Elasticsearch $target_es_version."
+                return 0
+            fi
+
+            echo "One or more heavynodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries."
+            ((retry_count++))
+            sleep $delay
+        done
+
+        return 1
+    }
+
+    # Create the verification script once, before the statefile loop below runs it.
+    if [[ ! -f "$es_verification_script" ]]; then
+        create_intermediate_upgrade_verification_script "$es_verification_script"
+    fi
+
     for statefile in "${es_required_version_statefile_base}"-*; do
         [[ -f $statefile ]] || continue
 
@@ -1012,10 +1156,6 @@ verify_es_version_compatibility() {
             continue
         fi
 
-        if [[ ! -f "$es_verification_script" ]]; then
-            create_intermediate_upgrade_verification_script "$es_verification_script"
-        fi
-
         echo -e "\n##############################################################################################################################\n"
         echo "A previously required intermediate Elasticsearch upgrade was detected. Verifying that all Searchnodes/Heavynodes have successfully upgraded Elasticsearch to $es_required_version_statefile_value before proceeding with soup to avoid potential data loss! This command can take up to an hour to complete."
         if ! timeout --foreground 4000 bash "$es_verification_script" "$es_required_version_statefile_value" "$statefile"; then
@@ -1037,6 +1177,27 @@ verify_es_version_compatibility() {
 
     # shellcheck disable=SC2076 # Do not want a regex here eg usage " 8.18.8 9.0.8 " =~ " 9.0.8 "
     if [[ " ${es_upgrade_map[$es_version]} " =~ " $target_es_version " || "$es_version" == "$target_es_version" ]]; then
+        # Every searchnode and heavynode must also be on a version that can go straight to the target; otherwise stop with exit code 161.
+        if ! verify_searchnodes_es_target_compatibility || ! verify_heavynodes_es_target_compatibility; then
+            echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
+
+            echo "One or more Searchnode(s)/Heavynode(s) cannot upgrade directly to Elasticsearch $target_es_version. This can happen with soups that include Elasticsearch upgrades being run in quick succession. Typically, this will resolve itself as the grid synchronizes. Please allow time for all Searchnodes/Heavynodes to have upgraded Elasticsearch to a compatible version with $target_es_version before running soup again to avoid potential data loss!"
+
+            if [[ -n "$HEAVYNODE_ES_VERSIONS" ]]; then
+                echo "Current heavynode Elasticsearch versions:"
+                echo "$HEAVYNODE_ES_VERSIONS" | jq '.'
+            fi
+
+            if [[ -n "$SEARCHNODE_ES_VERSIONS" ]]; then
+                echo "Current searchnode Elasticsearch versions:"
+                echo "$SEARCHNODE_ES_VERSIONS" | jq '.nodes | to_entries | map({(.value.name): .value.version}) | sort | add'
+            fi
+
+            echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
+
+            exit 161
+        fi
+
         # supported upgrade
         return 0
     else
@@ -1394,7 +1555,7 @@ main() {
   echo "Verifying we have the latest soup script."
   verify_latest_update_script
 
-  echo "Verifying Elasticsearch version compatibility before upgrading."
+  echo "Verifying Elasticsearch version compatibility across the grid before upgrading."
   verify_es_version_compatibility
 
   echo "Let's see if we need to update Security Onion."