diff --git a/salt/common/tools/sbin/so-common b/salt/common/tools/sbin/so-common
index aca8496f5..4bccfdbb9 100755
--- a/salt/common/tools/sbin/so-common
+++ b/salt/common/tools/sbin/so-common
@@ -162,6 +162,37 @@ check_salt_master_status() {
   return 0
 }
 
+# Wait until $minion shows up in the salt master's unaccepted-keys list.
+# Used after saltify on a reinstall to replace the old `sleep 2 / state.show_top /
+# sleep 2` dance — the new minion's key takes longer to appear than 2s on
+# salt 3006.x and the subsequent salt-key -ya needs something to accept.
+# Arguments:
+#   $1 - minion id to look for in the pending (pre) key list
+#   $2 - maximum number of polls (default 30)
+#   $3 - seconds to sleep between polls (default 2)
+# Returns 0 as soon as the key is pending, 1 once all attempts are used up
+# (roughly attempts*delay seconds).
+wait_for_minion_key_pending() {
+  local minion="$1"
+  local attempts="${2:-30}"
+  local delay="${3:-2}"
+  local count=0
+  # The minion id is handed to python as argv[1] instead of being spliced into
+  # the program text, so a quote or backslash in the id cannot break (or
+  # inject into) the snippet and be mistaken for "key not pending yet".
+  while ! salt-key -l pre --out=json 2>/dev/null \
+    | python3 -c 'import json, sys; d = json.load(sys.stdin); sys.exit(0 if sys.argv[1] in d.get("minions_pre", []) else 1)' "$minion" 2>/dev/null; do
+    ((count+=1))
+    if [[ $count -ge $attempts ]]; then
+      echo "Gave up waiting for $minion to appear in salt-master's pending keys"
+      return 1
+    fi
+    sleep "$delay"
+  done
+  echo "Minion $minion is pending acceptance after $((count * delay))s"
+  return 0
+}
+
 # this is only intended to be used to check the status of the minion from a salt master
 check_salt_minion_status() {
   local minion="$1"
diff --git a/setup/so-functions b/setup/so-functions
index 2b7acf9a9..c7176cc4a 100755
--- a/setup/so-functions
+++ b/setup/so-functions
@@ -1550,6 +1550,19 @@ reinstall_init() {
 	local service_retry_count=20
 
 	{
+		# Snapshot pre-reinstall salt state before any destructive step so a
+		# failed reinstall leaves a usable post-mortem in the setup log.
+		echo "=== pre-reinstall salt diagnostic $(date -Iseconds) ==="
+		systemctl status salt-master --no-pager 2>&1 | head -40 || true
+		systemctl status salt-minion --no-pager 2>&1 | head -40 || true
+		journalctl -u salt-master --no-pager --since "-10 minutes" 2>&1 | tail -80 || true
+		journalctl -u salt-minion --no-pager --since "-10 minutes" 2>&1 | tail -80 || true
+		ls -laR /etc/salt 2>&1 | head -60 || true
+		ls -la /var/cache/salt 2>&1 | head -40 || true
+		[[ -f /etc/salt/master.rpmnew ]] && diff -u /etc/salt/master /etc/salt/master.rpmnew 2>&1 | head -80 || true
+		[[ -f /etc/salt/minion.rpmnew ]] && diff -u /etc/salt/minion /etc/salt/minion.rpmnew 2>&1 | head -40 || true
+		echo "=== end diagnostic ==="
+
 		# remove all of root's cronjobs
 		crontab -r -u root
 
@@ -1580,7 +1593,7 @@ reinstall_init() {
 					kill -9 $pid
 					fail_setup
 				fi
-				
+
 				sleep 5
 				((count++))
 			done
diff --git a/setup/so-setup b/setup/so-setup
index 46b11fc11..7b5abd61f 100755
--- a/setup/so-setup
+++ b/setup/so-setup
@@ -724,10 +724,18 @@ if ! [[ -f $install_opt_file ]]; then
     # Install salt
     saltify
     check_sos_appliance
+    # Wait for salt-master to be actually running and have its PKI
+    # ready after a fresh saltify. Without this, salt-key operations
+    # silently race the daemon and the key accept no-ops, which is
+    # what was causing reinstalls on 3.x to hang on state.show_top.
+    retry 30 2 "test -f /etc/salt/pki/master/master.pub" \
+      || fail "salt-master did not initialize PKI after saltify"
+    check_salt_master_status \
+      || fail "salt-master not accepting calls after saltify"
+
     logCmd "salt-key -yd $MINION_ID"
-    sleep 2 # Debug RSA Key format errors
-    logCmd "salt-call state.show_top"
-    sleep 2 # Debug RSA Key format errors
+    wait_for_minion_key_pending "$MINION_ID" 30 2 \
+      || fail "salt-minion never presented its key to salt-master"
     logCmd "salt-key -ya $MINION_ID"
     logCmd "salt-call saltutil.sync_all"
     # we need to sync the runner and generate the soqemussh user keys so that first highstate after license created