From 564815e836e36c31ddbecd48b458d6e85be20b19 Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Fri, 24 Apr 2026 10:46:29 -0400 Subject: [PATCH 1/3] redo how services are stopped during reinstall --- setup/so-functions | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index bf95ea9d8..a31c6f330 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1547,8 +1547,6 @@ reinstall_init() { local salt_services=( "salt-minion" ) fi - local service_retry_count=20 - { # remove all of root's cronjobs crontab -r -u root @@ -1563,31 +1561,30 @@ reinstall_init() { salt-call state.apply ca.remove -linfo --local --file-root=../salt - # Kill any salt processes (safely) + # Stop salt services, then force-kill any lingering salt processes so dnf remove salt can run cleanly for service in "${salt_services[@]}"; do - # Stop the service in the background so we can exit after a certain amount of time - if check_service_status "$service"; then - systemctl stop "$service" & + if ! check_service_status "$service"; then + continue + fi + local service_pid + service_pid=$(pgrep -f "/usr/bin/${service}" | head -1) + info "Stopping $service (pid=${service_pid:-none})" + systemctl stop "$service" + if [[ -n "$service_pid" ]] && ps -p "$service_pid" > /dev/null 2>&1; then + timeout 30 tail --pid="$service_pid" -f /dev/null || { + info "$service (pid $service_pid) still alive after 30s, force-killing" + pkill -9 -ef "/usr/bin/${service}" + } fi - local pid=$! - - local count=0 - while check_service_status "$service"; do - if [[ $count -gt $service_retry_count ]]; then - echo "Could not stop $service after 1 minute, exiting setup." - - # Stop the systemctl process trying to kill the service, show user a message, then exit setup - kill -9 $pid - fail_setup - fi - - sleep 5 - ((count++)) - done done + # Catch any stray salt-call / salt CLI children that weren't parented to the service cgroup + pkill -9 -ef "/usr/bin/salt-call" 2>/dev/null + pkill -9 -ef "/usr/bin/python3 /bin/salt" 2>/dev/null + # Remove all salt configs - rm -rf /etc/salt/engines/* /etc/salt/grains /etc/salt/master /etc/salt/master.d/* /etc/salt/minion /etc/salt/minion.d/* /etc/salt/pki/* /etc/salt/proxy /etc/salt/proxy.d/* /var/cache/salt/ + dnf -y remove salt + rm -rf /etc/salt/ /var/cache/salt/ if command -v docker &> /dev/null; then # Stop and remove all so-* containers so files can be changed with more safety From 0722b681b1a06925db37ddea287f53697717f91d Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Fri, 24 Apr 2026 11:04:46 -0400 Subject: [PATCH 2/3] redo service stop on reinstall --- setup/so-functions | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index a31c6f330..61601aee7 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1561,27 +1561,39 @@ reinstall_init() { salt-call state.apply ca.remove -linfo --local --file-root=../salt - # Stop salt services, then force-kill any lingering salt processes so dnf remove salt can run cleanly + # Stop salt services and force-kill any lingering salt processes (including orphans + # from an earlier reinstall attempt where the unit file is gone but processes survive) + # so dnf remove salt can run cleanly for service in "${salt_services[@]}"; do - if ! check_service_status "$service"; then - continue - fi - local service_pid - service_pid=$(pgrep -f "/usr/bin/${service}" | head -1) - info "Stopping $service (pid=${service_pid:-none})" - systemctl stop "$service" - if [[ -n "$service_pid" ]] && ps -p "$service_pid" > /dev/null 2>&1; then - timeout 30 tail --pid="$service_pid" -f /dev/null || { - info "$service (pid $service_pid) still alive after 30s, force-killing" - pkill -9 -ef "/usr/bin/${service}" - } + if check_service_status "$service"; then + info "Stopping $service via systemctl" + systemctl stop "$service" fi done - # Catch any stray salt-call / salt CLI children that weren't parented to the service cgroup - pkill -9 -ef "/usr/bin/salt-call" 2>/dev/null + # Unconditionally force-kill any remaining salt binaries — these may be orphaned + # from a prior aborted reinstall (no unit file, so systemctl can't see them). + for salt_bin in salt-master salt-minion salt-call salt-api salt-syndic; do + if pgrep -f "/usr/bin/${salt_bin}" > /dev/null 2>&1; then + info "Force-killing lingering $salt_bin processes" + pkill -9 -ef "/usr/bin/${salt_bin}" 2>/dev/null + fi + done + # Catch stray `salt` CLI children from saltutil.kill_all_jobs / state.apply invocations pkill -9 -ef "/usr/bin/python3 /bin/salt" 2>/dev/null + # Give the kernel a moment to reap the killed processes before dnf removes the binaries + local kill_wait=0 + while pgrep -f "/usr/bin/salt-" > /dev/null 2>&1; do + if [[ $kill_wait -gt 10 ]]; then + info "Salt processes still present after SIGKILL + 10s wait; proceeding anyway" + pgrep -af "/usr/bin/salt-" | while read -r line; do info " lingering: $line"; done + break + fi + sleep 1 + ((kill_wait++)) + done + # Remove all salt configs dnf -y remove salt rm -rf /etc/salt/ /var/cache/salt/ From 02381fbbe9c28089e590a9457cc3b64ba720b56f Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Fri, 24 Apr 2026 11:33:21 -0400 Subject: [PATCH 3/3] stop salt-cloud , belt-and-suspenders against a broken/incomplete salt RPM --- setup/so-functions | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/setup/so-functions b/setup/so-functions index 61601aee7..ca58dbbcb 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1573,7 +1573,7 @@ reinstall_init() { # Unconditionally force-kill any remaining salt binaries — these may be orphaned # from a prior aborted reinstall (no unit file, so systemctl can't see them). - for salt_bin in salt-master salt-minion salt-call salt-api salt-syndic; do + for salt_bin in salt-master salt-minion salt-call salt-cloud; do if pgrep -f "/usr/bin/${salt_bin}" > /dev/null 2>&1; then info "Force-killing lingering $salt_bin processes" pkill -9 -ef "/usr/bin/${salt_bin}" 2>/dev/null @@ -1594,10 +1594,16 @@ reinstall_init() { ((kill_wait++)) done + # Clear the 'failed' state SIGKILL left on the units before removing the package + systemctl reset-failed salt-master.service salt-minion.service 2>/dev/null || true + # Remove all salt configs dnf -y remove salt rm -rf /etc/salt/ /var/cache/salt/ + # Drop systemd's in-memory references to the now-removed units + systemctl daemon-reload + if command -v docker &> /dev/null; then # Stop and remove all so-* containers so files can be changed with more safety if [[ $(docker ps -a -q --filter "name=so-" | wc -l) -gt 0 ]]; then