From af8295a65130894f1b8984c3097864548ffb7c87 Mon Sep 17 00:00:00 2001 From: William Wernert Date: Wed, 2 Dec 2020 17:07:49 -0500 Subject: [PATCH 01/10] [refactor] systemctl stop -> kill --- setup/so-functions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup/so-functions b/setup/so-functions index 6aa30f89c..4103f0988 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1400,7 +1400,7 @@ reinstall_init() { # Kill any salt processes (safely) for service in "${salt_services[@]}"; do # Stop the service in the background so we can exit after a certain amount of time - systemctl stop "$service" & + systemctl kill "$service" & local pid=$! local count=0 From 76fff28dfa5a85f217435e7cec3018e1915b9876 Mon Sep 17 00:00:00 2001 From: William Wernert Date: Thu, 3 Dec 2020 10:18:44 -0500 Subject: [PATCH 02/10] [fix] Correct logic for service check + bash trap --- setup/so-functions | 2 +- setup/so-setup | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index 4103f0988..76e579765 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1404,7 +1404,7 @@ reinstall_init() { local pid=$! local count=0 - while check_service_status "$service"; do + while ! (check_service_status "$service"); do if [[ $count -gt $service_retry_count ]]; then echo "Could not stop $service after 1 minute, exiting setup." diff --git a/setup/so-setup b/setup/so-setup index 79ba916a9..924bdf307 100755 --- a/setup/so-setup +++ b/setup/so-setup @@ -485,14 +485,12 @@ fi # Exit parent script if -trap 'catch $? $LINENO' SIGUSR1 +trap 'catch $LINENO' SIGUSR1 catch() { - if [ "$1" != 0 ]; then - info "Fatal error occurred at $2 in so-setup, failing setup." - whiptail_setup_failed - exit - fi + info "Fatal error occurred at $2 in so-setup, failing setup." 
+ whiptail_setup_failed + exit } # Begin install From 2c208ec943a8fc45f912731076009c99bf19503a Mon Sep 17 00:00:00 2001 From: William Wernert Date: Thu, 3 Dec 2020 10:31:45 -0500 Subject: [PATCH 03/10] [fix] kill -> stop, add indent to service check, revert incorrect logic --- setup/so-functions | 8 ++++---- setup/so-setup | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index 76e579765..767ca6288 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -259,10 +259,10 @@ check_service_status() { systemctl status $service_name > /dev/null 2>&1 local status=$? if [ $status -gt 0 ]; then - echo "$service_name is not running" >> "$setup_log" 2>&1 + echo " $service_name is not running" >> "$setup_log" 2>&1 return 1; else - echo "$service_name is running" >> "$setup_log" 2>&1 + echo " $service_name is running" >> "$setup_log" 2>&1 return 0; fi @@ -1400,11 +1400,11 @@ reinstall_init() { # Kill any salt processes (safely) for service in "${salt_services[@]}"; do # Stop the service in the background so we can exit after a certain amount of time - systemctl kill "$service" & + systemctl stop "$service" & local pid=$! local count=0 - while ! (check_service_status "$service"); do + while check_service_status "$service"; do if [[ $count -gt $service_retry_count ]]; then echo "Could not stop $service after 1 minute, exiting setup." 
diff --git a/setup/so-setup b/setup/so-setup index 924bdf307..3bec2bb87 100755 --- a/setup/so-setup +++ b/setup/so-setup @@ -499,11 +499,11 @@ catch() { export percentage=0 set_path - if [[ $is_manager && $is_airgap ]]; then - info "Creating airgap repo" - create_repo >> $setup_log 2>&1 + if [[ $is_manager && $is_airgap ]]; then + info "Creating airgap repo" + create_repo >> $setup_log 2>&1 airgap_rules >> $setup_log 2>&1 - fi + fi if [[ $is_minion ]]; then set_progress_str 1 'Configuring firewall' From 80ce8b5e41c6573bde3de15ef636ecc8f26c1d81 Mon Sep 17 00:00:00 2001 From: William Wernert Date: Thu, 3 Dec 2020 13:59:25 -0500 Subject: [PATCH 04/10] [refactor] Run all changes inside whiptail progress, use grep -q --- setup/so-functions | 4 +- setup/so-setup | 117 +++++++++++++++++++++++---------------------- 2 files changed, 62 insertions(+), 59 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index 767ca6288..8c23441ed 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -766,12 +766,12 @@ detect_os() { disable_auto_start() { - if crontab -l -u $INSTALLUSERNAME 2>&1 | grep so-setup > /dev/null 2>&1; then + if crontab -l -u $INSTALLUSERNAME 2>&1 | grep -q so-setup; then # Remove the automated setup script from crontab, if it exists logCmd "crontab -u $INSTALLUSERNAME -r" fi - if grep so-setup /home/$INSTALLUSERNAME/.bash_profile > /dev/null 2>&1; then + if grep -q so-setup /home/$INSTALLUSERNAME/.bash_profile; then # Truncate last line of the bash profile info "Removing auto-run of setup from bash profile" sed -i '$ d' /home/$INSTALLUSERNAME/.bash_profile >> "$setup_log" 2>&1 diff --git a/setup/so-setup b/setup/so-setup index 3bec2bb87..73363959c 100755 --- a/setup/so-setup +++ b/setup/so-setup @@ -428,63 +428,7 @@ whiptail_make_changes # From here on changes will be made. 
echo "1" > /root/accept_changes -if [[ $is_reinstall ]]; then - reinstall_init -fi - -if [[ -n "$TURBO" ]]; then - use_turbo_proxy -fi - -if [[ "$setup_type" == 'iso' ]]; then - # Init networking so rest of install works - set_hostname - set_management_interface -fi - -disable_ipv6 -disable_auto_start - -if [[ "$setup_type" != 'iso' ]]; then - set_hostname -fi - -if [[ $is_minion ]]; then - add_mngr_ip_to_hosts -fi - -{ - mark_version; - clear_manager; -} >> $setup_log 2>&1 - - -if [[ $is_manager || $is_import ]]; then - { - generate_passwords; - secrets_pillar; - add_socore_user_manager; - } >> $setup_log 2>&1 -fi - -if [[ $is_manager && ! $is_eval ]]; then - add_soremote_user_manager >> $setup_log 2>&1 -fi - -{ - set_main_ip; - set_redirect; -} >> $setup_log 2>&1 - -host_pillar >> $setup_log 2>&1 - -if [[ $is_minion || $is_import ]]; then - set_updates >> $setup_log 2>&1 - [ "$automated" == no ] && copy_ssh_key >> $setup_log 2>&1 -fi - - -# Exit parent script if +# Set up handler for setup to exit early (use `kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1` in child scripts) trap 'catch $LINENO' SIGUSR1 catch() { @@ -497,8 +441,67 @@ catch() { { # Set initial percentage to 0 export percentage=0 + + # Show initial progress message + set_progress_str 0 'Running initial configuration steps' + set_path + if [[ $is_reinstall ]]; then + reinstall_init + fi + + if [[ -n "$TURBO" ]]; then + use_turbo_proxy + fi + + if [[ "$setup_type" == 'iso' ]]; then + # Init networking so rest of install works + set_hostname >> $setup_log 2>&1 + set_management_interface + fi + + disable_ipv6 + disable_auto_start + + if [[ "$setup_type" != 'iso' ]]; then + set_hostname >> $setup_log 2>&1 + fi + + if [[ $is_minion ]]; then + add_mngr_ip_to_hosts + fi + + { + mark_version; + clear_manager; + } >> $setup_log 2>&1 + + + if [[ $is_manager || $is_import ]]; then + { + generate_passwords; + secrets_pillar; + add_socore_user_manager; + } >> $setup_log 2>&1 + fi + + if [[ $is_manager && ! 
$is_eval ]]; then + add_soremote_user_manager >> $setup_log 2>&1 + fi + + { + set_main_ip; + set_redirect; + } >> $setup_log 2>&1 + + host_pillar >> $setup_log 2>&1 + + if [[ $is_minion || $is_import ]]; then + set_updates >> $setup_log 2>&1 + [ "$automated" == no ] && copy_ssh_key >> $setup_log 2>&1 + fi + if [[ $is_manager && $is_airgap ]]; then info "Creating airgap repo" create_repo >> $setup_log 2>&1 From 3049718660d92d39492f2cac6433be6b8961d5fc Mon Sep 17 00:00:00 2001 From: William Wernert Date: Thu, 3 Dec 2020 14:42:13 -0500 Subject: [PATCH 05/10] [fix] Kill + start salt-minion if it isn't responding --- setup/so-functions | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index 8c23441ed..b42e03bb7 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1382,10 +1382,12 @@ reserve_group_ids() { reinstall_init() { info "Putting system in state to run setup again" - local salt_services=( - "salt-master" - "salt-minion" - ) + if [[ $install_type =~ ^(MANAGER|EVAL|HELIXSENSOR|MANAGERSEARCH|STANDALONE|FLEET|IMPORT)$ ]]; then + local salt_services=( "salt-master" "salt-minion" ) + else + local salt_services=( "salt-minion" ) + fi + local service_retry_count=20 { @@ -1412,6 +1414,7 @@ reinstall_init() { kill -9 $pid kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 fi + sleep 5 ((count++)) done @@ -1671,6 +1674,8 @@ salt_checkin() { count=0 while ! 
(check_salt_minion_status); do echo "salt master did not get a job response from salt minion" >> "$setup_log" 2>&1 + systemctl kill salt-minion + systemctl start salt-minion if [ $count -gt 30 ]; then echo "salt master did not get a job response from salt minion after 30 attempts, exiting" >> "$setup_log" 2>&1 kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 From b5bfad07dc3c53d6ebe301b29ef33f29437cc1ba Mon Sep 17 00:00:00 2001 From: William Wernert Date: Thu, 3 Dec 2020 14:55:23 -0500 Subject: [PATCH 06/10] [fix] kill/start after if statement --- setup/so-functions | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index b42e03bb7..30399170f 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1674,12 +1674,12 @@ salt_checkin() { count=0 while ! (check_salt_minion_status); do echo "salt master did not get a job response from salt minion" >> "$setup_log" 2>&1 - systemctl kill salt-minion - systemctl start salt-minion if [ $count -gt 30 ]; then echo "salt master did not get a job response from salt minion after 30 attempts, exiting" >> "$setup_log" 2>&1 kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 fi + systemctl kill salt-minion + systemctl start salt-minion sleep 1; ((count++)) done From ac85cbc3f19516901249dcd7902323896cd377fa Mon Sep 17 00:00:00 2001 From: William Wernert Date: Thu, 3 Dec 2020 15:10:41 -0500 Subject: [PATCH 07/10] [fix] Move set_redirect out of sub-shell --- setup/so-setup | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/setup/so-setup b/setup/so-setup index 73363959c..8dcce0e9b 100755 --- a/setup/so-setup +++ b/setup/so-setup @@ -437,6 +437,12 @@ catch() { exit } +# This block sets REDIRECTIT which is used by a function outside the below subshell +{ + set_main_ip; + set_redirect; +} >> $setup_log 2>&1 + # Begin install { # Set initial percentage to 0 @@ -490,11 +496,6 @@ catch() { add_soremote_user_manager >> $setup_log 2>&1 fi - { - set_main_ip; - 
set_redirect; - } >> $setup_log 2>&1 - host_pillar >> $setup_log 2>&1 if [[ $is_minion || $is_import ]]; then From ebade0a5a6a6083ddba30080b336122f0d0ddb64 Mon Sep 17 00:00:00 2001 From: William Wernert Date: Thu, 3 Dec 2020 15:20:33 -0500 Subject: [PATCH 08/10] [fix] Also kill+start while trying to restart service initially --- setup/so-functions | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup/so-functions b/setup/so-functions index 30399170f..e17fa23ce 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1650,7 +1650,8 @@ salt_checkin() { count=0 while ! (check_service_status "$service"); do - echo "$service still not running" >> "$setup_log" 2>&1 + systemctl kill "$service" + systemctl start "$service" if [ $count -gt 120 ]; then echo "$service could not be restarted in 120 seconds, exiting" >> "$setup_log" 2>&1 kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 From 660c768f8f9a4c5ee33ad6f1f1c9fe7f9853580c Mon Sep 17 00:00:00 2001 From: William Wernert Date: Thu, 3 Dec 2020 15:26:59 -0500 Subject: [PATCH 09/10] Only kill+start on final loop and increase time between status checks --- setup/so-functions | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index e17fa23ce..d6c309431 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1650,13 +1650,16 @@ salt_checkin() { count=0 while ! 
(check_service_status "$service"); do - systemctl kill "$service" - systemctl start "$service" - if [ $count -gt 120 ]; then + if [ $count -eq 12 ]; then + systemctl kill "$service" + systemctl start "$service" + fi + + if [ $count -gt 12 ]; then echo "$service could not be restarted in 120 seconds, exiting" >> "$setup_log" 2>&1 kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 fi - sleep 1; + sleep 10; ((count++)) done done From 3273a6366235bc7a7f26f88e4c55f1b6a1aaf7eb Mon Sep 17 00:00:00 2001 From: William Wernert Date: Thu, 3 Dec 2020 15:35:50 -0500 Subject: [PATCH 10/10] [fix] kill old restart pid and assign new pid for start --- setup/so-functions | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index d6c309431..67cbb7c24 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -1650,13 +1650,19 @@ salt_checkin() { count=0 while ! (check_service_status "$service"); do + # On final loop, kill the pid trying to restart service and try to manually kill then start it if [ $count -eq 12 ]; then - systemctl kill "$service" - systemctl start "$service" + { + kill -9 "$pid" + systemctl kill "$service" + systemctl start "$service" & + local pid=$! + } >> "$setup_log" 2>&1 fi if [ $count -gt 12 ]; then echo "$service could not be restarted in 120 seconds, exiting" >> "$setup_log" 2>&1 + kill -9 "$pid" kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 fi sleep 10;