diff --git a/setup/so-functions b/setup/so-functions index 4821605f7..e8360c671 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -251,19 +251,19 @@ check_pass_match() { fi } +# False if stopped, true if running check_service_status() { local service_name=$1 echo "Checking service $service_name status" >> "$setup_log" 2>&1 systemctl status $service_name > /dev/null 2>&1 local status=$? - #true if there is an issue with the service false if it is running properly if [ $status -gt 0 ]; then - echo "$service_name is not running" >> "$setup_log" 2>&1 - echo 1; + echo " $service_name is not running" >> "$setup_log" 2>&1 + return 1; else - echo "$service_name is running" >> "$setup_log" 2>&1 - echo 0; + echo " $service_name is running" >> "$setup_log" 2>&1 + return 0; fi } @@ -273,28 +273,27 @@ check_salt_master_status() { salt-call saltutil.kill_all_jobs > /dev/null 2>&1 salt-call state.show_top > /dev/null 2>&1 local status=$? - #true if there is an issue talking to salt master if [ $status -gt 0 ]; then - echo 1; + echo " Could not talk to salt master" >> "$setup_log" 2>&1 + return 1; else - echo "Can talk to salt master" >> "$setup_log" 2>&1 - echo 0; + echo " Can talk to salt master" >> "$setup_log" 2>&1 + return 0; fi } check_salt_minion_status() { echo "Checking if the salt minion will respond to jobs" >> "$setup_log" 2>&1 - salt "$MINION_ID" test.ping >> "$setup_log" 2>&1 + salt "$MINION_ID" test.ping > /dev/null 2>&1 local status=$? - #true if there is an issue getting a job response from the minion if [ $status -gt 0 ]; then - echo 1; + echo " Minion did not respond" >> "$setup_log" 2>&1 + return 1; else - echo "Received job response from salt minion" >> "$setup_log" 2>&1 - echo 0; + echo " Received job response from salt minion" >> "$setup_log" 2>&1 + return 0; fi - } check_soremote_pass() { @@ -767,12 +766,12 @@ detect_os() { disable_auto_start() { - if crontab -l -u $INSTALLUSERNAME 2>&1 | grep so-setup > /dev/null 2>&1; then + if crontab -l -u $INSTALLUSERNAME 2>&1 | grep -q so-setup; then # Remove the automated setup script from crontab, if it exists logCmd "crontab -u $INSTALLUSERNAME -r" fi - if grep so-setup /home/$INSTALLUSERNAME/.bash_profile > /dev/null 2>&1; then + if grep -q so-setup /home/$INSTALLUSERNAME/.bash_profile; then # Truncate last line of the bash profile info "Removing auto-run of setup from bash profile" sed -i '$ d' /home/$INSTALLUSERNAME/.bash_profile >> "$setup_log" 2>&1 @@ -1384,13 +1383,47 @@ reserve_group_ids() { reinstall_init() { info "Putting system in state to run setup again" + + if [[ $install_type =~ ^(MANAGER|EVAL|HELIXSENSOR|MANAGERSEARCH|STANDALONE|FLEET|IMPORT)$ ]]; then + local salt_services=( "salt-master" "salt-minion" ) + else + local salt_services=( "salt-minion" ) + fi + + local service_retry_count=20 { - # Kill any salt processes - pkill -9 -ef /usr/bin/salt + if command -v salt-call &> /dev/null; then + # Disable schedule so highstate doesn't start running during the install + salt-call -l info schedule.disable + + # Kill any currently running salt jobs, also to prevent issues with highstate. + salt-call -l info saltutil.kill_all_jobs + fi + + # Kill any salt processes (safely) + for service in "${salt_services[@]}"; do + # Stop the service in the background so we can exit after a certain amount of time + systemctl stop "$service" & + local pid=$! + + local count=0 + while check_service_status "$service"; do + if [[ $count -gt $service_retry_count ]]; then + echo "Could not stop $service after 1 minute, exiting setup." + + # Stop the systemctl process trying to kill the service, show user a message, then exit setup + kill -9 $pid + kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 + fi + + sleep 5 + ((count++)) + done + done # Remove all salt configs - rm -rf /etc/salt/global /etc/salt/minion /etc/salt/master /etc/salt/pki/* + rm -rf /etc/salt/grains /etc/salt/minion /etc/salt/pki/* if command -v docker &> /dev/null; then # Stop and remove all so-* containers so files can be changed with more safety @@ -1411,7 +1444,7 @@ reinstall_init() { # Remove the old launcher package in case the config changes remove_package launcher-final - } >> $setup_log 2>&1 + } >> "$setup_log" 2>&1 } backup_dir() { @@ -1608,61 +1641,59 @@ salt_checkin() { "salt-master" \ "salt-minion" ) - local LOOP_COUNT=0 + local count=0 + for service in "${SALT_SERVICES[@]}"; do - echo "Stopping service $service" >> "$setup_log" 2>&1 - systemctl stop "$service" >> "$setup_log" 2>&1 - LOOP_COUNT=0 - while ! (( $(check_service_status $service) )); do - echo "$service still running" >> "$setup_log" 2>&1 - if [ $LOOP_COUNT -gt 60 ]; then - echo "$service could not be stopped in 60 seconds, exiting" >> "$setup_log" 2>&1 - exit 1 + { + echo "Restarting service $service" + systemctl restart "$service" & + local pid=$! + } >> "$setup_log" 2>&1 + + count=0 + while ! (check_service_status "$service"); do + # On final loop, kill the pid trying to restart service and try to manually kill then start it + if [ $count -eq 12 ]; then + { + kill -9 "$pid" + systemctl kill "$service" + systemctl start "$service" & + local pid=$! + } >> "$setup_log" 2>&1 fi - sleep 1; - ((LOOP_COUNT+=1)) + + if [ $count -gt 12 ]; then + echo "$service could not be restarted in 120 seconds, exiting" >> "$setup_log" 2>&1 + kill -9 "$pid" + kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 + fi + sleep 10; + ((count++)) done done - sleep 5; - - for service in "${SALT_SERVICES[@]}"; do - echo "Starting service $service" >> "$setup_log" 2>&1 - systemctl start "$service" >> "$setup_log" 2>&1 - LOOP_COUNT=0 - while (( $(check_service_status $service) )); do - echo "$service still not running" >> "$setup_log" 2>&1 - if [ $LOOP_COUNT -gt 60 ]; then - echo "$service could not be started in 60 seconds, exiting" >> "$setup_log" 2>&1 - exit 1 - fi - sleep 1; - ((LOOP_COUNT+=1)) - done - done - - sleep 5; - - LOOP_COUNT=0 - while (( $(check_salt_master_status) )); do + count=0 + while ! (check_salt_master_status); do echo "salt minion cannot talk to salt master" >> "$setup_log" 2>&1 - if [ $LOOP_COUNT -gt 30 ]; then + if [ $count -gt 30 ]; then echo "salt minion could not talk to salt master after 30 attempts, exiting" >> "$setup_log" 2>&1 - exit 1 + kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 fi sleep 1; - ((LOOP_COUNT+=1)) + ((count++)) done - LOOP_COUNT=0 - while (( $(check_salt_minion_status) )); do + count=0 + while ! (check_salt_minion_status); do echo "salt master did not get a job response from salt minion" >> "$setup_log" 2>&1 - if [ $LOOP_COUNT -gt 30 ]; then + if [ $count -gt 30 ]; then echo "salt master did not get a job response from salt minion after 30 attempts, exiting" >> "$setup_log" 2>&1 - exit 1 + kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1 fi + systemctl kill salt-minion + systemctl start salt-minion sleep 1; - ((LOOP_COUNT+=1)) + ((count++)) done echo " Confirming existence of the CA certificate" diff --git a/setup/so-setup b/setup/so-setup index 77c579cfc..1ea238a38 100755 --- a/setup/so-setup +++ b/setup/so-setup @@ -428,72 +428,86 @@ whiptail_make_changes # From here on changes will be made. echo "1" > /root/accept_changes -if [[ $is_reinstall ]]; then - reinstall_init -fi +# Set up handler for setup to exit early (use `kill -SIGUSR1 "$(ps --pid $$ -oppid=)"; exit 1` in child scripts) +trap 'catch $LINENO' SIGUSR1 -if [[ -n "$TURBO" ]]; then - use_turbo_proxy -fi - -if [[ "$setup_type" == 'iso' ]]; then - # Init networking so rest of install works - set_hostname - set_management_interface -fi - -disable_ipv6 -disable_auto_start - -if [[ "$setup_type" != 'iso' ]]; then - set_hostname -fi - -if [[ $is_minion ]]; then - add_mngr_ip_to_hosts -fi - -{ - mark_version; - clear_manager; -} >> $setup_log 2>&1 - - -if [[ $is_manager || $is_import ]]; then - { - generate_passwords; - secrets_pillar; - add_socore_user_manager; - } >> $setup_log 2>&1 -fi - -if [[ $is_manager && ! $is_eval ]]; then - add_soremote_user_manager >> $setup_log 2>&1 -fi +catch() { + info "Fatal error occurred at $1 in so-setup, failing setup." + whiptail_setup_failed + exit +} +# This block sets REDIRECTIT which is used by a function outside the below subshell { set_main_ip; set_redirect; } >> $setup_log 2>&1 -host_pillar >> $setup_log 2>&1 - -if [[ $is_minion || $is_import ]]; then - set_updates >> $setup_log 2>&1 - [ "$automated" == no ] && copy_ssh_key >> $setup_log 2>&1 -fi - # Begin install { # Set initial percentage to 0 export percentage=0 + + # Show initial progress message + set_progress_str 0 'Running initial configuration steps' + set_path - if [[ $is_manager && $is_airgap ]]; then - info "Creating airgap repo" - create_repo >> $setup_log 2>&1 + if [[ $is_reinstall ]]; then + reinstall_init + fi + + if [[ -n "$TURBO" ]]; then + use_turbo_proxy + fi + + if [[ "$setup_type" == 'iso' ]]; then + # Init networking so rest of install works + set_hostname >> $setup_log 2>&1 + set_management_interface + fi + + disable_ipv6 + disable_auto_start + + if [[ "$setup_type" != 'iso' ]]; then + set_hostname >> $setup_log 2>&1 + fi + + if [[ $is_minion ]]; then + add_mngr_ip_to_hosts + fi + + { + mark_version; + clear_manager; + } >> $setup_log 2>&1 + + + if [[ $is_manager || $is_import ]]; then + { + generate_passwords; + secrets_pillar; + add_socore_user_manager; + } >> $setup_log 2>&1 + fi + + if [[ $is_manager && ! $is_eval ]]; then + add_soremote_user_manager >> $setup_log 2>&1 + fi + + host_pillar >> $setup_log 2>&1 + + if [[ $is_minion || $is_import ]]; then + set_updates >> $setup_log 2>&1 + [ "$automated" == no ] && copy_ssh_key >> $setup_log 2>&1 + fi + + if [[ $is_manager && $is_airgap ]]; then + info "Creating airgap repo" + create_repo >> $setup_log 2>&1 airgap_rules >> $setup_log 2>&1 - fi + fi if [[ $is_minion ]]; then set_progress_str 1 'Configuring firewall' @@ -583,7 +597,7 @@ fi if [[ $is_minion ]]; then set_progress_str 22 'Checking if the Salt Minion needs to be updated' - salt-call state.apply salt.minion -l info >> $setup_log 2>&1 + salt-call state.apply -l info salt.minion >> $setup_log 2>&1 fi set_progress_str 23 'Generating CA and checking in'