This commit is contained in:
m0duspwnens
2020-11-13 16:02:28 -05:00
parent 4ce0b770a5
commit 0a807621cc
7 changed files with 131 additions and 6 deletions

View File

@@ -136,7 +136,4 @@ else
echo "Something went wrong..." echo "Something went wrong..."
fi fi
echo echo

View File

@@ -0,0 +1,107 @@
{% import_yaml 'salt/minion.defaults.yaml' as SALT_MINION_DEFAULTS -%}
#!/bin/bash
#
# Copyright 2014,2015,2016,2017,2018,2019,2020 Security Onion Solutions, LLC
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# this script checks the time the file /opt/so/log/salt/state-apply-test was last modified and restarts the salt-minion service if it is outside a threshold date/time
# the file is modified via file.touch using a scheduled job healthcheck.salt-minion.state-apply-test that runs a state.apply.
# by default the file should be updated every 5-8 minutes.
# this allows us to test that the minion is able to apply states and communicate with the master
# if the file is unable to be touched via the state.apply, then we assume there is a possibility that the minion is hung (though it could be possible the master is down as well)
# we then stop the service, pkill salt-minion, then start the salt-minion service back up
# Pull in the shared Security Onion shell helpers.
. /usr/sbin/so-common

# When true, log() writes only to the log file and prints nothing to the terminal.
QUIET=false

# In seconds: how long the box has to be up before considering restarting
# salt-minion due to /opt/so/log/salt/state-apply-test not being touched.
UPTIME_REQ=1800

# Print the modification time of file $1 in epoch seconds.
# Prints 0 when the file does not exist (or when date cannot read it).
file_mtime_or_zero() {
  [ -e "$1" ] && date -r "$1" +%s || echo 0
}

CURRENT_TIME=$(date +%s)
# Boot time in epoch seconds, derived from the first field of /proc/uptime.
SYSTEM_START_TIME=$(date -d "$(</proc/uptime awk '{print $1}') seconds ago" +%s)
LAST_HIGHSTATE_END=$(file_mtime_or_zero /opt/so/log/salt/lasthighstate)
LAST_HEALTHCHECK_STATE_APPLY=$(file_mtime_or_zero /opt/so/log/salt/state-apply-test)

# SETTING THRESHOLD TO ANYTHING UNDER 600 seconds may cause a lot of salt-minion restarts
# Within how many seconds /opt/so/log/salt/state-apply-test must have been
# touched/modified; if it is older than that, the salt minion is restarted.
# The key contains a hyphen, so it must be read with subscript syntax: written as
# an attribute (.check-threshold) Jinja parses it as "check" minus "threshold".
THRESHOLD={{SALT_MINION_DEFAULTS.salt.minion['check-threshold']}}
THRESHOLD_DATE=$((CURRENT_TIME-THRESHOLD))
# Run a command and append its output to the so-salt-minion-check log.
# Arguments: $1 - the full command line as a single string
# Outputs:   the command's stdout and stderr are appended to
#            /opt/so/log/salt/so-salt-minion-check.log
# Returns:   the exit status of the executed command
logCmd() {
  local cmd=$1
  info "Executing command: $cmd"
  # Word splitting is intentional: $cmd holds the program and its arguments.
  # stderr is captured as well so failures of the command show up in the log.
  # shellcheck disable=SC2086
  $cmd >> "/opt/so/log/salt/so-salt-minion-check.log" 2>&1
}
# Record a message in the so-salt-minion-check log and, unless QUIET is true,
# print it to stdout as well.
# Globals:   QUIET (read)
# Arguments: $1 - message text
#            $2 - level tag written to the log line, defaults to "I"
# Outputs:   the message on stdout (when not quiet); a timestamped line appended
#            to /opt/so/log/salt/so-salt-minion-check.log
log() {
  local msg=$1
  local level=${2:-I}
  local now
  now=$(TZ=GMT date +"%Y-%m-%dT%H:%M:%SZ")
  if ! $QUIET; then
    # Quoted so whitespace and glob characters in the message print verbatim.
    printf '%s\n' "$msg"
  fi
  echo -e "$now | $level | so-salt-minion-check | $msg" >> "/opt/so/log/salt/so-salt-minion-check.log" 2>&1
}
# Log message $1 with the error level tag "E".
error() {
  local text=$1
  log "$text" "E"
}
# Log message $1 with the informational level tag "I".
info() {
  local text=$1
  log "$text" "I"
}
# Print the help text for this script on stdout.
usage() {
  printf '%s\n' \
    "Check health of salt-minion and restart it if needed" \
    "Options:" \
    "-h This message" \
    "-q Don't output to terminal"
}
# Parse command-line options.
#   -q  suppress terminal output (sets QUIET=true)
#   -h  print usage and exit successfully
# Any other option prints usage on stderr and exits with status 1 so that a
# mistyped invocation is not reported as a success.
while getopts ":hq" opt; do
  case "$opt" in
    q)
      QUIET=true
      ;;
    h)
      usage
      exit 0
      ;;
    *)
      usage >&2
      exit 1
      ;;
  esac
done
# Main health check.
# Once the system has been up for at least UPTIME_REQ seconds, compare the
# modification time of /opt/so/log/salt/state-apply-test against THRESHOLD_DATE.
# If the file is not newer than the threshold, the minion is treated as unable
# to apply states: running jobs are recorded, the service is stopped, leftover
# processes are killed, and the service is started again.
log "running so-salt-minion-check"
if [ "$CURRENT_TIME" -ge $((SYSTEM_START_TIME+UPTIME_REQ)) ]; then
  log "system uptime is at least $UPTIME_REQ seconds" I
  if [ "$LAST_HEALTHCHECK_STATE_APPLY" -le "$THRESHOLD_DATE" ]; then
    log "salt-minion is unable to apply states" E
    # The messages name the file that is actually checked (state-apply-test).
    log "/opt/so/log/salt/state-apply-test ($LAST_HEALTHCHECK_STATE_APPLY) older than threshold date ($THRESHOLD_DATE)" I
    log "last highstate completed at $LAST_HIGHSTATE_END" I
    log "checking if any jobs are running" I
    logCmd "salt-call --local saltutil.running"
    log "stopping salt-minion service" I
    # Give systemctl 120 seconds to stop the service, then SIGKILL it
    # (and again 10 seconds later if it is still around).
    logCmd "timeout -k10 -s9 120 systemctl stop salt-minion"
    log "killing any leftover salt-minion processes" I
    logCmd "pkill -9 -ef /usr/bin/salt-minion"
    log "starting salt-minion service" I
    logCmd "systemctl start salt-minion"
  else
    log "/opt/so/log/salt/state-apply-test ($LAST_HEALTHCHECK_STATE_APPLY) newer than threshold date ($THRESHOLD_DATE)" I
  fi
else
  log "system uptime only $((CURRENT_TIME-SYSTEM_START_TIME)) seconds does not meet $UPTIME_REQ second requirement." I
fi

View File

@@ -0,0 +1,16 @@
# Scheduled job named salt-minion-state-apply-test: runs state.sls with the
# salt.minion-state-apply-test state every 5 minutes, with a splay window of
# 0 to 180 (presumably seconds, which matches the "every 5-8 minutes" note in
# so-salt-minion-check).
state-apply-test:
schedule.present:
- name: salt-minion-state-apply-test
- function: state.sls
- job_args:
- salt.minion-state-apply-test
- minutes: 5
- splay:
start: 0
end: 180
# Cron entry for root that runs the salt-minion health check quietly every 5 minutes.
# NOTE(review): the check script sources /usr/sbin/so-common; confirm that
# so-salt-minion-check is installed under /usr/bin and not /usr/sbin.
/usr/bin/so-salt-minion-check -q:
  cron.present:
    - identifier: so-salt-minion-check
    - user: root
    - minute: '*/5'

View File

@@ -0,0 +1,3 @@
# Touch /opt/so/log/salt/state-apply-test each time this state is applied.
# so-salt-minion-check reads that file's modification time to decide whether
# the minion is still able to apply states.
minion-state-apply-test:
file.touch:
- name: /opt/so/log/salt/state-apply-test

View File

@@ -2,4 +2,5 @@
# When updating the salt version, also update the version in securityonion-builds/images/iso-task/Dockerfile and saltify function in so-functions # When updating the salt version, also update the version in securityonion-builds/images/iso-task/Dockerfile and saltify function in so-functions
salt: salt:
minion: minion:
version: 3002.1 version: 3002.1
check-threshold: 3600 # in seconds, threshold used for so-salt-minion-check. Setting less than 600 can cause a lot of salt-minion restarts

View File

@@ -23,4 +23,4 @@ salt_minion_package:
salt_minion_service: salt_minion_service:
service.running: service.running:
- name: salt-minion - name: salt-minion
- enable: True - enable: True

View File

@@ -42,6 +42,7 @@ base:
- common - common
- patch.os.schedule - patch.os.schedule
- motd - motd
- salt.minion-check
- salt.lasthighstate - salt.lasthighstate
'*_helix and G@saltversion:{{saltversion}}': '*_helix and G@saltversion:{{saltversion}}':