From 0a807621ccae24cb1b61bd462918946b3bd3bbd9 Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Fri, 13 Nov 2020 16:02:28 -0500 Subject: [PATCH 01/12] check health of salt-minion https://github.com/Security-Onion-Solutions/securityonion/issues/1831 --- salt/common/tools/sbin/so-elastalert-test | 5 +- salt/common/tools/sbin/so-salt-minion-check | 107 ++++++++++++++++++++ salt/salt/minion-check.sls | 16 +++ salt/salt/minion-state-apply-test.sls | 3 + salt/salt/minion.defaults.yaml | 3 +- salt/salt/minion.sls | 2 +- salt/top.sls | 1 + 7 files changed, 131 insertions(+), 6 deletions(-) create mode 100644 salt/common/tools/sbin/so-salt-minion-check create mode 100644 salt/salt/minion-check.sls create mode 100644 salt/salt/minion-state-apply-test.sls diff --git a/salt/common/tools/sbin/so-elastalert-test b/salt/common/tools/sbin/so-elastalert-test index e72d928ed..ccb823168 100755 --- a/salt/common/tools/sbin/so-elastalert-test +++ b/salt/common/tools/sbin/so-elastalert-test @@ -136,7 +136,4 @@ else echo "Something went wrong..." fi -echo - - - +echo \ No newline at end of file diff --git a/salt/common/tools/sbin/so-salt-minion-check b/salt/common/tools/sbin/so-salt-minion-check new file mode 100644 index 000000000..240d3b908 --- /dev/null +++ b/salt/common/tools/sbin/so-salt-minion-check @@ -0,0 +1,107 @@ +{% import_yaml 'salt/minion.defaults.yaml' as SALT_MINION_DEFAULTS -%} + +#!/bin/bash +# +# Copyright 2014,2015,2016,2017,2018,2019,2020 Security Onion Solutions, LLC +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see https://www.gnu.org/licenses/. + +# this script checks the time the file /opt/so/log/salt/state-apply-test was last modified and restarts the salt-minion service if it is outside a threshold date/time +# the file is modified via file.touch using a scheduled job healthcheck.salt-minion.state-apply-test that runs a state.apply. +# by default the file should be updated every 5-8 minutes. +# this allows us to test that the minion is able to apply states and communicate with the master +# if the file is unable to be touched via the state.apply, then we assume there is a possibility that the minion is hung (though it could be possible the master is down as well) +# we then stop the service, pkill salt-minion, then start the salt-minion service back up + +. /usr/sbin/so-common + +QUIET=false +UPTIME_REQ=1800 #in seconds, how long the box has to be up before considering restarting salt-minion due to /opt/so/log/salt/state-apply-test not being touched +CURRENT_TIME=$(date +%s) +SYSTEM_START_TIME=$(date -d "$(> "/opt/so/log/salt/so-salt-minion-check.log" +} + +log() { + msg=$1 + level=${2:-I} + now=$(TZ=GMT date +"%Y-%m-%dT%H:%M:%SZ") + if ! 
$QUIET; then + echo $msg + fi + echo -e "$now | $level | so-salt-minion-check | $msg" >> "/opt/so/log/salt/so-salt-minion-check.log" 2>&1 +} + +error() { + log "$1" "E" +} + +info() { + log "$1" "I" +} + +usage() +{ +cat < Date: Fri, 13 Nov 2020 17:08:47 -0500 Subject: [PATCH 02/12] change var name --- salt/common/tools/sbin/so-salt-minion-check | 2 +- salt/salt/minion.defaults.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/salt/common/tools/sbin/so-salt-minion-check b/salt/common/tools/sbin/so-salt-minion-check index 240d3b908..a28fd1367 100644 --- a/salt/common/tools/sbin/so-salt-minion-check +++ b/salt/common/tools/sbin/so-salt-minion-check @@ -33,7 +33,7 @@ SYSTEM_START_TIME=$(date -d "$( Date: Fri, 13 Nov 2020 17:25:45 -0500 Subject: [PATCH 03/12] sbin --- salt/salt/minion-check.sls | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt/salt/minion-check.sls b/salt/salt/minion-check.sls index 103a36d49..747454ef3 100644 --- a/salt/salt/minion-check.sls +++ b/salt/salt/minion-check.sls @@ -9,7 +9,7 @@ state-apply-test: start: 0 end: 180 -/usr/bin/so-salt-minon-check -q: +/usr/sbin/so-salt-minon-check -q: cron.present: - identifier: so-salt-minion-check - user: root From a5823be0acd101100f372a23dec1aef1a8717063 Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Fri, 13 Nov 2020 17:55:19 -0500 Subject: [PATCH 04/12] fix typo --- salt/salt/minion-check.sls | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt/salt/minion-check.sls b/salt/salt/minion-check.sls index 747454ef3..d8dc4c418 100644 --- a/salt/salt/minion-check.sls +++ b/salt/salt/minion-check.sls @@ -9,7 +9,7 @@ state-apply-test: start: 0 end: 180 -/usr/sbin/so-salt-minon-check -q: +/usr/sbin/so-salt-minion-check -q: cron.present: - identifier: so-salt-minion-check - user: root From 71a409f21035dc58071d1ad34a3cb5b8a51b8c69 Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Fri, 13 Nov 2020 18:23:55 -0500 Subject: [PATCH 05/12] fix threshold logic 
https://github.com/Security-Onion-Solutions/securityonion/issues/1831 --- salt/common/tools/sbin/so-salt-minion-check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/salt/common/tools/sbin/so-salt-minion-check b/salt/common/tools/sbin/so-salt-minion-check index a28fd1367..0d69c7e96 100644 --- a/salt/common/tools/sbin/so-salt-minion-check +++ b/salt/common/tools/sbin/so-salt-minion-check @@ -34,7 +34,7 @@ LAST_HIGHSTATE_END=$([ -e "/opt/so/log/salt/lasthighstate" ] && date -r /opt/so/ LAST_HEALTHCHECK_STATE_APPLY=$([ -e "/opt/so/log/salt/state-apply-test" ] && date -r /opt/so/log/salt/state-apply-test +%s || echo 0) # SETTING THRESHOLD TO ANYTHING UNDER 600 seconds may cause a lot of salt-minion restarts THRESHOLD={{SALT_MINION_DEFAULTS.salt.minion.check_threshold}} #within how many seconds the file /opt/so/log/salt/state-apply-test must have been touched/modified before the salt minion is restarted -THRESHOLD_DATE=$((CURRENT_TIME-THRESHOLD)) +THRESHOLD_DATE=$((LAST_HEALTHCHECK_STATE_APPLY+THRESHOLD)) logCmd() { cmd=$1 @@ -87,7 +87,7 @@ log "running so-salt-minion-check" if [ $CURRENT_TIME -ge $((SYSTEM_START_TIME+$UPTIME_REQ)) ]; then log "system uptime is at least $UPTIME_REQ seconds" I - if [ $LAST_HEALTHCHECK_STATE_APPLY -le $THRESHOLD_DATE ]; then + if [ $THRESHOLD_DATE -le $CURRENT_TIME ]; then log "salt-minion is unable to apply states" E log "/opt/so/log/salt/healthcheck-state-apply ($LAST_HEALTHCHECK_STATE_APPLY) older than threshold date ($THRESHOLD_DATE)" I log "last highstate completed at $LAST_HIGHSTATE_END" I From e820c6fa422759495360a0202f24a3db128a0dbe Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Fri, 13 Nov 2020 19:04:09 -0500 Subject: [PATCH 06/12] logging changes issue/1831 --- salt/common/tools/sbin/so-salt-minion-check | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/salt/common/tools/sbin/so-salt-minion-check b/salt/common/tools/sbin/so-salt-minion-check index 0d69c7e96..d7b7a4d85 100644 --- 
a/salt/common/tools/sbin/so-salt-minion-check +++ b/salt/common/tools/sbin/so-salt-minion-check @@ -86,10 +86,9 @@ done log "running so-salt-minion-check" if [ $CURRENT_TIME -ge $((SYSTEM_START_TIME+$UPTIME_REQ)) ]; then - log "system uptime is at least $UPTIME_REQ seconds" I if [ $THRESHOLD_DATE -le $CURRENT_TIME ]; then log "salt-minion is unable to apply states" E - log "/opt/so/log/salt/healthcheck-state-apply ($LAST_HEALTHCHECK_STATE_APPLY) older than threshold date ($THRESHOLD_DATE)" I + log "/opt/so/log/salt/healthcheck-state-apply not touched by threshold date: `date -d @$THRESHOLD_DATE`, last touched: `date -d @$LAST_HEALTHCHECK_STATE_APPLY`" I log "last highstate completed at $LAST_HIGHSTATE_END" I log "checking if any jobs are running" I logCmd "salt-call --local saltutil.running" I @@ -100,7 +99,7 @@ if [ $CURRENT_TIME -ge $((SYSTEM_START_TIME+$UPTIME_REQ)) ]; then log "starting salt-minion service" I logCmd "systemctl start salt-minion" I else - log "/opt/so/log/salt/healthcheck-state-apply ($LAST_HEALTHCHECK_STATE_APPLY) newer than threshold date ($THRESHOLD_DATE)" I + log "/opt/so/log/salt/healthcheck-state-apply touched by threshold date: `date -d @$THRESHOLD_DATE`, last touched: `date -d @$LAST_HEALTHCHECK_STATE_APPLY`" I fi else log "system uptime only $((CURRENT_TIME-SYSTEM_START_TIME)) seconds does not meet $UPTIME_REQ second requirement." 
I From b210092534fc5527faf6acda03dc222a80548151 Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Fri, 13 Nov 2020 19:09:53 -0500 Subject: [PATCH 07/12] logging changes issue/1831 --- salt/common/tools/sbin/so-salt-minion-check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt/common/tools/sbin/so-salt-minion-check b/salt/common/tools/sbin/so-salt-minion-check index d7b7a4d85..74718e192 100644 --- a/salt/common/tools/sbin/so-salt-minion-check +++ b/salt/common/tools/sbin/so-salt-minion-check @@ -89,7 +89,7 @@ if [ $CURRENT_TIME -ge $((SYSTEM_START_TIME+$UPTIME_REQ)) ]; then if [ $THRESHOLD_DATE -le $CURRENT_TIME ]; then log "salt-minion is unable to apply states" E log "/opt/so/log/salt/healthcheck-state-apply not touched by threshold date: `date -d @$THRESHOLD_DATE`, last touched: `date -d @$LAST_HEALTHCHECK_STATE_APPLY`" I - log "last highstate completed at $LAST_HIGHSTATE_END" I + log "last highstate completed at `date -d @$LAST_HIGHSTATE_END`" I log "checking if any jobs are running" I logCmd "salt-call --local saltutil.running" I log "stopping salt-minion service" I From e958246457f241f4f586770d50c150e35ec7fc36 Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Fri, 13 Nov 2020 19:34:17 -0500 Subject: [PATCH 08/12] touch file at start of highstate, just kill salt dont systemctl stop it https://github.com/Security-Onion-Solutions/securityonion/issues/1831 --- salt/common/tools/sbin/so-salt-minion-check | 4 +--- salt/salt/minion-check.sls | 3 +++ salt/salt/minion-state-apply-test.sls | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/salt/common/tools/sbin/so-salt-minion-check b/salt/common/tools/sbin/so-salt-minion-check index 74718e192..84f72bb72 100644 --- a/salt/common/tools/sbin/so-salt-minion-check +++ b/salt/common/tools/sbin/so-salt-minion-check @@ -92,9 +92,7 @@ if [ $CURRENT_TIME -ge $((SYSTEM_START_TIME+$UPTIME_REQ)) ]; then log "last highstate completed at `date -d @$LAST_HIGHSTATE_END`" I log "checking if any 
jobs are running" I logCmd "salt-call --local saltutil.running" I - log "stopping salt-minion service" I - logCmd "timeout -k10 -s9 120 systemctl stop salt-minion" I - log "killing any leftover salt-minion processes" I + log "killing all salt-minion processes" I logCmd "pkill -9 -ef /usr/bin/salt-minion" I log "starting salt-minion service" I logCmd "systemctl start salt-minion" I diff --git a/salt/salt/minion-check.sls b/salt/salt/minion-check.sls index d8dc4c418..e8a0c2639 100644 --- a/salt/salt/minion-check.sls +++ b/salt/salt/minion-check.sls @@ -1,3 +1,6 @@ +include: + - salt.minion-state-apply-test + state-apply-test: schedule.present: - name: salt-minion-state-apply-test diff --git a/salt/salt/minion-state-apply-test.sls b/salt/salt/minion-state-apply-test.sls index 4da77419b..9d7e90e63 100644 --- a/salt/salt/minion-state-apply-test.sls +++ b/salt/salt/minion-state-apply-test.sls @@ -1,3 +1,4 @@ minion-state-apply-test: file.touch: - - name: /opt/so/log/salt/state-apply-test \ No newline at end of file + - name: /opt/so/log/salt/state-apply-test + - order: first \ No newline at end of file From 43a244e0da29d8d6dded25b4cb66716b3c0b4c3f Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Fri, 13 Nov 2020 19:37:03 -0500 Subject: [PATCH 09/12] change log path https://github.com/Security-Onion-Solutions/securityonion/issues/1831 --- salt/common/tools/sbin/so-salt-minion-check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/salt/common/tools/sbin/so-salt-minion-check b/salt/common/tools/sbin/so-salt-minion-check index 84f72bb72..d60dcf5c4 100644 --- a/salt/common/tools/sbin/so-salt-minion-check +++ b/salt/common/tools/sbin/so-salt-minion-check @@ -39,7 +39,7 @@ THRESHOLD_DATE=$((LAST_HEALTHCHECK_STATE_APPLY+THRESHOLD)) logCmd() { cmd=$1 info "Executing command: $cmd" - $cmd >> "/opt/so/log/salt/so-salt-minion-check.log" + $cmd >> "/opt/so/log/salt/so-salt-minion-check" } log() { @@ -49,7 +49,7 @@ log() { if ! 
$QUIET; then echo $msg fi - echo -e "$now | $level | so-salt-minion-check | $msg" >> "/opt/so/log/salt/so-salt-minion-check.log" 2>&1 + echo -e "$now | $level | so-salt-minion-check | $msg" >> "/opt/so/log/salt/so-salt-minion-check" 2>&1 } error() { From 4e6e29e7dca55e4d85628adfb6ecf5f46d987884 Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Fri, 13 Nov 2020 20:26:06 -0500 Subject: [PATCH 10/12] update logging --- salt/common/tools/sbin/so-salt-minion-check | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/salt/common/tools/sbin/so-salt-minion-check b/salt/common/tools/sbin/so-salt-minion-check index d60dcf5c4..25302802c 100644 --- a/salt/common/tools/sbin/so-salt-minion-check +++ b/salt/common/tools/sbin/so-salt-minion-check @@ -88,7 +88,7 @@ log "running so-salt-minion-check" if [ $CURRENT_TIME -ge $((SYSTEM_START_TIME+$UPTIME_REQ)) ]; then if [ $THRESHOLD_DATE -le $CURRENT_TIME ]; then log "salt-minion is unable to apply states" E - log "/opt/so/log/salt/healthcheck-state-apply not touched by threshold date: `date -d @$THRESHOLD_DATE`, last touched: `date -d @$LAST_HEALTHCHECK_STATE_APPLY`" I + log "/opt/so/log/salt/healthcheck-state-apply not touched by required date: `date -d @$THRESHOLD_DATE`, last touched: `date -d @$LAST_HEALTHCHECK_STATE_APPLY`" I log "last highstate completed at `date -d @$LAST_HIGHSTATE_END`" I log "checking if any jobs are running" I logCmd "salt-call --local saltutil.running" I @@ -97,7 +97,7 @@ if [ $CURRENT_TIME -ge $((SYSTEM_START_TIME+$UPTIME_REQ)) ]; then log "starting salt-minion service" I logCmd "systemctl start salt-minion" I else - log "/opt/so/log/salt/healthcheck-state-apply touched by threshold date: `date -d @$THRESHOLD_DATE`, last touched: `date -d @$LAST_HEALTHCHECK_STATE_APPLY`" I + log "/opt/so/log/salt/healthcheck-state-apply last touched: `date -d @$LAST_HEALTHCHECK_STATE_APPLY` must be touched by `date -d @$THRESHOLD_DATE` to avoid salt-minion restart" I fi else log "system uptime only 
$((CURRENT_TIME-SYSTEM_START_TIME)) seconds does not meet $UPTIME_REQ second requirement." I From 5c25dcf1923a1b8bbe44a0c106d12989bdc5533f Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Mon, 16 Nov 2020 09:50:10 -0500 Subject: [PATCH 11/12] add /opt/so/log/salt/so-salt-minion-check to log rotate https://github.com/Security-Onion-Solutions/securityonion/issues/1831 --- salt/common/files/log-rotate.conf | 1 + salt/common/tools/sbin/so-salt-minion-check | 4 ++-- salt/salt/minion.defaults.yaml | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/salt/common/files/log-rotate.conf b/salt/common/files/log-rotate.conf index d383981cd..8f1df0307 100644 --- a/salt/common/files/log-rotate.conf +++ b/salt/common/files/log-rotate.conf @@ -18,6 +18,7 @@ /opt/so/log/filebeat/*.log /opt/so/log/telegraf/*.log /opt/so/log/redis/*.log +/opt/so/log/salt/so-salt-minion-check { {{ logrotate_conf | indent(width=4) }} } diff --git a/salt/common/tools/sbin/so-salt-minion-check b/salt/common/tools/sbin/so-salt-minion-check index 25302802c..a8828b16e 100644 --- a/salt/common/tools/sbin/so-salt-minion-check +++ b/salt/common/tools/sbin/so-salt-minion-check @@ -32,7 +32,7 @@ CURRENT_TIME=$(date +%s) SYSTEM_START_TIME=$(date -d "$(> "/opt/so/log/salt/so-salt-minion-check" 2>&1 + echo -e "$now | $level | $msg" >> "/opt/so/log/salt/so-salt-minion-check" 2>&1 } error() { diff --git a/salt/salt/minion.defaults.yaml b/salt/salt/minion.defaults.yaml index 1513803d8..871babdeb 100644 --- a/salt/salt/minion.defaults.yaml +++ b/salt/salt/minion.defaults.yaml @@ -3,4 +3,4 @@ salt: minion: version: 3002.1 - check_threshold: 3600 # in seconds, threshold used for so-salt-minion-check. setting less that 600 cause cause a lot of salt-minion restarts \ No newline at end of file + check_threshold: 3600 # in seconds, threshold used for so-salt-minion-check. 
any value less than 600 seconds may cause a lot of salt-minion restarts since the job to touch the file occurs every 5-8 minutes by default \ No newline at end of file From cc50eba6cbb3302ec3630095142b304c83f16a71 Mon Sep 17 00:00:00 2001 From: m0duspwnens Date: Mon, 16 Nov 2020 10:01:40 -0500 Subject: [PATCH 12/12] make sure /opt/so/log/salt/so-salt-minion-check gets touched even if salt-minion version isn't correct https://github.com/Security-Onion-Solutions/securityonion/issues/1831 --- salt/top.sls | 1 + 1 file changed, 1 insertion(+) diff --git a/salt/top.sls b/salt/top.sls index 2cf2443d1..bbd2a862d 100644 --- a/salt/top.sls +++ b/salt/top.sls @@ -22,6 +22,7 @@ base: 'not G@saltversion:{{saltversion}}': - match: compound + - salt.minion-state-apply-test {% if ISAIRGAP is sameas true %} - airgap {% endif %}