add configurable realert threshold per agent

This commit is contained in:
reyesj2
2025-09-10 10:56:09 -05:00
parent 29980ea958
commit fbdc0c4705
3 changed files with 60 additions and 0 deletions

View File

@@ -11,5 +11,6 @@ manager:
critical_agents: [] critical_agents: []
custom_kquery: custom_kquery:
offline_threshold: 5 offline_threshold: 5
realert_threshold: 5
page_size: 250 page_size: 250
run_interval: 5 run_interval: 5

View File

@@ -61,6 +61,11 @@ manager:
global: True global: True
helpLink: elastic-fleet.html helpLink: elastic-fleet.html
forcedType: int forcedType: int
realert_threshold:
description: The time, in hours, that must pass before another alert is generated for an agent that remains offline beyond the offline_threshold.
global: True
helpLink: elastic-fleet.html
forcedType: int
page_size: page_size:
description: The amount of agents that can be processed per API request to fleet. description: The amount of agents that can be processed per API request to fleet.
global: True global: True

View File

@@ -2,6 +2,7 @@
{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%} {%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%}
{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%} {%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%}
{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%} {%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%}
{%- set REALERT_THRESHOLD = MANAGERMERGED.agent_monitoring.config.realert_threshold -%}
#!/bin/bash #!/bin/bash
set -euo pipefail set -euo pipefail
@@ -17,6 +18,7 @@ CRITICAL_AGENTS_FILE="/dev/null"
CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt" CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
{%- endif %} {%- endif %}
OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }} OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
REALERT_THRESHOLD={{ REALERT_THRESHOLD }}
PAGE_SIZE="{{ PAGE_SIZE }}" PAGE_SIZE="{{ PAGE_SIZE }}"
log_message() { log_message() {
@@ -71,6 +73,52 @@ calculate_offline_hours() {
echo $((diff / 3600)) echo $((diff / 3600))
} }
#######################################
# Decide whether an alert for an agent was already logged recently.
#
# Looks at the last 1000 lines of the log file for JSON objects whose
# "agent.hostname" equals the given hostname (compared case-insensitively),
# takes the "@timestamp" of the last such line, and compares its age in
# whole hours against REALERT_THRESHOLD.
#
# Globals:
#   LOG_FILE           (read) path of the JSON-lines alert log
#   REALERT_THRESHOLD  (read) re-alert window, in hours
# Arguments:
#   $1 - agent hostname to look up
# Returns:
#   0 if the last matching entry is younger than REALERT_THRESHOLD hours
#     (the caller should skip logging the agent again);
#   1 otherwise - including when the log file does not exist, no entry
#     matches, or the stored timestamp cannot be parsed by date.
#######################################
check_recent_log_entries() {
  local agent_hostname="$1"
  local most_recent_timestamp=""
  local logged_time
  local current_time
  local hours_diff

  if [ ! -f "$LOG_FILE" ]; then
    return 1
  fi

  # One jq process filters the whole tail instead of spawning jq/tr for every
  # line. Lines that are not JSON objects, or that lack either field, are
  # dropped; the final `tail -n 1` keeps the last match in file order.
  # `|| true` keeps a failing pipeline stage from aborting under pipefail.
  most_recent_timestamp=$(
    tail -n 1000 -- "$LOG_FILE" 2>/dev/null \
      | jq -rR --arg host "$agent_hostname" '
          fromjson?
          | objects
          | ((.["agent.hostname"] // empty) | tostring) as $h
          | ((.["@timestamp"] // empty) | tostring) as $t
          | select($h != "" and $t != "")
          | select(($h | ascii_downcase) == ($host | ascii_downcase))
          | $t
        ' 2>/dev/null \
      | tail -n 1
  ) || true

  # No entry for this agent within the inspected window.
  if [ -z "$most_recent_timestamp" ]; then
    return 1
  fi

  # An unparseable timestamp is treated like a missing entry.
  logged_time=$(date -d "$most_recent_timestamp" +%s 2>/dev/null) || logged_time=0
  if [ "$logged_time" -eq 0 ]; then
    return 1
  fi

  current_time=$(date +%s)
  hours_diff=$(((current_time - logged_time) / 3600))

  # Still inside the re-alert window: tell the caller to skip this agent.
  if ((hours_diff < REALERT_THRESHOLD)); then
    return 0
  fi

  # Last entry is at least REALERT_THRESHOLD hours old.
  return 1
}
main() { main() {
log_message "INFO" "Starting Fleet agent status check" log_message "INFO" "Starting Fleet agent status check"
@@ -150,6 +198,12 @@ main() {
continue continue
fi fi
# Check if this agent was already logged within the realert_threshold
if check_recent_log_entries "$agent_hostname"; then
log_message "INFO" "Skipping $agent_hostname (status: $agent_status) - already logged within last ${REALERT_THRESHOLD}h"
continue
fi
log_entry=$(echo 'null' | jq -c \ log_entry=$(echo 'null' | jq -c \
--arg ts "$current_timestamp" \ --arg ts "$current_timestamp" \
--arg id "$agent_id" \ --arg id "$agent_id" \