diff --git a/salt/manager/defaults.yaml b/salt/manager/defaults.yaml
index 237ac2999..239075f74 100644
--- a/salt/manager/defaults.yaml
+++ b/salt/manager/defaults.yaml
@@ -11,5 +11,6 @@ manager:
       critical_agents: []
       custom_kquery:
       offline_threshold: 5
+      realert_threshold: 5
       page_size: 250
       run_interval: 5
diff --git a/salt/manager/soc_manager.yaml b/salt/manager/soc_manager.yaml
index ac06ac2b4..f0d699f58 100644
--- a/salt/manager/soc_manager.yaml
+++ b/salt/manager/soc_manager.yaml
@@ -61,6 +61,11 @@ manager:
         global: True
         helpLink: elastic-fleet.html
         forcedType: int
+      realert_threshold:
+        description: The number of hours that must pass before another alert is generated for an agent that is still offline beyond the offline_threshold.
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: int
       page_size:
         description: The amount of agents that can be processed per API request to fleet.
         global: True
diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
index 3fa3221c2..f8c3162b4 100644
--- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
+++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
@@ -2,6 +2,7 @@
 {%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%}
 {%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%}
 {%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%}
+{%- set REALERT_THRESHOLD = MANAGERMERGED.agent_monitoring.config.realert_threshold -%}
 #!/bin/bash
 
 set -euo pipefail
@@ -17,6 +18,7 @@ CRITICAL_AGENTS_FILE="/dev/null"
 CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
 {%- endif %}
 OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
+REALERT_THRESHOLD={{ REALERT_THRESHOLD }}
 PAGE_SIZE="{{ PAGE_SIZE }}"
 
 log_message() {
@@ -71,6 +73,61 @@ calculate_offline_hours() {
     echo $((diff / 3600))
 }
 
+# Report whether an alert for an agent was already logged recently.
+#
+# Scans the last 1000 lines of $LOG_FILE for JSON entries whose
+# "agent.hostname" equals the given hostname (compared case-insensitively)
+# and checks the "@timestamp" of the last such entry.
+#
+# Arguments:
+#   $1 - hostname of the agent to look up
+# Returns:
+#   0 - the last matching entry is newer than REALERT_THRESHOLD hours
+#   1 - otherwise (threshold not positive, no log file, no matching entry,
+#       or a timestamp that date cannot parse)
+check_recent_log_entries() {
+    local agent_hostname="$1"
+
+    # A threshold of zero or less never suppresses an alert; skip the scan
+    if ((REALERT_THRESHOLD <= 0)) || [ ! -f "$LOG_FILE" ]; then
+        return 1
+    fi
+
+    local threshold_seconds=$((REALERT_THRESHOLD * 3600))
+    local current_time most_recent_timestamp logged_time
+    current_time=$(date +%s)
+
+    # A single jq pass replaces the per-line jq/tr subprocesses. With -R,
+    # "fromjson?" drops lines that are not valid JSON and "objects" drops
+    # JSON values that cannot hold the two fields read below.
+    most_recent_timestamp=$(
+        tail -n 1000 "$LOG_FILE" 2>/dev/null |
+            jq -rR --arg host "$agent_hostname" '
+                fromjson? | objects
+                | (.["agent.hostname"] // empty | tostring | ascii_downcase) as $logged
+                | select($logged != "" and $logged == ($host | ascii_downcase))
+                | .["@timestamp"] // empty | tostring | select(length > 0)
+            ' 2>/dev/null |
+            tail -n 1
+    ) || true
+
+    # No usable entry for this agent within the inspected tail
+    if [ -z "$most_recent_timestamp" ]; then
+        return 1
+    fi
+
+    # An unparsable timestamp is treated the same as no entry at all
+    logged_time=$(date -d "$most_recent_timestamp" +%s 2>/dev/null) || return 1
+
+    # Suppress the alert while the last entry is newer than the threshold
+    if ((current_time - logged_time < threshold_seconds)); then
+        return 0
+    fi
+
+    # Agent has not been logged within the realert threshold
+    return 1
+}
+
 main() {
     log_message "INFO" "Starting Fleet agent status check"
 
@@ -150,6 +207,12 @@ main() {
                 continue
             fi
 
+            # Check if this agent was already logged within the realert_threshold
+            if check_recent_log_entries "$agent_hostname"; then
+                log_message "INFO" "Skipping $agent_hostname (status: $agent_status) - already logged within last ${REALERT_THRESHOLD}h"
+                continue
+            fi
+
             log_entry=$(echo 'null' | jq -c \
                 --arg ts "$current_timestamp" \
                 --arg id "$agent_id" \