elastic agent offline alerter

Signed-off-by: reyesj2 <94730068+reyesj2@users.noreply.github.com>
This commit is contained in:
reyesj2
2025-09-02 17:00:03 -05:00
parent d9127a288f
commit e26310d172
7 changed files with 420 additions and 0 deletions

View File

@@ -0,0 +1,193 @@
#!/bin/bash
{% from 'manager/map.jinja' import MANAGERMERGED %}
{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold %}
{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size %}
# Elastic Agent offline monitor.
#
# Pages through the Kibana Fleet agents API and appends one JSON line per
# offline or degraded agent to a date-stamped log file under LOG_DIR.
#
# This file is a Jinja template: the two values imported above from
# MANAGERMERGED.agent_monitoring.config are substituted below when the
# template is rendered.

# Abort on unhandled command failures, on use of unset variables, and when
# any stage of a pipeline fails.
set -euo pipefail
# Directory holding the per-day JSON log files written by this script.
LOG_DIR="/opt/so/log/agents"
# One log file per UTC calendar day.
LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log"
# curl config file handed to curl via -K when calling the Fleet API.
# NOTE(review): presumably carries the API credentials - confirm.
CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
# Fleet agents listing endpoint queried by main.
FLEET_API="http://localhost:5601/api/fleet/agents"
# Optional hostname pattern list; when the file is missing or empty every
# agent is monitored.
CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
# Rendered from agent_monitoring.config.offline_threshold.
# NOTE(review): not referenced anywhere else in this script as shown;
# presumably consumed by something outside this file - confirm.
OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
# Number of agents requested per Fleet API page (perPage query parameter).
PAGE_SIZE="{{ PAGE_SIZE }}"
# Write one timestamped diagnostic line to stderr.
#
# Arguments:
#   $1 - severity label placed inside the square brackets (e.g. INFO, ERROR)
#   $2 - free-form message text
# Outputs:
#   "<UTC ISO-8601 timestamp> [<severity>] <message>" on stderr; nothing on stdout.
log_message() {
  local severity="$1" text="$2"
  printf '%s [%s] %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$severity" "$text" >&2
}
# Decide whether a hostname is covered by the critical-agent pattern file.
#
# Arguments:
#   $1 - agent hostname to test (compared case-insensitively)
#   $2 - path to the pattern file: one pattern per line; "*" matches any run
#        of characters (including none); every other character is literal.
#        Blank lines and lines whose first non-blank character is "#" are
#        ignored. Surrounding whitespace (including a trailing CR from
#        CRLF-edited files) is trimmed.
# Returns:
#   0 when the pattern file is missing or empty (no filter configured, so
#     every agent matches) or when any pattern matches the whole hostname;
#   1 otherwise.
matches_critical_pattern() {
  local hostname="$1"
  local pattern_file="$2"
  local hostname_lower pattern glob ch i

  # If critical agents file doesn't exist or is empty, match all
  if [ ! -f "$pattern_file" ] || [ ! -s "$pattern_file" ]; then
    return 0
  fi

  hostname_lower=$(printf '%s' "$hostname" | tr '[:upper:]' '[:lower:]')

  # The file is lowercased once up front instead of forking per pattern.
  while IFS= read -r pattern || [ -n "$pattern" ]; do
    # Trim with parameter expansion rather than xargs: xargs rejects
    # unbalanced quotes and eats backslashes.
    pattern="${pattern#"${pattern%%[![:space:]]*}"}"
    pattern="${pattern%"${pattern##*[![:space:]]}"}"

    # Skip blank (or whitespace-only) lines and comments.
    if [[ -z "$pattern" || "$pattern" == \#* ]]; then
      continue
    fi

    # Build a glob in which only "*" is special. Everything outside the
    # plain hostname alphabet is backslash-escaped so that characters such
    # as "." "?" "[" or "(" match themselves instead of acting as wildcards.
    glob=""
    for ((i = 0; i < ${#pattern}; i++)); do
      ch="${pattern:i:1}"
      case "$ch" in
        '*') glob+='*' ;;
        [a-z0-9_-]) glob+="$ch" ;;
        *) glob+="\\${ch}" ;;
      esac
    done

    # The right-hand side is intentionally unquoted so it is treated as a
    # pattern; the match is anchored to the whole hostname.
    # shellcheck disable=SC2053
    if [[ "$hostname_lower" == $glob ]]; then
      return 0
    fi
  done < <(tr '[:upper:]' '[:lower:]' < "$pattern_file")

  return 1
}
# Print the whole number of hours elapsed since an agent's last check-in.
#
# Arguments:
#   $1 - last check-in timestamp in any format accepted by `date -d`
#        (may be empty when the agent has never checked in)
# Outputs:
#   Integer hour count on stdout. Prints 0 when the timestamp is empty,
#   cannot be parsed, or lies in the future.
calculate_offline_hours() {
  local last_checkin="$1"
  local current_time checkin_time diff

  # GNU date treats an empty/blank -d argument as "today at 00:00", which
  # would report the hours since midnight for agents with no check-in at
  # all. Treat a missing timestamp as unknown instead.
  if [[ -z "${last_checkin//[[:space:]]/}" ]]; then
    echo "0"
    return
  fi

  current_time=$(date +%s)
  # Fall back to 0 when the timestamp cannot be parsed.
  checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null || echo "0")
  if [ "$checkin_time" -eq 0 ]; then
    echo "0"
    return
  fi

  diff=$((current_time - checkin_time))
  # Clock skew can put the check-in slightly in the future; never report a
  # negative duration.
  if [ "$diff" -lt 0 ]; then
    diff=0
  fi
  echo $((diff / 3600))
}
# Delete agent-monitor log files under LOG_DIR that find reports as
# older than 7 days (-mtime +7).
#
# Globals:
#   LOG_DIR (read)
# Outputs:
#   One INFO line via log_message with the number of files removed (only
#   when at least one was removed); one WARN line per file that could not
#   be removed.
# Returns:
#   0 - cleanup is best-effort and never aborts the monitoring run.
cleanup_old_logs() {
  local stale_file
  local deleted_count=0

  # NUL-delimited so paths containing spaces, quotes or newlines are passed
  # to rm intact (plain `find | xargs` splits on them). A missing LOG_DIR
  # simply yields no files.
  while IFS= read -r -d '' stale_file; do
    if rm -f -- "$stale_file"; then
      deleted_count=$((deleted_count + 1))
    else
      log_message "WARN" "Could not remove old log file: $stale_file"
    fi
  done < <(find "$LOG_DIR" -name "agent-monitor-*.log" -type f -mtime +7 -print0 2>/dev/null)

  if [ "$deleted_count" -gt 0 ]; then
    log_message "INFO" "Cleaned up $deleted_count old log files (>7 days)"
  fi
}
# Query the Fleet agents API page by page and append one JSON line to
# LOG_FILE for every offline or degraded agent that matches the critical
# agent patterns (or for all such agents when no pattern file is configured).
#
# Globals:
#   CRITICAL_AGENTS_FILE, CURL_CONFIG, FLEET_API, PAGE_SIZE, LOG_FILE (read)
# Outputs:
#   Progress and error messages on stderr via log_message; JSON lines
#   appended to LOG_FILE.
# Exits:
#   1 when a Fleet API request fails or its response cannot be parsed.
main() {
  local page=1
  local total_agents=0
  local processed_agents=0
  local current_timestamp
  local response_body page_info
  local current_total current_page agents_in_page
  local agent agent_hostname agent_status last_checkin offline_hours log_entry

  log_message "INFO" "Starting Fleet agent status check"

  # Check if critical agents file is configured
  if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
    log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
    log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')"
  else
    log_message "INFO" "No critical agents filter found, monitoring all agents"
  fi

  cleanup_old_logs

  log_message "INFO" "Querying Fleet API"

  # A single timestamp is shared by every entry written during this run.
  current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

  while true; do
    log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"

    if ! response_body=$(curl -K "$CURL_CONFIG" \
      -s --fail \
      "${FLEET_API}?perPage=${PAGE_SIZE}&page=${page}&showInactive=true" \
      -H 'kbn-xsrf: true' 2>/dev/null); then
      log_message "ERROR" "Failed to query Fleet API (page $page)"
      exit 1
    fi

    # Pagination info, extracted in one jq call. The agent array is read
    # from .items with a fallback to .list.
    # NOTE(review): Kibana documents .list as deprecated in favour of
    # .items - confirm against the deployed Kibana version.
    if ! page_info=$(jq -r \
      '"\(.total // 0) \(.page // 1) \((.items // .list // []) | length)"' \
      <<<"$response_body"); then
      log_message "ERROR" "Fleet API returned an unparsable response (page $page)"
      exit 1
    fi
    read -r current_total current_page agents_in_page <<<"$page_info"

    # The total is taken from the first page only.
    if [ "$page" -eq 1 ]; then
      total_agents="$current_total"
      log_message "INFO" "Found $total_agents total agents across all pages"
    fi

    log_message "INFO" "Processing page $current_page with $agents_in_page agents"

    # Only offline or degraded agents are logged (inactive agents are
    # skipped; Fleetserver agents can show multiple versions as 'inactive').
    # Filtering inside jq avoids spawning processes for healthy agents.
    # The loop runs in a pipeline subshell; nothing it assigns is needed
    # afterwards.
    printf '%s\n' "$response_body" \
      | jq -c '(.items // .list // [])[] | select(.status == "offline" or .status == "degraded")' \
      | while IFS= read -r agent; do
        # Fields needed by the shell-side checks and the log message.
        agent_hostname=$(jq -r '.local_metadata.host.hostname // "unknown"' <<<"$agent")
        agent_status=$(jq -r '.status // "unknown"' <<<"$agent")
        last_checkin=$(jq -r '.last_checkin // ""' <<<"$agent")

        # Skip this agent if it doesn't match any critical agent pattern
        # (a missing or empty pattern file matches everything).
        if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
          continue
        fi

        offline_hours=$(calculate_offline_hours "$last_checkin")

        # Build the entry straight from the agent document. tostring keeps
        # every agent.* field a JSON string; only the duration is numeric.
        log_entry=$(jq -c \
          --arg ts "$current_timestamp" \
          --arg offline_hours "$offline_hours" \
          '{
            "@timestamp": $ts,
            "agent.id": ((.id // "unknown") | tostring),
            "agent.hostname": ((.local_metadata.host.hostname // "unknown") | tostring),
            "agent.name": ((.local_metadata.host.name // "unknown") | tostring),
            "agent.status": ((.status // "unknown") | tostring),
            "agent.last_checkin": ((.last_checkin // "") | tostring),
            "agent.last_checkin_status": ((.last_checkin_status // "unknown") | tostring),
            "agent.policy_id": ((.policy_id // "unknown") | tostring),
            "agent.offline_duration_hours": ($offline_hours | tonumber)
          }' <<<"$agent")

        echo "$log_entry" >> "$LOG_FILE"

        log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
      done

    processed_agents=$((processed_agents + agents_in_page))

    # Stop on an empty page or once every reported agent has been seen.
    if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then
      log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents"
      break
    fi

    page=$((page + 1))

    # Limit pagination loops in case of any issues. If the agent count is
    # high enough, increase page_size in SOC manager.agent_monitoring.config.page_size
    if [ "$page" -gt 100 ]; then
      log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
      break
    fi
  done

  log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
}
# Script entry point: run the status check, forwarding any command-line
# arguments to main.
main "$@"