elastic agent offline alerter

Signed-off-by: reyesj2 <94730068+reyesj2@users.noreply.github.com>
This commit is contained in:
reyesj2
2025-09-02 17:00:03 -05:00
parent d9127a288f
commit e26310d172
7 changed files with 420 additions and 0 deletions

View File

@@ -0,0 +1,48 @@
{
"package": {
"name": "filestream",
"version": ""
},
"name": "agent-monitor",
"namespace": "",
"description": "",
"policy_ids": [
"so-grid-nodes_general"
],
"output_id": null,
"vars": {},
"inputs": {
"filestream-filestream": {
"enabled": true,
"streams": {
"filestream.generic": {
"enabled": true,
"vars": {
"paths": [
"/opt/so/log/agents/agent-monitor-*.log"
],
"data_stream.dataset": "agent-monitor",
"pipeline": "elasticagent.monitor",
"parsers": "",
"exclude_files": [
"\\.gz$"
],
"include_files": [],
"processors": "- decode_json_fields:\n fields: [\"message\"]\n target: \"\"\n- add_fields:\n target: event\n fields:\n module: gridmetrics",
"tags": [],
"recursive_glob": true,
"ignore_older": "72h",
"clean_inactive": -1,
"harvester_limit": 0,
"fingerprint": true,
"fingerprint_offset": 0,
"fingerprint_length": 1024,
"file_identity_native": false,
"exclude_lines": [],
"include_lines": []
}
}
}
}
}
}

View File

@@ -1243,6 +1243,70 @@ elasticsearch:
set_priority:
priority: 50
min_age: 30d
# Index template and lifecycle policy for the logs-agent-monitor-* data stream.
so-logs-agent-monitor:
# NOTE(review): a sort block is still defined under settings below while this is false;
# presumably the sort only takes effect when index_sorting is enabled. Confirm.
index_sorting: false
index_template:
composed_of:
- event-mappings
- so-elastic-agent-monitor
- so-fleet_integrations.ip_mappings-1
- so-fleet_globals-1
- so-fleet_agent_id_verification-1
data_stream:
allow_custom_routing: false
hidden: false
# NOTE(review): logs-agent-monitor@custom is not listed in composed_of above, so this
# entry currently has nothing to ignore. Confirm whether the custom template should be added there.
ignore_missing_component_templates:
- logs-agent-monitor@custom
index_patterns:
- logs-agent-monitor-*
priority: 501
template:
mappings:
_meta:
managed: true
managed_by: security_onion
package:
name: elastic_agent
settings:
index:
lifecycle:
# Name of the lifecycle policy attached to indices created from this template.
name: so-logs-agent-monitor-logs
mapping:
total_fields:
limit: 5000
number_of_replicas: 0
sort:
field: '@timestamp'
order: desc
# Lifecycle phases: hot from 0ms (rollover at 30d or 50gb primary shard size),
# warm at 30d, cold at 60d, delete at 365d.
policy:
_meta:
managed: true
managed_by: security_onion
package:
name: elastic_agent
phases:
cold:
actions:
set_priority:
priority: 0
min_age: 60d
delete:
actions:
delete: {}
min_age: 365d
hot:
actions:
rollover:
max_age: 30d
max_primary_shard_size: 50gb
set_priority:
priority: 100
min_age: 0ms
warm:
actions:
set_priority:
priority: 50
min_age: 30d
so-logs-elastic_agent_x_apm_server:
index_sorting: false
index_template:

View File

@@ -0,0 +1,43 @@
{
"template": {
"mappings": {
"properties": {
"agent": {
"type": "object",
"properties": {
"hostname": {
"ignore_above": 1024,
"type": "keyword"
},
"id": {
"ignore_above": 1024,
"type": "keyword"
},
"last_checkin_status": {
"ignore_above": 1024,
"type": "keyword"
},
"last_checkin": {
"type": "date"
},
"name": {
"ignore_above": 1024,
"type": "keyword"
},
"offline_duration_hours": {
"type": "integer"
},
"policy_id": {
"ignore_above": 1024,
"type": "keyword"
},
"status": {
"ignore_above": 1024,
"type": "keyword"
}
}
}
}
}
}
}

View File

@@ -5,3 +5,10 @@ manager:
minute: 0
additionalCA: ''
insecureSkipVerify: False
# Defaults for the Elastic Agent offline monitor; disabled unless enabled is set.
agent_monitoring:
enabled: False
config:
# Hostname patterns of 'critical' agents; when empty, all agents are in scope.
critical_agents: []
# Hours an agent may be offline before it is logged.
offline_threshold: 5
# Agents requested per Fleet API page.
page_size: 250
# Minutes between monitor runs (used as the cron minute step).
run_interval: 5

View File

@@ -34,6 +34,26 @@ agents_log_dir:
- user
- group
# Directory that holds agent monitoring configuration (critical-agents.txt is written here);
# ownership is set to root:root recursively.
agents_conf_dir:
file.directory:
- name: /opt/so/conf/agents
- user: root
- group: root
- recurse:
- user
- group
# Manage the critical-agent pattern file: write the configured patterns when any exist,
# otherwise remove the file so the monitor script treats every agent as in scope.
{% if MANAGERMERGED.agent_monitoring.config.critical_agents | length > 0 %}
critical_agents_patterns:
file.managed:
- name: /opt/so/conf/agents/critical-agents.txt
# Render the list as JSON rather than a Python repr so the YAML parser receives
# properly quoted strings whatever characters the patterns contain.
- contents: {{ MANAGERMERGED.agent_monitoring.config.critical_agents | tojson }}
{% else %}
remove_critical_agents_config:
file.absent:
- name: /opt/so/conf/agents/critical-agents.txt
{% endif %}
yara_log_dir:
file.directory:
- name: /opt/so/log/yarasync
@@ -127,6 +147,21 @@ so_fleetagent_status:
- month: '*'
- dayweek: '*'
# Cron entry for the agent monitor script: present when agent_monitoring.enabled is true,
# removed otherwise. The same identifier is used in both cases so the entry can be found again.
so_fleetagent_monitor:
{% if MANAGERMERGED.agent_monitoring.enabled %}
cron.present:
{% else %}
cron.absent:
{% endif %}
- name: /usr/sbin/so-elastic-agent-monitor
- identifier: so_fleetagent_monitor
- user: root
# Step value comes from run_interval (minutes between runs).
- minute: '*/{{ MANAGERMERGED.agent_monitoring.config.run_interval }}'
- hour: '*'
- daymonth: '*'
- month: '*'
- dayweek: '*'
socore_own_saltstack_default:
file.directory:
- name: /opt/so/saltstack/default

View File

@@ -37,3 +37,33 @@ manager:
forcedType: bool
global: True
helpLink: proxy.html
# Setting annotations (description, type, help link) for manager.agent_monitoring.
# NOTE(review): presumably consumed by the SOC configuration UI; confirm.
agent_monitoring:
enabled:
description: Enable monitoring elastic agents for health issues. Can be used to trigger an alert when a 'critical' agent hasn't checked in with fleet for longer than the configured offline threshold.
global: True
helpLink: elastic-fleet.html
forcedType: bool
config:
# Entered as a multiline list of strings.
critical_agents:
description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold
global: True
multiline: True
helpLink: elastic-fleet.html
forcedType: "[]string"
offline_threshold:
description: The maximum allowed time in hours a 'critical' agent has been offline before being logged.
global: True
helpLink: elastic-fleet.html
forcedType: int
# page_size and run_interval are flagged as advanced settings.
page_size:
description: The amount of agents that can be processed per API request to fleet.
global: True
helpLink: elastic-fleet.html
forcedType: int
advanced: True
run_interval:
description: The time in minutes between checking fleet agent statuses.
global: True
advanced: True
helpLink: elastic-fleet.html
forcedType: int

View File

@@ -0,0 +1,193 @@
#!/bin/bash
# Fleet agent monitor (Jinja-rendered): queries the Fleet agents API and appends one JSON
# line per offline or degraded agent to a per-day log file under /opt/so/log/agents.
{% from 'manager/map.jinja' import MANAGERMERGED %}
{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold %}
{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size %}
# Abort on command failure, on use of an unset variable, and on a failure anywhere in a pipeline.
set -euo pipefail
LOG_DIR="/opt/so/log/agents"
# One log file per UTC calendar day.
LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log"
# Passed to curl with -K when calling the Fleet API.
CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
FLEET_API="http://localhost:5601/api/fleet/agents"
# Optional hostname pattern list; when missing or empty, all agents are considered.
CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
# Rendered from manager.agent_monitoring.config.offline_threshold (hours).
OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
# Rendered from manager.agent_monitoring.config.page_size (agents per API page).
PAGE_SIZE="{{ PAGE_SIZE }}"
# Write a single log line to stderr in the form "<UTC ISO-8601 time> [LEVEL] text".
#   $1 - severity label (for example INFO or ERROR)
#   $2 - message text
log_message() {
    local severity="$1" text="$2"
    printf '%s [%s] %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$severity" "$text" >&2
}
# Decide whether a hostname is covered by the critical-agents pattern file.
#   $1 - agent hostname (compared case-insensitively)
#   $2 - path to the pattern file: one pattern per line, where '*' matches any run of
#        characters and every other character is literal; blank lines and lines whose
#        first non-blank character is '#' are ignored
# Returns 0 when the hostname matches a pattern, or when the file is missing or empty
# (no filter configured means every agent is in scope); returns 1 otherwise.
matches_critical_pattern() {
    local hostname="$1"
    local pattern_file="$2"

    # If critical agents file doesn't exist or is empty, match all
    if [ ! -f "$pattern_file" ] || [ ! -s "$pattern_file" ]; then
        return 0
    fi

    local hostname_lower="${hostname,,}"
    local pattern regex rest ch

    while IFS= read -r pattern || [ -n "$pattern" ]; do
        # Trim leading/trailing whitespace in-shell; piping through xargs would also
        # strip quotes and backslashes from the pattern.
        pattern="${pattern#"${pattern%%[![:space:]]*}"}"
        pattern="${pattern%"${pattern##*[![:space:]]}"}"

        # Skip blank lines and comments
        [[ -z "$pattern" || "$pattern" == \#* ]] && continue

        # Build an anchored regex one character at a time: '*' becomes '.*' and regex
        # metacharacters are escaped so that, e.g., the dots in an FQDN only match dots.
        # The string is consumed from the front instead of indexed by length because the
        # brace-hash length expansion would be parsed as a Jinja comment opener when
        # this template is rendered.
        regex=""
        rest="${pattern,,}"
        while [ -n "$rest" ]; do
            ch="${rest:0:1}"
            rest="${rest:1}"
            case "$ch" in
                '*') regex+='.*' ;;
                '.'|'^'|'$'|'+'|'?'|'('|')'|'['|'{'|'|'|'\') regex+="\\${ch}" ;;
                *) regex+="$ch" ;;
            esac
        done

        if [[ "$hostname_lower" =~ ^${regex}$ ]]; then
            return 0
        fi
    done < "$pattern_file"

    return 1
}
# Convert a last check-in timestamp into whole hours elapsed since then.
#   $1 - timestamp string in a format GNU `date -d` can parse
# Prints the elapsed hours on stdout. Prints 0 when the timestamp is empty,
# cannot be parsed, or lies in the future.
calculate_offline_hours() {
    local last_checkin="$1"

    # GNU date accepts an empty -d string and returns the start of today, so an
    # agent without a check-in time must be handled before date is called.
    if [ -z "${last_checkin//[[:space:]]/}" ]; then
        echo "0"
        return
    fi

    local checkin_time
    if ! checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null); then
        echo "0"
        return
    fi

    local current_time
    current_time=$(date +%s)

    local diff=$((current_time - checkin_time))
    # A check-in time ahead of the local clock would otherwise give negative hours.
    if [ "$diff" -lt 0 ]; then
        diff=0
    fi
    echo $((diff / 3600))
}
# Prune old monitor logs: delete agent-monitor-*.log files under LOG_DIR selected by
# `find -mtime +7`, then log how many were removed. Silent when nothing qualifies.
cleanup_old_logs() {
    local stale_logs=$(find "$LOG_DIR" -name "agent-monitor-*.log" -type f -mtime +7 2>/dev/null)

    # Nothing to prune
    [ -z "$stale_logs" ] && return 0

    local removed=$(wc -l <<< "$stale_logs")
    xargs rm -f <<< "$stale_logs"
    log_message "INFO" "Cleaned up $removed old log files (>7 days)"
}
# Entry point: pages through the Fleet agents API and appends one JSON document per
# reportable agent to $LOG_FILE.
#
# An agent is reported when all of the following hold:
#   - its status is "offline" or "degraded" (inactive agents are skipped)
#   - its hostname passes matches_critical_pattern (every hostname passes when no
#     pattern file is configured)
#   - if it is offline and has a last check-in time, it has been offline for at least
#     OFFLINE_THRESHOLD_HOURS; degraded agents and agents without a check-in time
#     are reported regardless, because the threshold is defined as time offline
#
# Exits 1 when a Fleet API request fails. Progress is reported through log_message.
main() {
    log_message "INFO" "Starting Fleet agent status check"

    # Check if critical agents file is configured
    if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
        log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
        log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')"
    else
        log_message "INFO" "No critical agents filter found, monitoring all agents"
    fi

    cleanup_old_logs

    log_message "INFO" "Querying Fleet API"

    local page=1
    local total_agents=0
    local processed_agents=0
    # A single timestamp is shared by every entry written during this run.
    local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

    while true; do
        log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"

        if ! response_body=$(curl -K "$CURL_CONFIG" \
            -s --fail \
            "${FLEET_API}?perPage=${PAGE_SIZE}&page=${page}&showInactive=true" \
            -H 'kbn-xsrf: true' 2>/dev/null); then
            log_message "ERROR" "Failed to query Fleet API (page $page)"
            exit 1
        fi

        # pagination info
        current_total=$(echo "$response_body" | jq -r '.total // 0')
        current_page=$(echo "$response_body" | jq -r '.page // 1')
        agents_in_page=$(echo "$response_body" | jq -r '.list | length')

        # Update total
        if [ "$page" -eq 1 ]; then
            total_agents="$current_total"
            log_message "INFO" "Found $total_agents total agents across all pages"
        fi

        log_message "INFO" "Processing page $current_page with $agents_in_page agents"

        # Process agents from current page. The loop body runs in a pipeline subshell,
        # so it only appends to the log file and does not update counters.
        echo "$response_body" | jq -c '.list[]' | while IFS= read -r agent; do
            # Grab agent details
            agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
            agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
            agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
            agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
            last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
            last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
            policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')

            # Only log agents that are offline or degraded (skip inactive agents)
            # Fleetserver agents can show multiple versions as 'inactive'
            if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
                # Check if agent matches critical agent patterns (if configured)
                if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
                    continue  # Skip this agent if it doesn't match any critical agent pattern
                fi

                offline_hours=$(calculate_offline_hours "$last_checkin")

                # Apply the configured offline threshold so an agent that has only just
                # dropped offline is not reported yet. Without a check-in time the
                # offline duration is unknown, so such agents are still reported.
                if [ "$agent_status" = "offline" ] && [ -n "$last_checkin" ] \
                    && [ "$offline_hours" -lt "$OFFLINE_THRESHOLD_HOURS" ]; then
                    continue
                fi

                log_entry=$(jq -c \
                    --arg ts "$current_timestamp" \
                    --arg id "$agent_id" \
                    --arg hostname "$agent_hostname" \
                    --arg name "$agent_name" \
                    --arg status "$agent_status" \
                    --arg last_checkin "$last_checkin" \
                    --arg last_checkin_status "$last_checkin_status" \
                    --arg policy_id "$policy_id" \
                    --arg offline_hours "$offline_hours" \
                    '{
                        "@timestamp": $ts,
                        "agent.id": $id,
                        "agent.hostname": $hostname,
                        "agent.name": $name,
                        "agent.status": $status,
                        "agent.last_checkin": $last_checkin,
                        "agent.last_checkin_status": $last_checkin_status,
                        "agent.policy_id": $policy_id,
                        "agent.offline_duration_hours": ($offline_hours | tonumber)
                    }')

                echo "$log_entry" >> "$LOG_FILE"

                log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
            fi
        done

        processed_agents=$((processed_agents + agents_in_page))

        if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then
            log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents"
            break
        fi

        page=$((page + 1))

        # Limit pagination loops in case of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
        if [ "$page" -gt 100 ]; then
            log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
            break
        fi
    done

    log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
}
main "$@"