Merge branch '2.4/dev' into vlb2

This commit is contained in:
Jason Ertel
2025-09-05 17:11:10 -04:00
11 changed files with 482 additions and 0 deletions

View File

@@ -38,6 +38,7 @@ elasticfleet:
- elasticsearch
- endpoint
- fleet_server
- filestream
- http_endpoint
- httpjson
- log

View File

@@ -0,0 +1,48 @@
{
  "package": {
    "name": "filestream",
    "version": ""
  },
  "name": "agent-monitor",
  "namespace": "",
  "description": "",
  "policy_ids": [
    "so-grid-nodes_general"
  ],
  "output_id": null,
  "vars": {},
  "inputs": {
    "filestream-filestream": {
      "enabled": true,
      "streams": {
        "filestream.generic": {
          "enabled": true,
          "vars": {
            "paths": [
              "/opt/so/log/agents/agent-monitor*.log"
            ],
            "data_stream.dataset": "agent-monitor",
            "pipeline": "elasticagent.monitor",
            "parsers": "",
            "exclude_files": [
              "\\.gz$"
            ],
            "include_files": [],
"processors": "- decode_json_fields:\n fields: [\"message\"]\n target: \"\"\n- add_fields:\n target: event\n fields:\n module: gridmetrics",
            "tags": [],
            "recursive_glob": true,
            "ignore_older": "72h",
            "clean_inactive": -1,
            "harvester_limit": 0,
            "fingerprint": true,
            "fingerprint_offset": 0,
            "fingerprint_length": 1024,
            "file_identity_native": false,
            "exclude_lines": [],
            "include_lines": []
          }
        }
      }
    }
  }
}

View File

@@ -1243,6 +1243,70 @@ elasticsearch:
set_priority:
priority: 50
min_age: 30d
so-logs-agent-monitor:
index_sorting: false
index_template:
composed_of:
- event-mappings
- so-elastic-agent-monitor
- so-fleet_integrations.ip_mappings-1
- so-fleet_globals-1
- so-fleet_agent_id_verification-1
data_stream:
allow_custom_routing: false
hidden: false
ignore_missing_component_templates:
- logs-agent-monitor@custom
index_patterns:
- logs-agent-monitor-*
priority: 501
template:
mappings:
_meta:
managed: true
managed_by: security_onion
package:
name: elastic_agent
settings:
index:
lifecycle:
name: so-logs-agent-monitor-logs
mapping:
total_fields:
limit: 5000
number_of_replicas: 0
sort:
field: '@timestamp'
order: desc
policy:
_meta:
managed: true
managed_by: security_onion
package:
name: elastic_agent
phases:
cold:
actions:
set_priority:
priority: 0
min_age: 60d
delete:
actions:
delete: {}
min_age: 365d
hot:
actions:
rollover:
max_age: 30d
max_primary_shard_size: 50gb
set_priority:
priority: 100
min_age: 0ms
warm:
actions:
set_priority:
priority: 50
min_age: 30d
so-logs-elastic_agent_x_apm_server:
index_sorting: false
index_template:

View File

@@ -0,0 +1,36 @@
{
"processors": [
{
"set": {
"field": "event.dataset",
"value": "gridmetrics.agents",
"ignore_failure": true
}
},
{
"set": {
"field": "event.module",
"value": "gridmetrics",
"ignore_failure": true
}
},
{
"remove": {
"field": [
"host",
"elastic_agent",
"agent"
],
"ignore_missing": true,
"ignore_failure": true
}
},
{
"json": {
"field": "message",
"add_to_root": true,
"ignore_failure": true
}
}
]
}

View File

@@ -0,0 +1,43 @@
{
"template": {
"mappings": {
"properties": {
"agent": {
"type": "object",
"properties": {
"hostname": {
"ignore_above": 1024,
"type": "keyword"
},
"id": {
"ignore_above": 1024,
"type": "keyword"
},
"last_checkin_status": {
"ignore_above": 1024,
"type": "keyword"
},
"last_checkin": {
"type": "date"
},
"name": {
"ignore_above": 1024,
"type": "keyword"
},
"offline_duration_hours": {
"type": "integer"
},
"policy_id": {
"ignore_above": 1024,
"type": "keyword"
},
"status": {
"ignore_above": 1024,
"type": "keyword"
}
}
}
}
}
}
}

View File

@@ -268,3 +268,12 @@ logrotate:
- nocompress
- create
- sharedscripts
/opt/so/log/agents/agent-monitor*_x_log:
- daily
- rotate 14
- missingok
- compress
- create
- extension .log
- dateext
- dateyesterday

View File

@@ -175,3 +175,10 @@ logrotate:
multiline: True
global: True
forcedType: "[]string"
"/opt/so/log/agents/agent-monitor*_x_log":
description: List of logrotate options for this file.
title: /opt/so/log/agents/agent-monitor*.log
advanced: True
multiline: True
global: True
forcedType: "[]string"

View File

@@ -5,3 +5,11 @@ manager:
minute: 0
additionalCA: ''
insecureSkipVerify: False
agent_monitoring:
enabled: False
config:
critical_agents: []
custom_kquery:
offline_threshold: 5
page_size: 250
run_interval: 5

View File

@@ -34,6 +34,26 @@ agents_log_dir:
- user
- group
agents_conf_dir:
file.directory:
- name: /opt/so/conf/agents
- user: root
- group: root
- recurse:
- user
- group
{% if MANAGERMERGED.agent_monitoring.config.critical_agents | length > 0 %}
critical_agents_patterns:
file.managed:
- name: /opt/so/conf/agents/critical-agents.txt
- contents: {{ MANAGERMERGED.agent_monitoring.config.critical_agents }}
{% else %}
remove_critical_agents_config:
file.absent:
- name: /opt/so/conf/agents/critical-agents.txt
{% endif %}
yara_log_dir:
file.directory:
- name: /opt/so/log/yarasync
@@ -127,6 +147,21 @@ so_fleetagent_status:
- month: '*'
- dayweek: '*'
so_fleetagent_monitor:
{% if MANAGERMERGED.agent_monitoring.enabled %}
cron.present:
{% else %}
cron.absent:
{% endif %}
- name: /bin/flock -n /opt/so/log/agents/agent-monitor.lock /usr/sbin/so-elastic-agent-monitor
- identifier: so_fleetagent_monitor
- user: root
- minute: '*/{{ MANAGERMERGED.agent_monitoring.config.run_interval }}'
- hour: '*'
- daymonth: '*'
- month: '*'
- dayweek: '*'
socore_own_saltstack_default:
file.directory:
- name: /opt/so/saltstack/default

View File

@@ -37,3 +37,39 @@ manager:
forcedType: bool
global: True
helpLink: proxy.html
agent_monitoring:
enabled:
description: Enable monitoring elastic agents for health issues. Can be used to trigger an alert when a 'critical' agent hasn't checked in with fleet for longer than the configured offline threshold.
global: True
helpLink: elastic-fleet.html
forcedType: bool
config:
critical_agents:
description: List of 'critical' agents to log when they haven't checked in for longer than the maximum allowed time. If no 'critical' agents are specified, all offline agents will be logged once they reach the offline threshold.
global: True
multiline: True
helpLink: elastic-fleet.html
forcedType: "[]string"
custom_kquery:
description: For more granular control over which agents are monitored for offline or degraded status, add a KQL query (kuery) here. It is recommended to create and test the query within Elastic Fleet first to ensure it targets the intended agents, e.g. 'status:offline AND tags:INFRA'
global: True
helpLink: elastic-fleet.html
forcedType: string
advanced: True
offline_threshold:
description: The maximum allowed time in hours a 'critical' agent has been offline before being logged.
global: True
helpLink: elastic-fleet.html
forcedType: int
page_size:
description: The number of agents that can be processed per API request to Fleet.
global: True
helpLink: elastic-fleet.html
forcedType: int
advanced: True
run_interval:
description: The time in minutes between checking fleet agent statuses.
global: True
advanced: True
helpLink: elastic-fleet.html
forcedType: int

View File

@@ -0,0 +1,195 @@
{%- from 'manager/map.jinja' import MANAGERMERGED -%}
{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%}
{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%}
{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%}
#!/bin/bash
# Checks Elastic Fleet for offline or degraded agents and appends one JSON line per
# reported agent to LOG_FILE. This file is a Jinja template: the values below are
# filled in from MANAGERMERGED.agent_monitoring.config when the script is rendered.
# Abort on command failure, on use of an unset variable, and on a failure anywhere in a pipeline.
set -euo pipefail
# Directory and file that receive the JSON log entries.
LOG_DIR="/opt/so/log/agents"
LOG_FILE="$LOG_DIR/agent-monitor.log"
# curl configuration file passed to curl via -K for the Fleet API requests.
CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
# Base URL of the Fleet agents endpoint; query parameters are appended later.
FLEET_API="http://localhost:5601/api/fleet/agents"
# Pattern file consulted by matches_critical_pattern; /dev/null disables the filter.
{#- When using custom kquery ignore critical agents patterns. Since we want all the results of custom query logged #}
{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
CRITICAL_AGENTS_FILE="/dev/null"
{%- else %}
CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
{%- endif %}
# Offline threshold in hours, rendered from agent_monitoring.config.offline_threshold.
OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
# Number of agents requested per Fleet API page (the perPage query parameter).
PAGE_SIZE="{{ PAGE_SIZE }}"
# Write a timestamped diagnostic line to stderr.
#
# Arguments:
#   $1 - severity label, printed between square brackets (e.g. INFO, WARN, ERROR)
#   $2 - message text
# Output format: <UTC ISO-8601 timestamp> [<severity>] <message>
log_message() {
    local severity="$1" text="$2"
    local stamp
    stamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
    printf '%s [%s] %s\n' "$stamp" "$severity" "$text" >&2
}
# Decide whether a hostname is covered by the critical-agents pattern file.
#
# Arguments:
#   $1 - agent hostname to test
#   $2 - path to a file holding one hostname pattern per line
# Returns:
#   0 when the file is missing or empty (no filter, so everything matches) or when
#     at least one pattern matches the hostname; 1 otherwise.
#
# Matching is case-insensitive and anchored to the whole hostname. "*" is the only
# wildcard; every other regex metacharacter (notably "." in FQDNs) is matched
# literally. Blank lines and lines starting with "#" are ignored.
matches_critical_pattern() {
    local hostname="$1"
    local pattern_file="$2"

    # If critical agents file doesn't exist or is empty, match all
    if [ ! -f "$pattern_file" ] || [ ! -s "$pattern_file" ]; then
        return 0
    fi

    local hostname_lower="${hostname,,}"
    local pattern pattern_lower regex ch i

    # "|| [ -n ... ]" keeps a final line that has no trailing newline
    while IFS= read -r pattern || [ -n "$pattern" ]; do
        # Trim surrounding whitespace in-process (no xargs, so quotes in a pattern cannot fail)
        pattern="${pattern#"${pattern%%[![:space:]]*}"}"
        pattern="${pattern%"${pattern##*[![:space:]]}"}"

        # Skip empty lines and comments
        [[ -z "$pattern" || "$pattern" == \#* ]] && continue

        pattern_lower="${pattern,,}"

        # Build an anchored regex: "*" becomes ".*", regex metacharacters are escaped
        regex=""
        for ((i = 0; i < ${#pattern_lower}; i++)); do
            ch="${pattern_lower:i:1}"
            case "$ch" in
                '*') regex+='.*' ;;
                '\'|'.'|'^'|'$'|'+'|'?'|'('|')'|'['|']'|'{'|'}'|'|') regex+="\\${ch}" ;;
                *) regex+="$ch" ;;
            esac
        done

        # Check if hostname matches the pattern
        if [[ "$hostname_lower" =~ ^${regex}$ ]]; then
            return 0
        fi
    done < "$pattern_file"

    return 1
}
# Print the whole number of hours elapsed since an agent's last check-in.
#
# Arguments:
#   $1 - last check-in timestamp in any format GNU "date -d" accepts
# Output:
#   Elapsed hours (integer, rounded down) on stdout. Prints 0 when the timestamp is
#   empty, cannot be parsed, or lies in the future.
calculate_offline_hours() {
    local last_checkin="$1"
    local current_time checkin_time diff

    # GNU date parses an empty string as "today 00:00", which would yield a bogus
    # hours-since-midnight value, so reject empty/blank input explicitly.
    if [[ -z "${last_checkin//[[:space:]]/}" ]]; then
        echo "0"
        return 0
    fi

    current_time=$(date +%s)
    # Fall back to 0 when date cannot parse the timestamp
    checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null || echo "0")

    if [ "$checkin_time" -eq "0" ]; then
        echo "0"
        return 0
    fi

    diff=$((current_time - checkin_time))

    # A check-in time ahead of the local clock must not produce negative hours
    if [ "$diff" -lt 0 ]; then
        echo "0"
        return 0
    fi

    echo $((diff / 3600))
}
# Entry point: page through the Fleet agents API and append one JSON line to
# LOG_FILE for every offline or degraded agent that passes the filters.
#
# Globals read: FLEET_API, CURL_CONFIG, PAGE_SIZE, CRITICAL_AGENTS_FILE,
#               OFFLINE_THRESHOLD_HOURS, LOG_FILE
# Globals set:  FLEET_QUERY (URL of the page currently being fetched)
# Exits with status 1 when a Fleet API request fails.
main() {
    log_message "INFO" "Starting Fleet agent status check"

    # Check if critical agents file is configured
    if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
        log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
        log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')"
    else
        log_message "INFO" "No critical agents filter found, monitoring all agents"
    fi

    log_message "INFO" "Querying Fleet API"

    local page=1
    local total_agents=0
    local processed_agents=0
    local current_timestamp
    current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

    # URL-encoded query expression; the page number is appended per request below.
    # NOTE(review): the custom query is interpolated verbatim into the double-quoted
    # log line at render time, so a query containing double quotes, backticks or a
    # dollar sign will alter or break that line - confirm the input is constrained.
    local fleet_kuery
    {%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
    log_message "INFO" "Using custom kquery: {{ CUSTOM_KQUERY }}"
    fleet_kuery="{{ CUSTOM_KQUERY | urlencode }}"
    {%- else %}
    log_message "INFO" "Using default query (all offline or degraded agents)"
    fleet_kuery="status%3Aoffline%20OR%20status%3Adegraded"
    {%- endif %}

    while true; do
        log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"

        # Rebuild the URL on every iteration so the request follows the page counter
        FLEET_QUERY="${FLEET_API}?kuery=${fleet_kuery}&perPage=${PAGE_SIZE}&page=${page}"

        if ! response_body=$(curl -K "$CURL_CONFIG" \
            -s --fail \
            "$FLEET_QUERY" \
            -H 'kbn-xsrf: true' 2>/dev/null); then
            log_message "ERROR" "Failed to query Fleet API (page $page)"
            exit 1
        fi

        # pagination info
        current_total=$(echo "$response_body" | jq -r '.total // 0')
        current_page=$(echo "$response_body" | jq -r '.page // 1')
        agents_in_page=$(echo "$response_body" | jq -r '.list | length')

        # Update total
        if [ "$page" -eq 1 ]; then
            total_agents="$current_total"
            log_message "INFO" "Found $total_agents total agents across all pages"
        fi

        log_message "INFO" "Processing page $current_page with $agents_in_page agents"

        # Process agents from current page
        mapfile -t agents < <(echo "$response_body" | jq -c '.list[]')

        for agent in "${agents[@]}"; do
            # Grab agent details
            agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
            agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
            agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
            agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
            last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
            last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
            policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')

            # Only log agents that are offline or degraded (skip inactive agents)
            # Fleetserver agents can show multiple versions as 'inactive'
            if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
                # Check if agent matches critical agent patterns (if configured)
                if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
                    log_message "WARN" "${agent_hostname^^} is ${agent_status^^}, but does not match configured critical agents patterns. Not logging ${agent_status^^} agent"
                    continue # Skip this agent if it doesn't match any critical agent pattern
                fi

                offline_hours=$(calculate_offline_hours "$last_checkin")

                # Enforce the configured offline threshold. It is an offline duration,
                # so it is only applied to agents whose status is offline and that
                # report a last check-in time to measure from.
                if [ "$agent_status" = "offline" ] && [ -n "$last_checkin" ] && [ "$offline_hours" -lt "$OFFLINE_THRESHOLD_HOURS" ]; then
                    log_message "INFO" "${agent_hostname^^} has been OFFLINE for ${offline_hours}h, below the ${OFFLINE_THRESHOLD_HOURS}h threshold. Not logging OFFLINE agent yet"
                    continue
                fi

                # jq builds the entry so every value is JSON-escaped correctly
                log_entry=$(echo 'null' | jq -c \
                    --arg ts "$current_timestamp" \
                    --arg id "$agent_id" \
                    --arg hostname "$agent_hostname" \
                    --arg name "$agent_name" \
                    --arg status "$agent_status" \
                    --arg last_checkin "$last_checkin" \
                    --arg last_checkin_status "$last_checkin_status" \
                    --arg policy_id "$policy_id" \
                    --arg offline_hours "$offline_hours" \
                    '{
                        "@timestamp": $ts,
                        "agent.id": $id,
                        "agent.hostname": $hostname,
                        "agent.name": $name,
                        "agent.status": $status,
                        "agent.last_checkin": $last_checkin,
                        "agent.last_checkin_status": $last_checkin_status,
                        "agent.policy_id": $policy_id,
                        "agent.offline_duration_hours": ($offline_hours | tonumber)
                    }')

                echo "$log_entry" >> "$LOG_FILE"

                log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
            fi
        done

        processed_agents=$((processed_agents + agents_in_page))

        # Stop on an empty page or once every agent reported by the API has been seen
        if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then
            log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents"
            break
        fi

        page=$((page + 1))

        # Limit pagination loops in case of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
        if [ "$page" -gt 100 ]; then
            log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
            break
        fi
    done

    log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
}
main "$@"