custom kquery

reyesj2
2025-09-04 15:37:28 -05:00
parent 1a32a0897c
commit dfec29d18e
5 changed files with 113 additions and 54 deletions

View File

@@ -38,6 +38,7 @@ elasticfleet:
   - elasticsearch
   - endpoint
   - fleet_server
+  - filestream
   - http_endpoint
   - httpjson
   - log

View File

@@ -0,0 +1,36 @@
{
"processors": [
{
"set": {
"field": "event.dataset",
"value": "gridmetrics.agents",
"ignore_failure": true
}
},
{
"set": {
"field": "event.module",
"value": "gridmetrics",
"ignore_failure": true
}
},
{
"remove": {
"field": [
"host",
"elastic_agent",
"agent"
],
"ignore_missing": true,
"ignore_failure": true
}
},
{
"json": {
"field": "message",
"add_to_root": true,
"ignore_failure": true
}
}
]
}
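
The new pipeline only tags event.dataset/event.module, drops the duplicated host, elastic_agent, and agent objects, and lifts the JSON in the message field to the document root. A quick way to confirm that behaviour is Elasticsearch's _simulate API. This is a minimal sketch, not part of the commit: the pipeline name "gridmetrics.agents", the https://localhost:9200 endpoint, and the sample document are assumptions; the curl.config path is the one this commit already uses.

# Sanity-check the pipeline against a fabricated sample document.
curl -K /opt/so/conf/elasticsearch/curl.config -s \
  -X POST "https://localhost:9200/_ingest/pipeline/gridmetrics.agents/_simulate" \
  -H 'Content-Type: application/json' \
  -d '{
    "docs": [
      {
        "_source": {
          "message": "{\"gridmetrics\":{\"cpu\":42}}",
          "host": {"name": "old-host"},
          "agent": {"id": "abc"},
          "elastic_agent": {"id": "abc"}
        }
      }
    ]
  }' | jq '.docs[0].doc._source'
# Expected: event.dataset and event.module are set, host/agent/elastic_agent are
# removed, and the JSON parsed from "message" is merged into the document root.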

View File

@@ -9,6 +9,7 @@ manager:
     enabled: False
     config:
       critical_agents: []
+      custom_kquery:
       offline_threshold: 5
       page_size: 250
       run_interval: 5

View File

@@ -45,11 +45,17 @@ manager:
       forcedType: bool
     config:
       critical_agents:
-        description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold
+        description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified, all offline agents will be logged once they reach the offline threshold.
         global: True
         multiline: True
         helpLink: elastic-fleet.html
         forcedType: "[]string"
+      custom_kquery:
+        description: For more granular control over which agents to monitor for offline|degraded status, add a kquery here. It is recommended to create and test the query within Elastic Fleet first to ensure your agents are targeted correctly, e.g. 'status:offline AND tags:INFRA'
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: string
+        advanced: True
       offline_threshold:
         description: The maximum allowed time in hours a 'critical' agent has been offline before being logged.
         global: True
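
The custom_kquery description recommends building and testing the query in Elastic Fleet first. It can also be checked from the manager's command line against the same Fleet endpoint the monitor script calls. A sketch, reusing the curl.config, API URL, and kuery/perPage/page parameters that appear in the script in this commit; the query value itself is the hypothetical example from the description above.

KQUERY='status:offline AND tags:INFRA'   # adjust to your own tags/policies
# --get + --data-urlencode builds the same kind of request the script renders
# from custom_kquery.
curl -K /opt/so/conf/elasticsearch/curl.config -s --fail \
  --get "http://localhost:5601/api/fleet/agents" \
  --data-urlencode "kuery=${KQUERY}" \
  --data-urlencode "perPage=10" \
  --data-urlencode "page=1" \
  -H 'kbn-xsrf: true' | jq '{total: .total, hosts: [.list[].local_metadata.host.hostname]}'
# If the total and hostnames look right, the same kquery can be saved in
# SOC -> manager.agent_monitoring.config.custom_kquery.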

View File

@@ -1,17 +1,21 @@
+{%- from 'manager/map.jinja' import MANAGERMERGED -%}
+{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%}
+{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%}
+{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%}
 #!/bin/bash
-{% from 'manager/map.jinja' import MANAGERMERGED %}
-{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold %}
-{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size %}
 set -euo pipefail
 LOG_DIR="/opt/so/log/agents"
 LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log"
 CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
 FLEET_API="http://localhost:5601/api/fleet/agents"
+{#- When using a custom kquery, ignore the critical agents patterns, since we want all results of the custom query logged #}
+{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+CRITICAL_AGENTS_FILE="/dev/null"
+{%- else %}
 CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
+{%- endif %}
 OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
 PAGE_SIZE="{{ PAGE_SIZE }}"
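
Pointing CRITICAL_AGENTS_FILE at /dev/null effectively switches off the critical-agents filter, because the file check in main() only takes the "critical agents filter" branch for a non-empty regular file. A small sketch of that test in isolation (the echo strings are illustrative; the behaviour of the second branch is paraphrased from the Jinja comment above):

CRITICAL_AGENTS_FILE="/dev/null"
# /dev/null is a character device, so -f fails (and -s would fail as well).
if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
    echo "critical agents filter enabled from $CRITICAL_AGENTS_FILE"
else
    echo "no critical agents filter - all results of the custom query are logged"
fi
# Prints the second message, which matches the intent of the custom-kquery branch.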
@@ -80,7 +84,7 @@ cleanup_old_logs() {
 main() {
     log_message "INFO" "Starting Fleet agent status check"
     # Check if critical agents file is configured
     if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
         log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
@@ -98,12 +102,20 @@ main() {
     local processed_agents=0
     local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+    log_message "INFO" "Using custom kquery: {{ CUSTOM_KQUERY }}"
+    FLEET_QUERY="${FLEET_API}?kuery={{ CUSTOM_KQUERY | urlencode }}&perPage=${PAGE_SIZE}&page=${page}"
+{%- else %}
+    log_message "INFO" "Using default query (all offline or degraded agents)"
+    FLEET_QUERY="${FLEET_API}?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=${PAGE_SIZE}&page=${page}"
+{%- endif %}
     while true; do
         log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"
         if ! response_body=$(curl -K "$CURL_CONFIG" \
             -s --fail \
-            "${FLEET_API}?perPage=${PAGE_SIZE}&page=${page}&showInactive=true" \
+            $FLEET_QUERY \
             -H 'kbn-xsrf: true' 2>/dev/null); then
             log_message "ERROR" "Failed to query Fleet API (page $page)"
             exit 1
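
For reference, a sketch of what the default branch renders to once Jinja substitutes the values. The assumptions (not shown in this hunk) are the shipped default page_size of 250 and page starting at 1 in main(); bash expands these at assignment time, so the echoed URL reflects those initial values.

FLEET_API="http://localhost:5601/api/fleet/agents"
PAGE_SIZE="250"
page=1
FLEET_QUERY="${FLEET_API}?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=${PAGE_SIZE}&page=${page}"
echo "$FLEET_QUERY"
# http://localhost:5601/api/fleet/agents?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=250&page=1
# (%3A is ':' and %20 is ' ', i.e. kuery=status:offline OR status:degraded)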
@@ -123,52 +135,55 @@ main() {
log_message "INFO" "Processing page $current_page with $agents_in_page agents" log_message "INFO" "Processing page $current_page with $agents_in_page agents"
# Process agents from current page # Process agents from current page
echo "$response_body" | jq -c '.list[]' | while IFS= read -r agent; do mapfile -t agents < <(echo "$response_body" | jq -c '.list[]')
# Grab agent details
agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
# Only log agents that are offline or degraded (skip inactive agents) for agent in "${agents[@]}"; do
# Fleetserver agents can show multiple versions as 'inactive' # Grab agent details
if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
# Check if agent matches critical agent patterns (if configured) agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
continue # Skip this agent if it doesn't match any critical agent pattern agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
# Only log agents that are offline or degraded (skip inactive agents)
# Fleetserver agents can show multiple versions as 'inactive'
if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
# Check if agent matches critical agent patterns (if configured)
if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
log_message "WARN" "${agent_hostname^^} is ${agent_status^^}, but does not match configured critical agents patterns. Not logging ${agent_status^^} agent"
continue # Skip this agent if it doesn't match any critical agent pattern
fi
offline_hours=$(calculate_offline_hours "$last_checkin")
log_entry=$(echo 'null' | jq -c \
--arg ts "$current_timestamp" \
--arg id "$agent_id" \
--arg hostname "$agent_hostname" \
--arg name "$agent_name" \
--arg status "$agent_status" \
--arg last_checkin "$last_checkin" \
--arg last_checkin_status "$last_checkin_status" \
--arg policy_id "$policy_id" \
--arg offline_hours "$offline_hours" \
'{
"@timestamp": $ts,
"agent.id": $id,
"agent.hostname": $hostname,
"agent.name": $name,
"agent.status": $status,
"agent.last_checkin": $last_checkin,
"agent.last_checkin_status": $last_checkin_status,
"agent.policy_id": $policy_id,
"agent.offline_duration_hours": ($offline_hours | tonumber)
}')
echo "$log_entry" >> "$LOG_FILE"
log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
fi fi
offline_hours=$(calculate_offline_hours "$last_checkin")
log_entry=$(jq -c \
--arg ts "$current_timestamp" \
--arg id "$agent_id" \
--arg hostname "$agent_hostname" \
--arg name "$agent_name" \
--arg status "$agent_status" \
--arg last_checkin "$last_checkin" \
--arg last_checkin_status "$last_checkin_status" \
--arg policy_id "$policy_id" \
--arg offline_hours "$offline_hours" \
'{
"@timestamp": $ts,
"agent.id": $id,
"agent.hostname": $hostname,
"agent.name": $name,
"agent.status": $status,
"agent.last_checkin": $last_checkin,
"agent.last_checkin_status": $last_checkin_status,
"agent.policy_id": $policy_id,
"agent.offline_duration_hours": ($offline_hours | tonumber)
}')
echo "$log_entry" >> "$LOG_FILE"
log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
fi
done done
processed_agents=$((processed_agents + agents_in_page)) processed_agents=$((processed_agents + agents_in_page))
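
The hunk above swaps a piped `while read` loop for `mapfile` plus a `for` loop. A likely motivation, which is an assumption rather than something stated in the commit, is that a piped loop body runs in a subshell and shares stdin with any command inside it (such as jq). A minimal sketch of the difference, using throwaway variable names:

# Piped while: the loop body runs in a subshell, so changes made inside it
# do not survive, and commands in the body compete with `read` for stdin.
count_piped=0
printf '%s\n' a b c | while IFS= read -r item; do
    count_piped=$((count_piped + 1))
done
echo "piped:   $count_piped"    # prints 0 - the increments happened in a subshell

# mapfile + for: the loop runs in the current shell with no stdin coupling.
count_mapped=0
mapfile -t items < <(printf '%s\n' a b c)
for item in "${items[@]}"; do
    count_mapped=$((count_mapped + 1))
done
echo "mapfile: $count_mapped"   # prints 3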
@@ -180,13 +195,13 @@ main() {
         page=$((page + 1))
         # Limit pagination loops in case of any issues. If the agent count is high enough, increase page_size in SOC manager.agent_monitoring.config.page_size
         if [ "$page" -gt 100 ]; then
             log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
             break
         fi
     done
     log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
 }