diff --git a/salt/elasticfleet/defaults.yaml b/salt/elasticfleet/defaults.yaml
index d6cdd7351..0220428bf 100644
--- a/salt/elasticfleet/defaults.yaml
+++ b/salt/elasticfleet/defaults.yaml
@@ -38,6 +38,7 @@ elasticfleet:
     - elasticsearch
     - endpoint
     - fleet_server
+    - filestream
     - http_endpoint
     - httpjson
     - log
diff --git a/salt/elasticsearch/files/ingest/elasticagent.monitor b/salt/elasticsearch/files/ingest/elasticagent.monitor
new file mode 100644
index 000000000..09d8297c4
--- /dev/null
+++ b/salt/elasticsearch/files/ingest/elasticagent.monitor
@@ -0,0 +1,36 @@
+{
+  "processors": [
+    {
+      "set": {
+        "field": "event.dataset",
+        "value": "gridmetrics.agents",
+        "ignore_failure": true
+      }
+    },
+    {
+      "set": {
+        "field": "event.module",
+        "value": "gridmetrics",
+        "ignore_failure": true
+      }
+    },
+    {
+      "remove": {
+        "field": [
+          "host",
+          "elastic_agent",
+          "agent"
+        ],
+        "ignore_missing": true,
+        "ignore_failure": true
+      }
+    },
+    {
+      "json": {
+        "field": "message",
+        "add_to_root": true,
+        "ignore_failure": true
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/salt/manager/defaults.yaml b/salt/manager/defaults.yaml
index 65247d8ff..237ac2999 100644
--- a/salt/manager/defaults.yaml
+++ b/salt/manager/defaults.yaml
@@ -9,6 +9,7 @@ manager:
     enabled: False
     config:
       critical_agents: []
+      custom_kquery: ''
       offline_threshold: 5
       page_size: 250
       run_interval: 5
diff --git a/salt/manager/soc_manager.yaml b/salt/manager/soc_manager.yaml
index f69f3f42a..ac06ac2b4 100644
--- a/salt/manager/soc_manager.yaml
+++ b/salt/manager/soc_manager.yaml
@@ -45,11 +45,17 @@ manager:
       forcedType: bool
     config:
       critical_agents:
-        description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold
+        description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold.
         global: True
         multiline: True
         helpLink: elastic-fleet.html
         forcedType: "[]string"
+      custom_kquery:
+        description: For more granular control over what agents to monitor for offline|degraded status add a kquery here. It is recommended to create & test within Elastic Fleet first to ensure your agents are targeted correctly using the query. eg 'status:offline AND tags:INFRA'
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: string
+        advanced: True
       offline_threshold:
         description: The maximum allowed time in hours a 'critical' agent has been offline before being logged.
         global: True
diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
index 572d4de4d..0f3bcac34 100644
--- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
+++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
@@ -1,17 +1,21 @@
+{%- from 'manager/map.jinja' import MANAGERMERGED -%}
+{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%}
+{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%}
+{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%}
 #!/bin/bash
 
-{% from 'manager/map.jinja' import MANAGERMERGED %}
-{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold %}
-{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size %}
-
 set -euo pipefail
 
 LOG_DIR="/opt/so/log/agents"
 LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log"
 CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
 FLEET_API="http://localhost:5601/api/fleet/agents"
+{#- When using custom kquery ignore critical agents patterns. Since we want all the results of custom query logged #}
+{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+CRITICAL_AGENTS_FILE="/dev/null"
+{%- else %}
 CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
-
+{%- endif %}
 OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
 PAGE_SIZE="{{ PAGE_SIZE }}"
 
@@ -80,7 +84,7 @@ cleanup_old_logs() {
 
 main() {
     log_message "INFO" "Starting Fleet agent status check"
-    
+
     # Check if critical agents file is configured
     if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
         log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
@@ -98,12 +102,22 @@ main() {
     local processed_agents=0
     local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 
+    # Build the query once without a page number; the current page is appended per request in the loop below
+    {%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+    # The query is rendered single-quoted so quotes, $ and backticks in the kquery are not interpreted by bash
+    log_message "INFO" 'Using custom kquery: {{ CUSTOM_KQUERY | replace("'", "'\\''") }}'
+    FLEET_QUERY="${FLEET_API}?kuery={{ CUSTOM_KQUERY | urlencode }}&perPage=${PAGE_SIZE}"
+    {%- else %}
+    log_message "INFO" "Using default query (all offline or degraded agents)"
+    FLEET_QUERY="${FLEET_API}?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=${PAGE_SIZE}"
+    {%- endif %}
+
     while true; do
         log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"
 
         if ! response_body=$(curl -K "$CURL_CONFIG" \
             -s --fail \
-            "${FLEET_API}?perPage=${PAGE_SIZE}&page=${page}&showInactive=true" \
+            "${FLEET_QUERY}&page=${page}" \
             -H 'kbn-xsrf: true' 2>/dev/null); then
             log_message "ERROR" "Failed to query Fleet API (page $page)"
             exit 1
@@ -123,52 +137,55 @@ main() {
         log_message "INFO" "Processing page $current_page with $agents_in_page agents"
 
         # Process agents from current page
-        echo "$response_body" | jq -c '.list[]' | while IFS= read -r agent; do
-        # Grab agent details
-        agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
-        agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
-        agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
-        agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
-        last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
-        last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
-        policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
+        mapfile -t agents < <(echo "$response_body" | jq -c '.list[]')
 
-        # Only log agents that are offline or degraded (skip inactive agents)
-        # Fleetserver agents can show multiple versions as 'inactive'
-        if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
-            # Check if agent matches critical agent patterns (if configured)
-            if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
-                continue # Skip this agent if it doesn't match any critical agent pattern
+        for agent in "${agents[@]}"; do
+            # Grab agent details
+            agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
+            agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
+            agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
+            agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
+            last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
+            last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
+            policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
+
+            # Only log agents that are offline or degraded (skip inactive agents)
+            # Fleetserver agents can show multiple versions as 'inactive'
+            if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
+                # Check if agent matches critical agent patterns (if configured)
+                if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
+                    log_message "WARN" "${agent_hostname^^} is ${agent_status^^}, but does not match configured critical agents patterns. Not logging ${agent_status^^} agent"
+                    continue # Skip this agent if it doesn't match any critical agent pattern
+                fi
+
+                offline_hours=$(calculate_offline_hours "$last_checkin")
+
+                log_entry=$(jq -n -c \
+                    --arg ts "$current_timestamp" \
+                    --arg id "$agent_id" \
+                    --arg hostname "$agent_hostname" \
+                    --arg name "$agent_name" \
+                    --arg status "$agent_status" \
+                    --arg last_checkin "$last_checkin" \
+                    --arg last_checkin_status "$last_checkin_status" \
+                    --arg policy_id "$policy_id" \
+                    --arg offline_hours "$offline_hours" \
+                    '{
+                        "@timestamp": $ts,
+                        "agent.id": $id,
+                        "agent.hostname": $hostname,
+                        "agent.name": $name,
+                        "agent.status": $status,
+                        "agent.last_checkin": $last_checkin,
+                        "agent.last_checkin_status": $last_checkin_status,
+                        "agent.policy_id": $policy_id,
+                        "agent.offline_duration_hours": ($offline_hours | tonumber)
+                    }')
+
+                echo "$log_entry" >> "$LOG_FILE"
+
+                log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
             fi
-
-            offline_hours=$(calculate_offline_hours "$last_checkin")
-
-            log_entry=$(jq -c \
-                --arg ts "$current_timestamp" \
-                --arg id "$agent_id" \
-                --arg hostname "$agent_hostname" \
-                --arg name "$agent_name" \
-                --arg status "$agent_status" \
-                --arg last_checkin "$last_checkin" \
-                --arg last_checkin_status "$last_checkin_status" \
-                --arg policy_id "$policy_id" \
-                --arg offline_hours "$offline_hours" \
-                '{
-                    "@timestamp": $ts,
-                    "agent.id": $id,
-                    "agent.hostname": $hostname,
-                    "agent.name": $name,
-                    "agent.status": $status,
-                    "agent.last_checkin": $last_checkin,
-                    "agent.last_checkin_status": $last_checkin_status,
-                    "agent.policy_id": $policy_id,
-                    "agent.offline_duration_hours": ($offline_hours | tonumber)
-                }')
-
-            echo "$log_entry" >> "$LOG_FILE"
-
-            log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
-        fi
         done
 
         processed_agents=$((processed_agents + agents_in_page))
@@ -180,13 +197,13 @@ main() {
 
         page=$((page + 1))
 
-        # Limit pagination loops incase of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
+        # Limit pagination loops in case of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
         if [ "$page" -gt 100 ]; then
             log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
             break
         fi
     done
-    
+
     log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
 }
 