elastic agent offline alerter

Signed-off-by: reyesj2 <94730068+reyesj2@users.noreply.github.com>
2026-01-23 08:31:30 +01:00 · 2025-09-02 17:00:03 -05:00
parent d9127a288f
commit e26310d172
7 changed files with 420 additions and 0 deletions
--- a/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json
+++ b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json
@@ -0,0 +1,48 @@
+{
+  "package": {
+    "name": "filestream",
+    "version": ""
+  },
+  "name": "agent-monitor",
+  "namespace": "",
+  "description": "",
+  "policy_ids": [
+    "so-grid-nodes_general"
+  ],
+  "output_id": null,
+  "vars": {},
+  "inputs": {
+    "filestream-filestream": {
+      "enabled": true,
+      "streams": {
+        "filestream.generic": {
+          "enabled": true,
+          "vars": {
+            "paths": [
+              "/opt/so/log/agents/agent-monitor-*.log"
+            ],
+            "data_stream.dataset": "agent-monitor",
+            "pipeline": "elasticagent.monitor",
+            "parsers": "",
+            "exclude_files": [
+              "\\.gz$"
+            ],
+            "include_files": [],
+            "processors": "- decode_json_fields:\n    fields: [\"message\"]\n    target: \"\"\n- add_fields:\n    target: event\n    fields:\n        module: gridmetrics",
+            "tags": [],
+            "recursive_glob": true,
+            "ignore_older": "72h",
+            "clean_inactive": -1,
+            "harvester_limit": 0,
+            "fingerprint": true,
+            "fingerprint_offset": 0,
+            "fingerprint_length": 1024,
+            "file_identity_native": false,
+            "exclude_lines": [],
+            "include_lines": []
+          }
+        }
+      }
+    }
+  }
+}
--- a/salt/elasticsearch/defaults.yaml
+++ b/salt/elasticsearch/defaults.yaml
@@ -1243,6 +1243,70 @@ elasticsearch:
              set_priority:
                priority: 50
            min_age: 30d
+    so-logs-agent-monitor:
+      index_sorting: false
+      index_template:
+        composed_of:
+        - event-mappings
+        - so-elastic-agent-monitor
+        - so-fleet_integrations.ip_mappings-1
+        - so-fleet_globals-1
+        - so-fleet_agent_id_verification-1
+        data_stream:
+          allow_custom_routing: false
+          hidden: false
+        ignore_missing_component_templates:
+        - logs-agent-monitor@custom
+        index_patterns:
+        - logs-agent-monitor-*
+        priority: 501
+        template:
+          mappings:
+            _meta:
+              managed: true
+              managed_by: security_onion
+              package:
+                name: elastic_agent
+          settings:
+            index:
+              lifecycle:
+                name: so-logs-agent-monitor-logs
+              mapping:
+                total_fields:
+                  limit: 5000
+              number_of_replicas: 0
+              sort:
+                field: '@timestamp'
+                order: desc
+      policy:
+        _meta:
+          managed: true
+          managed_by: security_onion
+          package:
+            name: elastic_agent
+        phases:
+          cold:
+            actions:
+              set_priority:
+                priority: 0
+            min_age: 60d
+          delete:
+            actions:
+              delete: {}
+            min_age: 365d
+          hot:
+            actions:
+              rollover:
+                max_age: 30d
+                max_primary_shard_size: 50gb
+              set_priority:
+                priority: 100
+            min_age: 0ms
+          warm:
+            actions:
+              set_priority:
+                priority: 50
+            min_age: 30d
    so-logs-elastic_agent_x_apm_server:
      index_sorting: false
      index_template:
--- a/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json
+++ b/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json
@@ -0,0 +1,43 @@
+{
+  "template": {
+    "mappings": {
+      "properties": {
+        "agent": {
+          "type": "object",
+          "properties": {
+            "hostname": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "id": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "last_checkin_status": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "last_checkin": {
+              "type": "date"
+            },
+            "name": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "offline_duration_hours": {
+              "type": "integer"
+            },
+            "policy_id": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "status": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            }
+          }
+        }
+      }
+    }
+  }
+}
--- a/salt/manager/defaults.yaml
+++ b/salt/manager/defaults.yaml
@@ -5,3 +5,10 @@ manager:
    minute: 0
  additionalCA: ''
  insecureSkipVerify: False
+  # Fleet agent offline monitoring; consumed by so-elastic-agent-monitor and its cron job.
+  agent_monitoring:
+    enabled: False
+    config:
+      # Hostname patterns of agents to monitor; an empty list means every agent is monitored.
+      critical_agents: []
+      # Offline threshold, in hours.
+      offline_threshold: 5
+      # Number of agents requested per Fleet API call.
+      page_size: 250
+      # Minutes between monitor runs (rendered into the cron minute field as */N).
+      run_interval: 5
--- a/salt/manager/init.sls
+++ b/salt/manager/init.sls
@@ -34,6 +34,26 @@ agents_log_dir:
      - user
      - group

+# Root-owned configuration directory for the agent monitor; critical-agents.txt lives here.
+agents_conf_dir:
+  file.directory:
+    - name: /opt/so/conf/agents
+    - user: root
+    - group: root
+    - recurse:
+      - user
+      - group
+
+# Write the configured critical agent patterns to critical-agents.txt, or remove
+# the file when no patterns are configured so that every agent is monitored.
+# The list is serialised as JSON (a YAML subset) instead of being interpolated
+# raw, so quotes, backslashes and other special characters in a pattern survive
+# the YAML parse unchanged.
+{% if MANAGERMERGED.agent_monitoring.config.critical_agents | length > 0 %}
+critical_agents_patterns:
+  file.managed:
+    - name: /opt/so/conf/agents/critical-agents.txt
+    - contents: {{ MANAGERMERGED.agent_monitoring.config.critical_agents | json }}
+{% else %}
+remove_critical_agents_config:
+  file.absent:
+    - name: /opt/so/conf/agents/critical-agents.txt
+{% endif %}
+
 yara_log_dir:
  file.directory:
    - name: /opt/so/log/yarasync
@@ -127,6 +147,21 @@ so_fleetagent_status:
    - month: '*'
    - dayweek: '*'

+# Schedule the agent monitor while agent monitoring is enabled; otherwise make
+# sure its cron entry is removed.
+{% set fleetagent_monitor_cron = 'present' if MANAGERMERGED.agent_monitoring.enabled else 'absent' %}
+so_fleetagent_monitor:
+  cron.{{ fleetagent_monitor_cron }}:
+    - name: /usr/sbin/so-elastic-agent-monitor
+    - identifier: so_fleetagent_monitor
+    - user: root
+    - minute: '*/{{ MANAGERMERGED.agent_monitoring.config.run_interval }}'
+    - hour: '*'
+    - daymonth: '*'
+    - month: '*'
+    - dayweek: '*'
+
 socore_own_saltstack_default:
  file.directory:
    - name: /opt/so/saltstack/default
--- a/salt/manager/soc_manager.yaml
+++ b/salt/manager/soc_manager.yaml
@@ -37,3 +37,33 @@ manager:
    forcedType: bool
    global: True
    helpLink: proxy.html
+  agent_monitoring:
+    enabled:
+      description: Enable monitoring elastic agents for health issues. Can be used to trigger an alert when a 'critical' agent hasn't checked in with fleet for longer than the configured offline threshold.
+      global: True
+      helpLink: elastic-fleet.html
+      forcedType: bool
+    config:
+      critical_agents:
+        description: List of 'critical' agent hostname patterns to log when they haven't checked in for longer than the offline threshold. Patterns are case-insensitive and may use '*' as a wildcard. If no 'critical' agents are specified, all offline agents will be logged once they reach the offline threshold.
+        global: True
+        multiline: True
+        helpLink: elastic-fleet.html
+        forcedType: "[]string"
+      offline_threshold:
+        description: The maximum time in hours an agent can go without checking in before it is logged.
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: int
+      page_size:
+        description: The number of agents requested per API call to Fleet.
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: int
+        advanced: True
+      run_interval:
+        description: The time in minutes between checks of Fleet agent statuses. Used as the cron minute step (*/N), so values should be between 1 and 59.
+        global: True
+        advanced: True
+        helpLink: elastic-fleet.html
+        forcedType: int
--- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
+++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
@@ -0,0 +1,193 @@
+#!/bin/bash
+
+{% from 'manager/map.jinja' import MANAGERMERGED %}
+{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold %}
+{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size %}
+
+set -euo pipefail
+
+LOG_DIR="/opt/so/log/agents"
+LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log"
+CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
+FLEET_API="http://localhost:5601/api/fleet/agents"
+CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
+
+OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
+PAGE_SIZE="{{ PAGE_SIZE }}"
+
+log_message() {
+    local level="$1"
+    local message="$2"
+    echo "$(date -u +"%Y-%m-%dT%H:%M:%SZ") [$level] $message" >&2
+}
+
+matches_critical_pattern() {
+    local hostname="$1"
+    local pattern_file="$2"
+    
+    # If critical agents file doesn't exist or is empty, match all
+    if [ ! -f "$pattern_file" ] || [ ! -s "$pattern_file" ]; then
+        return 0
+    fi
+    
+    local hostname_lower=$(echo "$hostname" | tr '[:upper:]' '[:lower:]')
+    
+    while IFS= read -r pattern || [ -n "$pattern" ]; do
+        # empty lines and comments
+        [[ -z "$pattern" || "$pattern" =~ ^[[:space:]]*# ]] && continue
+        
+        # cut whitespace
+        pattern=$(echo "$pattern" | xargs)
+        
+        local pattern_lower=$(echo "$pattern" | tr '[:upper:]' '[:lower:]')
+        
+        # Replace * with bash wildcard
+        local bash_pattern="${pattern_lower//\*/.*}"
+        
+        # Check if hostname matches the pattern
+        if [[ "$hostname_lower" =~ ^${bash_pattern}$ ]]; then
+            return 0
+        fi
+    done < "$pattern_file"
+    
+    return 1
+}
+
+calculate_offline_hours() {
+    local last_checkin="$1"
+    local current_time=$(date +%s)
+    local checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null || echo "0")
+    
+    if [ "$checkin_time" -eq "0" ]; then
+        echo "0"
+        return
+    fi
+    
+    local diff=$((current_time - checkin_time))
+    echo $((diff / 3600))
+}
+
+# Delete agent-monitor log files in $LOG_DIR last modified more than 7 days ago
+# and report how many were removed. Does nothing when no file qualifies.
+cleanup_old_logs() {
+    local stale_logs stale_count
+
+    # find errors (e.g. a missing directory) are ignored, as before
+    stale_logs="$(find "$LOG_DIR" -name "agent-monitor-*.log" -type f -mtime +7 2>/dev/null)" || true
+
+    if [ -z "$stale_logs" ]; then
+        return 0
+    fi
+
+    stale_count="$(wc -l <<< "$stale_logs")"
+    xargs rm -f <<< "$stale_logs"
+    log_message "INFO" "Cleaned up $stale_count old log files (>7 days)"
+}
+
+# Entry point: page through the Fleet agents API and append one JSON line to
+# $LOG_FILE for every offline or degraded agent that matches the critical-agent
+# filter and has gone at least $OFFLINE_THRESHOLD_HOURS hours without checking in.
+# Exits 1 when a Fleet API request fails.
+main() {
+    log_message "INFO" "Starting Fleet agent status check"
+
+    # Check if critical agents file is configured
+    if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
+        log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
+        log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')"
+    else
+        log_message "INFO" "No critical agents filter found, monitoring all agents"
+    fi
+
+    cleanup_old_logs
+
+    log_message "INFO" "Querying Fleet API"
+
+    local page=1
+    local total_agents=0
+    local processed_agents=0
+    local current_timestamp
+    current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+    local response_body current_total current_page agents_in_page
+
+    while true; do
+        log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"
+
+        if ! response_body=$(curl -K "$CURL_CONFIG" \
+            -s --fail \
+            "${FLEET_API}?perPage=${PAGE_SIZE}&page=${page}&showInactive=true" \
+            -H 'kbn-xsrf: true' 2>/dev/null); then
+            log_message "ERROR" "Failed to query Fleet API (page $page)"
+            exit 1
+        fi
+
+        # pagination info
+        # The agent array is read from .list, falling back to .items and then to
+        # an empty array so a response without .list does not abort the script.
+        # NOTE(review): confirm which key the deployed Kibana version returns.
+        current_total=$(echo "$response_body" | jq -r '.total // 0')
+        current_page=$(echo "$response_body" | jq -r '.page // 1')
+        agents_in_page=$(echo "$response_body" | jq -r '(.list // .items // []) | length')
+
+        # Update total
+        if [ "$page" -eq 1 ]; then
+            total_agents="$current_total"
+            log_message "INFO" "Found $total_agents total agents across all pages"
+        fi
+
+        log_message "INFO" "Processing page $current_page with $agents_in_page agents"
+
+        # Process agents from current page (the loop body runs in a pipeline subshell)
+        echo "$response_body" | jq -c '(.list // .items // [])[]' | while IFS= read -r agent; do
+            agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
+
+            # Only log agents that are offline or degraded (skip inactive agents)
+            # Fleetserver agents can show multiple versions as 'inactive'
+            if [ "$agent_status" != "offline" ] && [ "$agent_status" != "degraded" ]; then
+                continue
+            fi
+
+            # Skip this agent if it doesn't match any critical agent pattern (if configured)
+            agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
+            if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
+                continue
+            fi
+
+            # Enforce the configured offline threshold: agents that checked in
+            # more recently than OFFLINE_THRESHOLD_HOURS ago are not logged yet.
+            last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
+            offline_hours=$(calculate_offline_hours "$last_checkin")
+            if [ "$offline_hours" -lt "$OFFLINE_THRESHOLD_HOURS" ]; then
+                continue
+            fi
+
+            # Grab remaining agent details
+            agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
+            agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
+            last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
+            policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
+
+            # -n is required: without it jq reads this loop's stdin, swallowing
+            # the remaining agents on the page and emitting duplicate entries.
+            log_entry=$(jq -n -c \
+                --arg ts "$current_timestamp" \
+                --arg id "$agent_id" \
+                --arg hostname "$agent_hostname" \
+                --arg name "$agent_name" \
+                --arg status "$agent_status" \
+                --arg last_checkin "$last_checkin" \
+                --arg last_checkin_status "$last_checkin_status" \
+                --arg policy_id "$policy_id" \
+                --arg offline_hours "$offline_hours" \
+                '{
+                    "@timestamp": $ts,
+                    "agent.id": $id,
+                    "agent.hostname": $hostname,
+                    "agent.name": $name,
+                    "agent.status": $status,
+                    "agent.last_checkin": $last_checkin,
+                    "agent.last_checkin_status": $last_checkin_status,
+                    "agent.policy_id": $policy_id,
+                    "agent.offline_duration_hours": ($offline_hours | tonumber)
+                }')
+
+            echo "$log_entry" >> "$LOG_FILE"
+
+            log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
+        done
+
+        processed_agents=$((processed_agents + agents_in_page))
+
+        if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then
+            log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents"
+            break
+        fi
+
+        page=$((page + 1))
+
+        # Limit pagination loops in case of any issues. If the agent count is high enough,
+        # increase page_size in SOC -> manager.agent_monitoring.config.page_size
+        if [ "$page" -gt 100 ]; then
+            log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
+            break
+        fi
+    done
+
+    log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
+}
+
+main "$@"