From e26310d1727807f245e2d94f3caef7689149665b Mon Sep 17 00:00:00 2001
From: reyesj2 <94730068+reyesj2@users.noreply.github.com>
Date: Tue, 2 Sep 2025 17:00:03 -0500
Subject: [PATCH 1/4] elastic agent offline alerter

Signed-off-by: reyesj2 <94730068+reyesj2@users.noreply.github.com>
---
 .../elastic-agent-monitor.json                |  48 +++++
 salt/elasticsearch/defaults.yaml              |  64 ++++++
 .../so-elastic-agent-monitor.json             |  43 ++++
 salt/manager/defaults.yaml                    |   7 +
 salt/manager/init.sls                         |  35 ++++
 salt/manager/soc_manager.yaml                 |  30 +++
 .../tools/sbin_jinja/so-elastic-agent-monitor | 193 ++++++++++++++++++
 7 files changed, 420 insertions(+)
 create mode 100644 salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json
 create mode 100644 salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json
 create mode 100644 salt/manager/tools/sbin_jinja/so-elastic-agent-monitor

diff --git a/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json
new file mode 100644
index 000000000..a7d425b39
--- /dev/null
+++ b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json
@@ -0,0 +1,48 @@
+{
+  "package": {
+    "name": "filestream",
+    "version": ""
+  },
+  "name": "agent-monitor",
+  "namespace": "",
+  "description": "",
+  "policy_ids": [
+    "so-grid-nodes_general"
+  ],
+  "output_id": null,
+  "vars": {},
+  "inputs": {
+    "filestream-filestream": {
+      "enabled": true,
+      "streams": {
+        "filestream.generic": {
+          "enabled": true,
+          "vars": {
+            "paths": [
+              "/opt/so/log/agents/agent-monitor*.log"
+            ],
+            "data_stream.dataset": "agent-monitor",
+            "pipeline": "elasticagent.monitor",
+            "parsers": "",
+            "exclude_files": [
+              "\\.gz$"
+            ],
+            "include_files": [],
+            "processors": "- decode_json_fields:\n    fields: [\"message\"]\n    target: \"\"\n- add_fields:\n    target: event\n    fields:\n        module: gridmetrics",
+            "tags": [],
+            "recursive_glob": true,
+            "ignore_older": "72h",
+            "clean_inactive": -1,
+            "harvester_limit": 0,
+            "fingerprint": true,
+            "fingerprint_offset": 0,
+            "fingerprint_length": 1024,
+            "file_identity_native": false,
+            "exclude_lines": [],
+            "include_lines": []
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/salt/elasticsearch/defaults.yaml b/salt/elasticsearch/defaults.yaml
index e08978e0d..7e3078ccf 100644
--- a/salt/elasticsearch/defaults.yaml
+++ b/salt/elasticsearch/defaults.yaml
@@ -1243,6 +1243,70 @@ elasticsearch:
               set_priority:
                 priority: 50
             min_age: 30d
+    so-logs-agent-monitor:
+      index_sorting: false
+      index_template:
+        composed_of:
+        - event-mappings
+        - so-elastic-agent-monitor
+        - so-fleet_integrations.ip_mappings-1
+        - so-fleet_globals-1
+        - so-fleet_agent_id_verification-1
+        data_stream:
+          allow_custom_routing: false
+          hidden: false
+        ignore_missing_component_templates:
+        - logs-agent-monitor@custom
+        index_patterns:
+        - logs-agent-monitor-*
+        priority: 501
+        template:
+          mappings:
+            _meta:
+              managed: true
+              managed_by: security_onion
+              package:
+                name: elastic_agent
+          settings:
+            index:
+              lifecycle:
+                name: so-logs-agent-monitor-logs
+              mapping:
+                total_fields:
+                  limit: 5000
+              number_of_replicas: 0
+              sort:
+                field: '@timestamp'
+                order: desc
+      policy:
+        _meta:
+          managed: true
+          managed_by: security_onion
+          package:
+            name: elastic_agent
+        phases:
+          cold:
+            actions:
+              set_priority:
+                priority: 0
+            min_age: 60d
+          delete:
+            actions:
+              delete: {}
+            min_age: 365d
+          hot:
+            actions:
+              rollover:
+                max_age: 30d
+                max_primary_shard_size: 50gb
+              set_priority:
+                priority: 100
+            min_age: 0ms
+          warm:
+            actions:
+              set_priority:
+                priority: 50
+            min_age: 30d
     so-logs-elastic_agent_x_apm_server:
       index_sorting: false
       index_template:
diff --git a/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json b/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json
new file mode 100644
index 000000000..50440fbed
--- /dev/null
+++ b/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json
@@ -0,0 +1,43 @@
+{
+  "template": {
+    "mappings": {
+      "properties": {
+        "agent": {
+          "type": "object",
+          "properties": {
+            "hostname": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "id": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "last_checkin_status": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "last_checkin": {
+              "type": "date"
+            },
+            "name": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "offline_duration_hours": {
+              "type": "integer"
+            },
+            "policy_id": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "status": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            }
+          }
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/salt/manager/defaults.yaml b/salt/manager/defaults.yaml
index 708900af6..65247d8ff 100644
--- a/salt/manager/defaults.yaml
+++ b/salt/manager/defaults.yaml
@@ -5,3 +5,10 @@ manager:
     minute: 0
   additionalCA: ''
   insecureSkipVerify: False
+  agent_monitoring:
+    enabled: False
+    config:
+      critical_agents: []
+      offline_threshold: 5
+      page_size: 250
+      run_interval: 5
diff --git a/salt/manager/init.sls b/salt/manager/init.sls
index 737d753f4..7daaeb8f2 100644
--- a/salt/manager/init.sls
+++ b/salt/manager/init.sls
@@ -34,6 +34,26 @@ agents_log_dir:
       - user
       - group
 
+agents_conf_dir:
+  file.directory:
+    - name: /opt/so/conf/agents
+    - user: root
+    - group: root
+    - recurse:
+      - user
+      - group
+
+{% if MANAGERMERGED.agent_monitoring.config.critical_agents | length > 0 %}
+critical_agents_patterns:
+  file.managed:
+    - name: /opt/so/conf/agents/critical-agents.txt
+    - contents: {{ MANAGERMERGED.agent_monitoring.config.critical_agents }}
+{% else %}
+remove_critical_agents_config:
+  file.absent:
+    - name: /opt/so/conf/agents/critical-agents.txt
+{% endif %}
+
 yara_log_dir:
   file.directory:
     - name: /opt/so/log/yarasync
@@ -127,6 +147,21 @@ so_fleetagent_status:
     - month: '*'
     - dayweek: '*'
 
+so_fleetagent_monitor:
+{% if MANAGERMERGED.agent_monitoring.enabled %}
+  cron.present:
+{% else %}
+  cron.absent:
+{% endif %}
+  - name: /usr/sbin/so-elastic-agent-monitor
+  - identifier: so_fleetagent_monitor
+  - user: root
+  - minute: '*/{{ MANAGERMERGED.agent_monitoring.config.run_interval }}'
+  - hour: '*'
+  - daymonth: '*'
+  - month: '*'
+  - dayweek: '*'
+
 socore_own_saltstack_default:
   file.directory:
     - name: /opt/so/saltstack/default
diff --git a/salt/manager/soc_manager.yaml b/salt/manager/soc_manager.yaml
index cf78658de..f69f3f42a 100644
--- a/salt/manager/soc_manager.yaml
+++ b/salt/manager/soc_manager.yaml
@@ -37,3 +37,33 @@ manager:
     forcedType: bool
     global: True
     helpLink: proxy.html
+  agent_monitoring:
+    enabled:
+      description: Enable monitoring elastic agents for health issues. Can be used to trigger an alert when a 'critical' agent hasn't checked in with fleet for longer than the configured offline threshold.
+      global: True
+      helpLink: elastic-fleet.html
+      forcedType: bool
+    config:
+      critical_agents:
+        description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold
+        global: True
+        multiline: True
+        helpLink: elastic-fleet.html
+        forcedType: "[]string"
+      offline_threshold:
+        description: The maximum allowed time in hours a 'critical' agent has been offline before being logged.
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: int
+      page_size:
+        description: The amount of agents that can be processed per API request to fleet.
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: int
+        advanced: True
+      run_interval:
+        description: The time in minutes between checking fleet agent statuses.
+        global: True
+        advanced: True
+        helpLink: elastic-fleet.html
+        forcedType: int
diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
new file mode 100644
index 000000000..572d4de4d
--- /dev/null
+++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
@@ -0,0 +1,193 @@
+#!/bin/bash
+
+{% from 'manager/map.jinja' import MANAGERMERGED %}
+{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold %}
+{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size %}
+
+set -euo pipefail
+
+LOG_DIR="/opt/so/log/agents"
+LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log"
+CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
+FLEET_API="http://localhost:5601/api/fleet/agents"
+CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
+
+OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
+PAGE_SIZE="{{ PAGE_SIZE }}"
+
+log_message() {
+    local level="$1"
+    local message="$2"
+    echo "$(date -u +"%Y-%m-%dT%H:%M:%SZ") [$level] $message" >&2
+}
+
+matches_critical_pattern() {
+    local hostname="$1"
+    local pattern_file="$2"
+    
+    # If critical agents file doesn't exist or is empty, match all
+    if [ ! -f "$pattern_file" ] || [ ! -s "$pattern_file" ]; then
+        return 0
+    fi
+    
+    local hostname_lower=$(echo "$hostname" | tr '[:upper:]' '[:lower:]')
+    
+    while IFS= read -r pattern || [ -n "$pattern" ]; do
+        # empty lines and comments
+        [[ -z "$pattern" || "$pattern" =~ ^[[:space:]]*# ]] && continue
+        
+        # cut whitespace
+        pattern=$(echo "$pattern" | xargs)
+        
+        local pattern_lower=$(echo "$pattern" | tr '[:upper:]' '[:lower:]')
+        
+        # Translate the * wildcard into its regex equivalent (.*) for the =~ match below
+        local bash_pattern="${pattern_lower//\*/.*}"
+        
+        # Check if hostname matches the pattern
+        if [[ "$hostname_lower" =~ ^${bash_pattern}$ ]]; then
+            return 0
+        fi
+    done < "$pattern_file"
+    
+    return 1
+}
+
+calculate_offline_hours() {
+    local last_checkin="$1"
+    local current_time=$(date +%s)
+    local checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null || echo "0")
+    
+    if [ "$checkin_time" -eq "0" ]; then
+        echo "0"
+        return
+    fi
+    
+    local diff=$((current_time - checkin_time))
+    echo $((diff / 3600))
+}
+
+cleanup_old_logs() {
+    # Find and delete log files older than 7 days
+    local old_files=$(find "$LOG_DIR" -name "agent-monitor-*.log" -type f -mtime +7 2>/dev/null)
+
+    if [ -n "$old_files" ]; then
+        local deleted_count=$(echo "$old_files" | wc -l)
+        echo "$old_files" | xargs rm -f
+        log_message "INFO" "Cleaned up $deleted_count old log files (>7 days)"
+    fi
+}
+
+main() {
+    log_message "INFO" "Starting Fleet agent status check"
+    
+    # Check if critical agents file is configured
+    if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
+        log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
+        log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')"
+    else
+        log_message "INFO" "No critical agents filter found, monitoring all agents"
+    fi
+    
+    cleanup_old_logs
+
+    log_message "INFO" "Querying Fleet API"
+
+    local page=1
+    local total_agents=0
+    local processed_agents=0
+    local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+
+    while true; do
+        log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"
+
+        if ! response_body=$(curl -K "$CURL_CONFIG" \
+            -s --fail \
+            "${FLEET_API}?perPage=${PAGE_SIZE}&page=${page}&showInactive=true" \
+            -H 'kbn-xsrf: true' 2>/dev/null); then
+            log_message "ERROR" "Failed to query Fleet API (page $page)"
+            exit 1
+        fi
+
+        # pagination info
+        current_total=$(echo "$response_body" | jq -r '.total // 0')
+        current_page=$(echo "$response_body" | jq -r '.page // 1')
+        agents_in_page=$(echo "$response_body" | jq -r '.list | length')
+
+        # Update total
+        if [ "$page" -eq 1 ]; then
+            total_agents="$current_total"
+            log_message "INFO" "Found $total_agents total agents across all pages"
+        fi
+
+        log_message "INFO" "Processing page $current_page with $agents_in_page agents"
+
+        # Process agents from current page
+        echo "$response_body" | jq -c '.list[]' | while IFS= read -r agent; do
+        # Grab agent details
+        agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
+        agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
+        agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
+        agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
+        last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
+        last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
+        policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
+
+        # Only log agents that are offline or degraded (skip inactive agents)
+	  # Fleetserver agents can show multiple versions as 'inactive'
+        if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
+            # Check if agent matches critical agent patterns (if configured)
+            if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
+                continue  # Skip this agent if it doesn't match any critical agent pattern
+            fi
+            
+            offline_hours=$(calculate_offline_hours "$last_checkin")
+            
+            log_entry=$(jq -c \
+                --arg ts "$current_timestamp" \
+                --arg id "$agent_id" \
+                --arg hostname "$agent_hostname" \
+                --arg name "$agent_name" \
+                --arg status "$agent_status" \
+                --arg last_checkin "$last_checkin" \
+                --arg last_checkin_status "$last_checkin_status" \
+                --arg policy_id "$policy_id" \
+                --arg offline_hours "$offline_hours" \
+                '{
+                    "@timestamp": $ts,
+                    "agent.id": $id,
+                    "agent.hostname": $hostname,
+                    "agent.name": $name,
+                    "agent.status": $status,
+                    "agent.last_checkin": $last_checkin,
+                    "agent.last_checkin_status": $last_checkin_status,
+                    "agent.policy_id": $policy_id,
+                    "agent.offline_duration_hours": ($offline_hours | tonumber)
+                }')
+            
+            echo "$log_entry" >> "$LOG_FILE"
+            
+            log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
+        fi
+        done
+
+        processed_agents=$((processed_agents + agents_in_page))
+
+        if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then
+            log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents"
+            break
+        fi
+
+        page=$((page + 1))
+
+	# Limit pagination loops incase of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
+        if [ "$page" -gt 100 ]; then
+            log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
+            break
+        fi
+    done
+    
+    log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
+}
+
+main "$@"

From dfec29d18e7981efd66df8e5d639bc4c6c1c7b80 Mon Sep 17 00:00:00 2001
From: reyesj2 <94730068+reyesj2@users.noreply.github.com>
Date: Thu, 4 Sep 2025 15:37:28 -0500
Subject: [PATCH 2/4] custom kquery

---
 salt/elasticfleet/defaults.yaml               |   1 +
 .../files/ingest/elasticagent.monitor         |  36 ++++++
 salt/manager/defaults.yaml                    |   1 +
 salt/manager/soc_manager.yaml                 |   8 +-
 .../tools/sbin_jinja/so-elastic-agent-monitor | 121 ++++++++++--------
 5 files changed, 113 insertions(+), 54 deletions(-)
 create mode 100644 salt/elasticsearch/files/ingest/elasticagent.monitor

diff --git a/salt/elasticfleet/defaults.yaml b/salt/elasticfleet/defaults.yaml
index d6cdd7351..0220428bf 100644
--- a/salt/elasticfleet/defaults.yaml
+++ b/salt/elasticfleet/defaults.yaml
@@ -38,6 +38,7 @@ elasticfleet:
     - elasticsearch
     - endpoint
     - fleet_server
+    - filestream
     - http_endpoint
     - httpjson
     - log
diff --git a/salt/elasticsearch/files/ingest/elasticagent.monitor b/salt/elasticsearch/files/ingest/elasticagent.monitor
new file mode 100644
index 000000000..09d8297c4
--- /dev/null
+++ b/salt/elasticsearch/files/ingest/elasticagent.monitor
@@ -0,0 +1,36 @@
+{
+  "processors": [
+    {
+      "set": {
+        "field": "event.dataset",
+        "value": "gridmetrics.agents",
+        "ignore_failure": true
+      }
+    },
+    {
+      "set": {
+        "field": "event.module",
+        "value": "gridmetrics",
+        "ignore_failure": true
+      }
+    },
+    {
+      "remove": {
+        "field": [
+          "host",
+          "elastic_agent",
+          "agent"
+        ],
+        "ignore_missing": true,
+        "ignore_failure": true
+      }
+    },
+    {
+      "json": {
+        "field": "message",
+        "add_to_root": true,
+        "ignore_failure": true
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/salt/manager/defaults.yaml b/salt/manager/defaults.yaml
index 65247d8ff..237ac2999 100644
--- a/salt/manager/defaults.yaml
+++ b/salt/manager/defaults.yaml
@@ -9,6 +9,7 @@ manager:
     enabled: False
     config:
       critical_agents: []
+      custom_kquery:
       offline_threshold: 5
       page_size: 250
       run_interval: 5
diff --git a/salt/manager/soc_manager.yaml b/salt/manager/soc_manager.yaml
index f69f3f42a..ac06ac2b4 100644
--- a/salt/manager/soc_manager.yaml
+++ b/salt/manager/soc_manager.yaml
@@ -45,11 +45,17 @@ manager:
       forcedType: bool
     config:
       critical_agents:
-        description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold
+        description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold.
         global: True
         multiline: True
         helpLink: elastic-fleet.html
         forcedType: "[]string"
+      custom_kquery:
+        description: For more granular control over what agents to monitor for offline|degraded status add a kquery here. It is recommended to create & test within Elastic Fleet first to ensure your agents are targeted correctly using the query. eg 'status:offline AND tags:INFRA'
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: string
+        advanced: True
       offline_threshold:
         description: The maximum allowed time in hours a 'critical' agent has been offline before being logged.
         global: True
diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
index 572d4de4d..0f3bcac34 100644
--- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
+++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
@@ -1,17 +1,21 @@
+{%- from 'manager/map.jinja' import MANAGERMERGED -%}
+{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%}
+{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%}
+{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%}
 #!/bin/bash
 
-{% from 'manager/map.jinja' import MANAGERMERGED %}
-{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold %}
-{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size %}
-
 set -euo pipefail
 
 LOG_DIR="/opt/so/log/agents"
 LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log"
 CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
 FLEET_API="http://localhost:5601/api/fleet/agents"
+{#- When using custom kquery ignore critical agents patterns. Since we want all the results of custom query logged #}
+{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+CRITICAL_AGENTS_FILE="/dev/null"
+{%- else %}
 CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
-
+{%- endif %}
 OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
 PAGE_SIZE="{{ PAGE_SIZE }}"
 
@@ -80,7 +84,7 @@ cleanup_old_logs() {
 
 main() {
     log_message "INFO" "Starting Fleet agent status check"
-    
+
     # Check if critical agents file is configured
     if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
         log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
@@ -98,12 +102,20 @@ main() {
     local processed_agents=0
     local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 
+    {%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+    log_message "INFO" "Using custom kquery: {{ CUSTOM_KQUERY }}"
+    FLEET_QUERY="${FLEET_API}?kuery={{ CUSTOM_KQUERY | urlencode }}&perPage=${PAGE_SIZE}&page=${page}"
+    {%- else %}
+    log_message "INFO" "Using default query (all offline or degraded agents)"
+    FLEET_QUERY="${FLEET_API}?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=${PAGE_SIZE}&page=${page}"
+    {%- endif %}
+
     while true; do
         log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"
 
         if ! response_body=$(curl -K "$CURL_CONFIG" \
             -s --fail \
-            "${FLEET_API}?perPage=${PAGE_SIZE}&page=${page}&showInactive=true" \
+            $FLEET_QUERY \
             -H 'kbn-xsrf: true' 2>/dev/null); then
             log_message "ERROR" "Failed to query Fleet API (page $page)"
             exit 1
@@ -123,52 +135,55 @@ main() {
         log_message "INFO" "Processing page $current_page with $agents_in_page agents"
 
         # Process agents from current page
-        echo "$response_body" | jq -c '.list[]' | while IFS= read -r agent; do
-        # Grab agent details
-        agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
-        agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
-        agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
-        agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
-        last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
-        last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
-        policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
+        mapfile -t agents < <(echo "$response_body" | jq -c '.list[]')
 
-        # Only log agents that are offline or degraded (skip inactive agents)
-	  # Fleetserver agents can show multiple versions as 'inactive'
-        if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
-            # Check if agent matches critical agent patterns (if configured)
-            if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
-                continue  # Skip this agent if it doesn't match any critical agent pattern
+        for agent in "${agents[@]}"; do
+            # Grab agent details
+            agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
+            agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
+            agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
+            agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
+            last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
+            last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
+            policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
+
+            # Only log agents that are offline or degraded (skip inactive agents)
+            # Fleetserver agents can show multiple versions as 'inactive'
+            if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
+                # Check if agent matches critical agent patterns (if configured)
+                if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
+                    log_message "WARN" "${agent_hostname^^} is ${agent_status^^}, but does not match configured critical agents patterns. Not logging ${agent_status^^} agent"
+                    continue  # Skip this agent if it doesn't match any critical agent pattern
+                fi
+
+                offline_hours=$(calculate_offline_hours "$last_checkin")
+
+                log_entry=$(echo 'null' | jq -c \
+                    --arg ts "$current_timestamp" \
+                    --arg id "$agent_id" \
+                    --arg hostname "$agent_hostname" \
+                    --arg name "$agent_name" \
+                    --arg status "$agent_status" \
+                    --arg last_checkin "$last_checkin" \
+                    --arg last_checkin_status "$last_checkin_status" \
+                    --arg policy_id "$policy_id" \
+                    --arg offline_hours "$offline_hours" \
+                    '{
+                        "@timestamp": $ts,
+                        "agent.id": $id,
+                        "agent.hostname": $hostname,
+                        "agent.name": $name,
+                        "agent.status": $status,
+                        "agent.last_checkin": $last_checkin,
+                        "agent.last_checkin_status": $last_checkin_status,
+                        "agent.policy_id": $policy_id,
+                        "agent.offline_duration_hours": ($offline_hours | tonumber)
+                    }')
+
+                echo "$log_entry" >> "$LOG_FILE"
+
+                log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
             fi
-            
-            offline_hours=$(calculate_offline_hours "$last_checkin")
-            
-            log_entry=$(jq -c \
-                --arg ts "$current_timestamp" \
-                --arg id "$agent_id" \
-                --arg hostname "$agent_hostname" \
-                --arg name "$agent_name" \
-                --arg status "$agent_status" \
-                --arg last_checkin "$last_checkin" \
-                --arg last_checkin_status "$last_checkin_status" \
-                --arg policy_id "$policy_id" \
-                --arg offline_hours "$offline_hours" \
-                '{
-                    "@timestamp": $ts,
-                    "agent.id": $id,
-                    "agent.hostname": $hostname,
-                    "agent.name": $name,
-                    "agent.status": $status,
-                    "agent.last_checkin": $last_checkin,
-                    "agent.last_checkin_status": $last_checkin_status,
-                    "agent.policy_id": $policy_id,
-                    "agent.offline_duration_hours": ($offline_hours | tonumber)
-                }')
-            
-            echo "$log_entry" >> "$LOG_FILE"
-            
-            log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
-        fi
         done
 
         processed_agents=$((processed_agents + agents_in_page))
@@ -180,13 +195,13 @@ main() {
 
         page=$((page + 1))
 
-	# Limit pagination loops incase of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
+        # Limit pagination loops in case of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
         if [ "$page" -gt 100 ]; then
             log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
             break
         fi
     done
-    
+
     log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
 }
 

From 915b9e7bd7cc8c8a6aa057a6fe084bc574aa1f91 Mon Sep 17 00:00:00 2001
From: reyesj2 <94730068+reyesj2@users.noreply.github.com>
Date: Fri, 5 Sep 2025 09:22:44 -0500
Subject: [PATCH 3/4] use logrotate

---
 salt/logrotate/defaults.yaml                    |  9 +++++++++
 salt/logrotate/soc_logrotate.yaml               |  7 +++++++
 .../tools/sbin_jinja/so-elastic-agent-monitor   | 17 ++---------------
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/salt/logrotate/defaults.yaml b/salt/logrotate/defaults.yaml
index 2f7247ff2..479b598f5 100644
--- a/salt/logrotate/defaults.yaml
+++ b/salt/logrotate/defaults.yaml
@@ -268,3 +268,12 @@ logrotate:
       - nocompress
       - create
       - sharedscripts
+    /opt/so/log/agents/agent-monitor*_x_log:
+      - daily
+      - rotate 14
+      - missingok
+      - compress
+      - create
+      - extension .log
+      - dateext
+      - dateyesterday
\ No newline at end of file
diff --git a/salt/logrotate/soc_logrotate.yaml b/salt/logrotate/soc_logrotate.yaml
index 56f879e4f..6f0272ef0 100644
--- a/salt/logrotate/soc_logrotate.yaml
+++ b/salt/logrotate/soc_logrotate.yaml
@@ -175,3 +175,10 @@ logrotate:
       multiline: True
       global: True
       forcedType: "[]string"
+    "/opt/so/log/agents/agent-monitor*_x_log":
+      description: List of logrotate options for this file.
+      title: /opt/so/log/agents/agent-monitor*.log
+      advanced: True
+      multiline: True
+      global: True
+      forcedType: "[]string"
diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
index 0f3bcac34..0b40925fd 100644
--- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
+++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
@@ -7,7 +7,7 @@
 set -euo pipefail
 
 LOG_DIR="/opt/so/log/agents"
-LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log"
+LOG_FILE="$LOG_DIR/agent-monitor.log"
 CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
 FLEET_API="http://localhost:5601/api/fleet/agents"
 {#- When using custom kquery ignore critical agents patterns. Since we want all the results of custom query logged #}
@@ -71,17 +71,6 @@ calculate_offline_hours() {
     echo $((diff / 3600))
 }
 
-cleanup_old_logs() {
-    # Find and delete log files older than 7 days
-    local old_files=$(find "$LOG_DIR" -name "agent-monitor-*.log" -type f -mtime +7 2>/dev/null)
-
-    if [ -n "$old_files" ]; then
-        local deleted_count=$(echo "$old_files" | wc -l)
-        echo "$old_files" | xargs rm -f
-        log_message "INFO" "Cleaned up $deleted_count old log files (>7 days)"
-    fi
-}
-
 main() {
     log_message "INFO" "Starting Fleet agent status check"
 
@@ -92,8 +81,6 @@ main() {
     else
         log_message "INFO" "No critical agents filter found, monitoring all agents"
     fi
-    
-    cleanup_old_logs
 
     log_message "INFO" "Querying Fleet API"
 
@@ -115,7 +102,8 @@
 
+        # FLEET_QUERY is built once before this loop (page=1); swap its trailing page parameter for the current page
         if ! response_body=$(curl -K "$CURL_CONFIG" \
             -s --fail \
-            $FLEET_QUERY \
+            "${FLEET_QUERY%&page=*}&page=${page}" \
             -H 'kbn-xsrf: true' 2>/dev/null); then
             log_message "ERROR" "Failed to query Fleet API (page $page)"
             exit 1

From 348f9dcaec75bcdbbdfa47dd7d2a86f4d73316df Mon Sep 17 00:00:00 2001
From: reyesj2 <94730068+reyesj2@users.noreply.github.com>
Date: Fri, 5 Sep 2025 10:01:24 -0500
Subject: [PATCH 4/4] prevent multiple script instances using file lock

---
 salt/manager/init.sls | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/salt/manager/init.sls b/salt/manager/init.sls
index 7daaeb8f2..f59c33652 100644
--- a/salt/manager/init.sls
+++ b/salt/manager/init.sls
@@ -153,7 +153,7 @@ so_fleetagent_monitor:
 {% else %}
   cron.absent:
 {% endif %}
-  - name: /usr/sbin/so-elastic-agent-monitor
+  - name: /bin/flock -n /opt/so/log/agents/agent-monitor.lock /usr/sbin/so-elastic-agent-monitor
   - identifier: so_fleetagent_monitor
   - user: root
   - minute: '*/{{ MANAGERMERGED.agent_monitoring.config.run_interval }}'