Merge pull request #15015 from Security-Onion-Solutions/vlb2

Vlb2
2026-01-23 16:33:29 +01:00 · 2025-09-10 14:58:41 -04:00
parent 77fef02116 e6eecc93c8
commit 03892bad5e
25 changed files with 882 additions and 63 deletions
--- a/salt/elasticfleet/defaults.yaml
+++ b/salt/elasticfleet/defaults.yaml
@@ -38,6 +38,7 @@ elasticfleet:
    - elasticsearch
    - endpoint
    - fleet_server
+    - filestream
    - http_endpoint
    - httpjson
    - log
--- a/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json
+++ b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json
@@ -0,0 +1,48 @@
+{
+  "package": {
+    "name": "filestream",
+    "version": ""
+  },
+  "name": "agent-monitor",
+  "namespace": "",
+  "description": "",
+  "policy_ids": [
+    "so-grid-nodes_general"
+  ],
+  "output_id": null,
+  "vars": {},
+  "inputs": {
+    "filestream-filestream": {
+      "enabled": true,
+      "streams": {
+        "filestream.generic": {
+          "enabled": true,
+          "vars": {
+            "paths": [
+              "/opt/so/log/agents/agent-monitor.log"
+            ],
+            "data_stream.dataset": "agent-monitor",
+            "pipeline": "elasticagent.monitor",
+            "parsers": "",
+            "exclude_files": [
+              "\\.gz$"
+            ],
+            "include_files": [],
+            "processors": "- decode_json_fields:\n    fields: [\"message\"]\n    target: \"\"\n- add_fields:\n    target: event\n    fields:\n        module: gridmetrics",
+            "tags": [],
+            "recursive_glob": true,
+            "ignore_older": "72h",
+            "clean_inactive": -1,
+            "harvester_limit": 0,
+            "fingerprint": true,
+            "fingerprint_offset": 0,
+            "fingerprint_length": 1024,
+            "file_identity_native": false,
+            "exclude_lines": [],
+            "include_lines": []
+          }
+        }
+      }
+    }
+  }
+}
--- a/salt/elasticsearch/defaults.yaml
+++ b/salt/elasticsearch/defaults.yaml
@@ -284,6 +284,86 @@ elasticsearch:
          hot:
            actions: {}
            min_age: 0ms
+    so-assistant-chat:
+      index_sorting: false
+      index_template:
+        composed_of:
+        - assistant-chat-mappings
+        - assistant-chat-settings
+        data_stream:
+          allow_custom_routing: false
+          hidden: false
+        ignore_missing_component_templates: []
+        index_patterns:
+        - so-assistant-chat*
+        priority: 501
+        template:
+          mappings:
+            date_detection: false
+            dynamic_templates:
+            - strings_as_keyword:
+                mapping:
+                  ignore_above: 1024
+                  type: keyword
+                match_mapping_type: string
+          settings:
+            index:
+              lifecycle:
+                name: so-assistant-chat-logs
+              mapping:
+                total_fields:
+                  limit: 1500
+              number_of_replicas: 0
+              number_of_shards: 1
+              refresh_interval: 1s
+              sort:
+                field: '@timestamp'
+                order: desc
+      policy:
+        phases:
+          hot:
+            actions: {}
+            min_age: 0ms
+    so-assistant-session:
+      index_sorting: false
+      index_template:
+        composed_of:
+        - assistant-session-mappings
+        - assistant-session-settings
+        data_stream:
+          allow_custom_routing: false
+          hidden: false
+        ignore_missing_component_templates: []
+        index_patterns:
+        - so-assistant-session*
+        priority: 501
+        template:
+          mappings:
+            date_detection: false
+            dynamic_templates:
+            - strings_as_keyword:
+                mapping:
+                  ignore_above: 1024
+                  type: keyword
+                match_mapping_type: string
+          settings:
+            index:
+              lifecycle:
+                name: so-assistant-session-logs
+              mapping:
+                total_fields:
+                  limit: 1500
+              number_of_replicas: 0
+              number_of_shards: 1
+              refresh_interval: 1s
+              sort:
+                field: '@timestamp'
+                order: desc
+      policy:
+        phases:
+          hot:
+            actions: {}
+            min_age: 0ms
    so-endgame:
      index_sorting: false
      index_template:
@@ -1243,6 +1323,70 @@ elasticsearch:
              set_priority:
                priority: 50
            min_age: 30d
+    so-logs-agent-monitor:
+      index_sorting: false
+      index_template:
+        composed_of:
+        - event-mappings
+        - so-elastic-agent-monitor
+        - so-fleet_integrations.ip_mappings-1
+        - so-fleet_globals-1
+        - so-fleet_agent_id_verification-1
+        data_stream:
+          allow_custom_routing: false
+          hidden: false
+        ignore_missing_component_templates:
+        - logs-agent-monitor@custom
+        index_patterns:
+        - logs-agent-monitor-*
+        priority: 501
+        template:
+          mappings:
+            _meta:
+              managed: true
+              managed_by: security_onion
+              package:
+                name: elastic_agent
+          settings:
+            index:
+              lifecycle:
+                name: so-logs-agent-monitor-logs
+              mapping:
+                total_fields:
+                  limit: 5000
+              number_of_replicas: 0
+              sort:
+                field: '@timestamp'
+                order: desc
+      policy:
+        _meta:
+          managed: true
+          managed_by: security_onion
+          package:
+            name: elastic_agent
+        phases:
+          cold:
+            actions:
+              set_priority:
+                priority: 0
+            min_age: 60d
+          delete:
+            actions:
+              delete: {}
+            min_age: 365d
+          hot:
+            actions:
+              rollover:
+                max_age: 30d
+                max_primary_shard_size: 50gb
+              set_priority:
+                priority: 100
+            min_age: 0ms
+          warm:
+            actions:
+              set_priority:
+                priority: 50
+            min_age: 30d
    so-logs-elastic_agent_x_apm_server:
      index_sorting: false
      index_template:
--- a/salt/elasticsearch/files/ingest/elasticagent.monitor
+++ b/salt/elasticsearch/files/ingest/elasticagent.monitor
@@ -0,0 +1,36 @@
+{
+  "processors": [
+    {
+      "set": {
+        "field": "event.dataset",
+        "value": "gridmetrics.agents",
+        "ignore_failure": true
+      }
+    },
+    {
+      "set": {
+        "field": "event.module",
+        "value": "gridmetrics",
+        "ignore_failure": true
+      }
+    },
+    {
+      "remove": {
+        "field": [
+          "host",
+          "elastic_agent",
+          "agent"
+        ],
+        "ignore_missing": true,
+        "ignore_failure": true
+      }
+    },
+    {
+      "json": {
+        "field": "message",
+        "add_to_root": true,
+        "ignore_failure": true
+      }
+    }
+  ]
+}
--- a/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json
+++ b/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json
@@ -0,0 +1,43 @@
+{
+  "template": {
+    "mappings": {
+      "properties": {
+        "agent": {
+          "type": "object",
+          "properties": {
+            "hostname": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "id": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "last_checkin_status": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "last_checkin": {
+              "type": "date"
+            },
+            "name": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "offline_duration_hours": {
+              "type": "integer"
+            },
+            "policy_id": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            },
+            "status": {
+              "ignore_above": 1024,
+              "type": "keyword"
+            }
+          }
+        }
+      }
+    }
+  }
+}
--- a/salt/elasticsearch/templates/component/so/assistant-chat-mappings.json
+++ b/salt/elasticsearch/templates/component/so/assistant-chat-mappings.json
@@ -0,0 +1,104 @@
+{
+	"template": {
+		"mappings": {
+			"properties": {
+				"@timestamp": {
+					"type": "date"
+				},
+				"so_kind": {
+					"ignore_above": 1024,
+					"type": "keyword"
+				},
+				"so_operation": {
+					"ignore_above": 1024,
+					"type": "keyword"
+				},
+				"so_chat": {
+					"properties": {
+						"role": {
+							"ignore_above": 1024,
+							"type": "keyword"
+						},
+						"content": {
+							"type": "object",
+							"enabled": false
+						},
+						"sessionId": {
+							"ignore_above": 1024,
+							"type": "keyword"
+						},
+						"createTime": {
+							"type": "date"
+						},
+						"deletedAt": {
+							"type": "date"
+						},
+						"tags": {
+							"ignore_above": 1024,
+							"type": "keyword"
+						},
+						"tool_use_id": {
+							"ignore_above": 1024,
+							"type": "keyword"
+						},
+						"userId": {
+							"ignore_above": 1024,
+							"type": "keyword"
+						},
+						"message": {
+							"properties": {
+								"id": {
+									"ignore_above": 1024,
+									"type": "keyword"
+								},
+								"type": {
+									"ignore_above": 1024,
+									"type": "keyword"
+								},
+								"role": {
+									"ignore_above": 1024,
+									"type": "keyword"
+								},
+								"model": {
+									"ignore_above": 1024,
+									"type": "keyword"
+								},
+								"contentStr": {
+									"type": "text"
+								},
+								"contentBlocks": {
+									"type": "nested",
+									"enabled": false
+								},
+								"stopReason": {
+									"ignore_above": 1024,
+									"type": "keyword"
+								},
+								"stopSequence": {
+									"ignore_above": 1024,
+									"type": "keyword"
+								},
+								"usage": {
+									"properties": {
+										"input_tokens": {
+											"type": "long"
+										},
+										"output_tokens": {
+											"type": "long"
+										},
+										"credits": {
+											"type": "long"
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+		}
+	},
+	"_meta": {
+		"ecs_version": "1.12.2"
+	}
+}
--- a/salt/elasticsearch/templates/component/so/assistant-chat-settings.json
+++ b/salt/elasticsearch/templates/component/so/assistant-chat-settings.json
@@ -0,0 +1,7 @@
+{
+	"template": {},
+	"version": 1,
+	"_meta": {
+		"description": "default settings for common Security Onion Assistant indices"
+	}
+}
--- a/salt/elasticsearch/templates/component/so/assistant-session-mappings.json
+++ b/salt/elasticsearch/templates/component/so/assistant-session-mappings.json
@@ -0,0 +1,44 @@
+{
+	"template": {
+		"mappings": {
+			"properties": {
+				"@timestamp": {
+					"type": "date"
+				},
+				"so_kind": {
+					"ignore_above": 1024,
+					"type": "keyword"
+				},
+				"so_session": {
+					"properties": {
+						"title": {
+							"ignore_above": 1024,
+							"type": "keyword"
+						},
+						"sessionId": {
+							"ignore_above": 1024,
+							"type": "keyword"
+						},
+						"createTime": {
+							"type": "date"
+						},
+						"deleteTime": {
+							"type": "date"
+						},
+						"tags": {
+							"ignore_above": 1024,
+							"type": "keyword"
+						},
+						"userId": {
+							"ignore_above": 1024,
+							"type": "keyword"
+						}
+					}
+				}
+			}
+		}
+	},
+	"_meta": {
+		"ecs_version": "1.12.2"
+	}
+}
--- a/salt/elasticsearch/templates/component/so/assistant-session-settings.json
+++ b/salt/elasticsearch/templates/component/so/assistant-session-settings.json
@@ -0,0 +1,7 @@
+{
+	"template": {},
+	"version": 1,
+	"_meta": {
+		"description": "default settings for common Security Onion Assistant indices"
+	}
+}
--- a/salt/firewall/defaults.yaml
+++ b/salt/firewall/defaults.yaml
@@ -1230,6 +1230,10 @@ firewall:
              portgroups:
                - elasticsearch_node
                - elasticsearch_rest
+            managerhype:
+              portgroups:
+                - elasticsearch_node
+                - elasticsearch_rest
            standalone:
              portgroups:
                - elasticsearch_node
@@ -1377,6 +1381,10 @@ firewall:
              portgroups:
                - elasticsearch_node
                - elasticsearch_rest
+            managerhype:
+              portgroups:
+                - elasticsearch_node
+                - elasticsearch_rest
            standalone:
              portgroups:
                - elasticsearch_node
@@ -1579,6 +1587,9 @@ firewall:
              portgroups:
                - redis
                - elastic_agent_data
+            managerhype:
+              portgroups:
+                - elastic_agent_data
            self:
              portgroups:
                - redis
@@ -1696,6 +1707,9 @@ firewall:
            managersearch:
              portgroups:
                - openssh
+            managerhype:
+              portgroups:
+                - openssh
            standalone:
              portgroups:
                - openssh
@@ -1758,6 +1772,8 @@ firewall:
              portgroups: []
            managersearch:
              portgroups: []
+            managerhype:
+              portgroups: []
            standalone:
              portgroups: []
            customhostgroup0:
--- a/salt/firewall/map.jinja
+++ b/salt/firewall/map.jinja
@@ -25,7 +25,7 @@
 {%   set KAFKA_EXTERNAL_ACCESS = salt['pillar.get']('kafka:config:external_access:enabled', default=False) %}
 {%   set kafka_node_type = salt['pillar.get']('kafka:nodes:'+ GLOBALS.hostname + ':role') %}

-{%   if role in ['manager', 'managersearch', 'standalone'] %}
+{%   if role.startswith('manager') or role == 'standalone' %}
 {%     do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[role].portgroups.append('kafka_controller') %}
 {%     do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups.receiver.portgroups.append('kafka_controller') %}
 {%   endif %}
@@ -38,8 +38,8 @@
 {%     do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups.receiver.portgroups.append('kafka_controller') %}
 {%   endif %}

-{%   if role in ['manager', 'managersearch', 'standalone', 'receiver'] %}
-{%     for r in ['manager', 'managersearch', 'standalone', 'receiver', 'fleet', 'idh', 'sensor', 'searchnode','heavynode', 'elastic_agent_endpoint', 'desktop'] %}
+{%   if role.startswith('manager') or role in ['standalone', 'receiver'] %}
+{%     for r in ['manager', 'managersearch', 'managerhype', 'standalone', 'receiver', 'fleet', 'idh', 'sensor', 'searchnode','heavynode', 'elastic_agent_endpoint', 'desktop'] %}
 {%       if FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[r] is defined %}
 {%         do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[r].portgroups.append('kafka_data') %}
 {%       endif %}
@@ -48,11 +48,11 @@

 {%   if KAFKA_EXTERNAL_ACCESS %}
 {#     Kafka external access only applies for Kafka nodes with the broker role. #}
-{%     if role in ['manager', 'managersearch', 'standalone', 'receiver'] and 'broker' in kafka_node_type %}
+{%     if (role.startswith('manager') or role in ['standalone', 'receiver']) and 'broker' in kafka_node_type %}
 {%       do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups.external_kafka.portgroups.append('kafka_external_access') %}
 {%     endif %}
 {%   endif %}

 {% endif %}

-{% set FIREWALL_MERGED = salt['pillar.get']('firewall', FIREWALL_DEFAULT.firewall, merge=True) %}
+{% set FIREWALL_MERGED = salt['pillar.get']('firewall', FIREWALL_DEFAULT.firewall, merge=True) %}
--- a/salt/logrotate/defaults.yaml
+++ b/salt/logrotate/defaults.yaml
@@ -268,3 +268,12 @@ logrotate:
      - nocompress
      - create
      - sharedscripts
+    /opt/so/log/agents/agent-monitor*_x_log:
+      - daily
+      - rotate 14
+      - missingok
+      - compress
+      - create
+      - extension .log
+      - dateext
+      - dateyesterday
--- a/salt/logrotate/soc_logrotate.yaml
+++ b/salt/logrotate/soc_logrotate.yaml
@@ -175,3 +175,10 @@ logrotate:
      multiline: True
      global: True
      forcedType: "[]string"
+    "/opt/so/log/agents/agent-monitor*_x_log":
+      description: List of logrotate options for this file.
+      title: /opt/so/log/agents/agent-monitor*.log
+      advanced: True
+      multiline: True
+      global: True
+      forcedType: "[]string"
--- a/salt/logstash/map.jinja
+++ b/salt/logstash/map.jinja
@@ -17,7 +17,7 @@

 {% for node_type, node_details in redis_node_data.items() | sort %}
 {%   if GLOBALS.role in ['so-searchnode', 'so-standalone', 'so-managersearch', 'so-fleet'] %}
-{%     if node_type in ['manager', 'managersearch', 'standalone', 'receiver' ] %}
+{%     if node_type.startswith('manager') or node_type in ['standalone', 'receiver'] %}
 {%       for hostname in redis_node_data[node_type].keys() %}
 {%         do LOGSTASH_REDIS_NODES.append({hostname:node_details[hostname].ip}) %}
 {%       endfor %}
@@ -47,7 +47,7 @@
 {%   endif %}
 {# Disable logstash on manager & receiver nodes unless it has an override configured #}
 {%   if not KAFKA_LOGSTASH %}
-{%     if GLOBALS.role in ['so-manager', 'so-receiver'] and GLOBALS.hostname not in KAFKA_LOGSTASH %}
+{%     if GLOBALS.role in ['so-manager', 'so-managerhype', 'so-receiver'] and GLOBALS.hostname not in KAFKA_LOGSTASH %}
 {%       do LOGSTASH_MERGED.update({'enabled': False}) %}
 {%     endif %}
 {%   endif %}
--- a/salt/manager/defaults.yaml
+++ b/salt/manager/defaults.yaml
@@ -5,3 +5,12 @@ manager:
    minute: 0
  additionalCA: ''
  insecureSkipVerify: False
+  agent_monitoring:
+    enabled: False
+    config:
+      critical_agents: []
+      custom_kquery:
+      offline_threshold: 5
+      realert_threshold: 5
+      page_size: 250
+      run_interval: 5
--- a/salt/manager/init.sls
+++ b/salt/manager/init.sls
@@ -34,6 +34,26 @@ agents_log_dir:
      - user
      - group

+agents_conf_dir:
+  file.directory:
+    - name: /opt/so/conf/agents
+    - user: root
+    - group: root
+    - recurse:
+      - user
+      - group
+
+{% if MANAGERMERGED.agent_monitoring.config.critical_agents | length > 0 %}
+critical_agents_patterns:
+  file.managed:
+    - name: /opt/so/conf/agents/critical-agents.txt
+    - contents: {{ MANAGERMERGED.agent_monitoring.config.critical_agents }}
+{% else %}
+remove_critical_agents_config:
+  file.absent:
+    - name: /opt/so/conf/agents/critical-agents.txt
+{% endif %}
+
 yara_log_dir:
  file.directory:
    - name: /opt/so/log/yarasync
@@ -127,6 +147,21 @@ so_fleetagent_status:
    - month: '*'
    - dayweek: '*'

+# Install the agent monitor cron job when agent_monitoring is enabled, otherwise
+# make sure it is absent. The job runs the monitor script under flock -n.
+so_fleetagent_monitor:
+  cron.{{ 'present' if MANAGERMERGED.agent_monitoring.enabled else 'absent' }}:
+    - name: /bin/flock -n /opt/so/log/agents/agent-monitor.lock /usr/sbin/so-elastic-agent-monitor
+    - identifier: so_fleetagent_monitor
+    - user: root
+    - minute: '*/{{ MANAGERMERGED.agent_monitoring.config.run_interval }}'
+    - hour: '*'
+    - daymonth: '*'
+    - month: '*'
+    - dayweek: '*'
+
+
 socore_own_saltstack_default:
  file.directory:
    - name: /opt/so/saltstack/default
--- a/salt/manager/soc_manager.yaml
+++ b/salt/manager/soc_manager.yaml
@@ -37,3 +37,44 @@ manager:
    forcedType: bool
    global: True
    helpLink: proxy.html
+  agent_monitoring:
+    enabled:
+      description: Enable monitoring elastic agents for health issues. Can be used to trigger an alert when a 'critical' agent hasn't checked in with fleet for longer than the configured offline threshold.
+      global: True
+      helpLink: elastic-fleet.html
+      forcedType: bool
+    config:
+      critical_agents:
+        description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold.
+        global: True
+        multiline: True
+        helpLink: elastic-fleet.html
+        forcedType: "[]string"
+      custom_kquery:
+        description: For more granular control over what agents to monitor for offline|degraded status add a kquery here. It is recommended to create & test within Elastic Fleet first to ensure your agents are targeted correctly using the query. eg 'status:offline AND tags:INFRA'
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: string
+        advanced: True
+      offline_threshold:
+        description: The maximum allowed time in hours a 'critical' agent has been offline before being logged.
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: int
+      realert_threshold:
+        description: The time in hours to pass before another alert for an offline agent exceeding the offline_threshold is generated.
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: int
+      page_size:
+        description: The amount of agents that can be processed per API request to fleet.
+        global: True
+        helpLink: elastic-fleet.html
+        forcedType: int
+        advanced: True
+      run_interval:
+        description: The time in minutes between checking fleet agent statuses.
+        global: True
+        advanced: True
+        helpLink: elastic-fleet.html
+        forcedType: int
--- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
+++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
@@ -0,0 +1,254 @@
+{%- from 'manager/map.jinja' import MANAGERMERGED -%}
+{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%}
+{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%}
+{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%}
+{%- set REALERT_THRESHOLD = MANAGERMERGED.agent_monitoring.config.realert_threshold -%}
+#!/bin/bash
+
+set -euo pipefail
+
+LOG_DIR="/opt/so/log/agents"
+LOG_FILE="$LOG_DIR/agent-monitor.log"
+CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
+FLEET_API="http://localhost:5601/api/fleet/agents"
+{#- When using custom kquery ignore critical agents patterns. Since we want all the results of custom query logged #}
+{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+CRITICAL_AGENTS_FILE="/dev/null"
+{%- else %}
+CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
+{%- endif %}
+OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
+REALERT_THRESHOLD={{ REALERT_THRESHOLD }}
+PAGE_SIZE="{{ PAGE_SIZE }}"
+
+log_message() {
+    local level="$1"
+    local message="$2"
+    echo "$(date -u +"%Y-%m-%dT%H:%M:%SZ") [$level] $message" >&2
+}
+
+# Decide whether a hostname is covered by the critical-agent pattern file.
+#   $1 - agent hostname (compared case-insensitively)
+#   $2 - path to the pattern file, one pattern per line
+# Returns 0 on a match, or when the file is missing, empty or not a regular
+# file (no filter configured, so every agent counts). Returns 1 otherwise.
+matches_critical_pattern() {
+    local hostname="$1"
+    local pattern_file="$2"
+    
+    # If critical agents file doesn't exist or is empty, match all
+    if [ ! -f "$pattern_file" ] || [ ! -s "$pattern_file" ]; then
+        return 0
+    fi
+    
+    local hostname_lower=$(echo "$hostname" | tr '[:upper:]' '[:lower:]')
+    
+    # "|| [ -n ... ]" keeps a final line that has no trailing newline
+    while IFS= read -r pattern || [ -n "$pattern" ]; do
+        # Skip empty lines and lines whose first non-blank character is '#'
+        [[ -z "$pattern" || "$pattern" =~ ^[[:space:]]*# ]] && continue
+        
+        # Trim surrounding whitespace
+        pattern=$(echo "$pattern" | xargs)
+        
+        local pattern_lower=$(echo "$pattern" | tr '[:upper:]' '[:lower:]')
+        
+        # Turn every * into the regex .* ; the result is used as a regular
+        # expression below, so other regex metacharacters in the pattern
+        # (for example '.') keep their regex meaning
+        local bash_pattern="${pattern_lower//\*/.*}"
+        
+        # Anchored regex match: the whole hostname must match the pattern
+        if [[ "$hostname_lower" =~ ^${bash_pattern}$ ]]; then
+            return 0
+        fi
+    done < "$pattern_file"
+    
+    return 1
+}
+
+calculate_offline_hours() {
+    local last_checkin="$1"
+    local current_time=$(date +%s)
+    local checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null || echo "0")
+    
+    if [ "$checkin_time" -eq "0" ]; then
+        echo "0"
+        return
+    fi
+    
+    local diff=$((current_time - checkin_time))
+    echo $((diff / 3600))
+}
+
+check_recent_log_entries() {
+    local agent_hostname="$1"
+
+    if [ ! -f "$LOG_FILE" ]; then
+        return 1
+    fi
+
+    local current_time=$(date +%s)
+    local threshold_seconds=$((REALERT_THRESHOLD * 3600))
+    local agent_hostname_lower=$(echo "$agent_hostname" | tr '[:upper:]' '[:lower:]')
+    local most_recent_timestamp=""
+
+    while IFS= read -r line; do
+        [ -z "$line" ] && continue
+
+        local logged_hostname=$(echo "$line" | jq -r '.["agent.hostname"] // empty' 2>/dev/null)
+        local logged_timestamp=$(echo "$line" | jq -r '.["@timestamp"] // empty' 2>/dev/null)
+
+        [ -z "$logged_hostname" ] || [ -z "$logged_timestamp" ] && continue
+
+        local logged_hostname_lower=$(echo "$logged_hostname" | tr '[:upper:]' '[:lower:]')
+
+        if [ "$logged_hostname_lower" = "$agent_hostname_lower" ]; then
+            most_recent_timestamp="$logged_timestamp"
+        fi
+    done < <(tail -n 1000 "$LOG_FILE" 2>/dev/null)
+
+    # If there is agent entry (within last 1000), check the time difference
+    if [ -n "$most_recent_timestamp" ]; then
+        local logged_time=$(date -d "$most_recent_timestamp" +%s 2>/dev/null || echo "0")
+
+        if [ "$logged_time" -ne "0" ]; then
+            local time_diff=$((current_time - logged_time))
+            local hours_diff=$((time_diff / 3600))
+
+            # Skip if last agent timestamp was more recent than realert threshold
+            if ((hours_diff < REALERT_THRESHOLD)); then
+                return 0
+            fi
+        fi
+    fi
+
+    # Agent has not been logged within realert threshold
+    return 1
+}
+
+# Query the Fleet agents API page by page and append one JSON line to
+# $LOG_FILE for every offline or degraded agent that matches the critical
+# agent patterns, has reached $OFFLINE_THRESHOLD_HOURS, and was not already
+# logged within the last $REALERT_THRESHOLD hours.
+# Exits with status 1 if a Fleet API request fails.
+main() {
+    log_message "INFO" "Starting Fleet agent status check"
+
+    # Check if critical agents file is configured
+    if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
+        log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
+        log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')"
+    else
+        log_message "INFO" "No critical agents filter found, monitoring all agents"
+    fi
+
+    log_message "INFO" "Querying Fleet API"
+
+    local page=1
+    local total_agents=0
+    local processed_agents=0
+    local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+
+    # FLEET_QUERY holds everything except the page number. The page is appended
+    # per request inside the loop so each iteration fetches the next page.
+    {%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+    log_message "INFO" "Using custom kquery: {{ CUSTOM_KQUERY }}"
+    FLEET_QUERY="${FLEET_API}?kuery={{ CUSTOM_KQUERY | urlencode }}&perPage=${PAGE_SIZE}"
+    {%- else %}
+    log_message "INFO" "Using default query (all offline or degraded agents)"
+    FLEET_QUERY="${FLEET_API}?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=${PAGE_SIZE}"
+    {%- endif %}
+
+    while true; do
+        log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"
+
+        if ! response_body=$(curl -K "$CURL_CONFIG" \
+            -s --fail \
+            "${FLEET_QUERY}&page=${page}" \
+            -H 'kbn-xsrf: true' 2>/dev/null); then
+            log_message "ERROR" "Failed to query Fleet API (page $page)"
+            exit 1
+        fi
+
+        # pagination info
+        current_total=$(echo "$response_body" | jq -r '.total // 0')
+        current_page=$(echo "$response_body" | jq -r '.page // 1')
+        agents_in_page=$(echo "$response_body" | jq -r '.list | length')
+
+        # Update total
+        if [ "$page" -eq 1 ]; then
+            total_agents="$current_total"
+            log_message "INFO" "Found $total_agents total agents across all pages"
+        fi
+
+        log_message "INFO" "Processing page $current_page with $agents_in_page agents"
+
+        # Process agents from current page
+        mapfile -t agents < <(echo "$response_body" | jq -c '.list[]')
+
+        for agent in "${agents[@]}"; do
+            # Grab agent details
+            agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
+            agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
+            agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
+            agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
+            last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
+            last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
+            policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
+
+            # Only log agents that are offline or degraded (skip inactive agents)
+            # Fleetserver agents can show multiple versions as 'inactive'
+            if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
+                # Check if agent matches critical agent patterns (if configured)
+                if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
+                    log_message "WARN" "${agent_hostname^^} is ${agent_status^^}, but does not match configured critical agents patterns. Not logging ${agent_status^^} agent"
+                    continue  # Skip this agent if it doesn't match any critical agent pattern
+                fi
+
+                offline_hours=$(calculate_offline_hours "$last_checkin")
+
+                if [ "$offline_hours" -lt "$OFFLINE_THRESHOLD_HOURS" ]; then
+                    log_message "INFO" "${agent_hostname^^} has been offline for ${offline_hours}h (threshold: ${OFFLINE_THRESHOLD_HOURS}h). Not logging ${agent_status^^} agent until it reaches threshold"
+                    continue
+                fi
+
+                # Check if this agent was already logged within the realert_threshold
+                if check_recent_log_entries "$agent_hostname"; then
+                    log_message "INFO" "Skipping $agent_hostname (status: $agent_status) - already logged within last ${REALERT_THRESHOLD}h"
+                    continue
+                fi
+
+                # Build the record with jq --arg so values are JSON-escaped
+                log_entry=$(echo 'null' | jq -c \
+                    --arg ts "$current_timestamp" \
+                    --arg id "$agent_id" \
+                    --arg hostname "$agent_hostname" \
+                    --arg name "$agent_name" \
+                    --arg status "$agent_status" \
+                    --arg last_checkin "$last_checkin" \
+                    --arg last_checkin_status "$last_checkin_status" \
+                    --arg policy_id "$policy_id" \
+                    --arg offline_hours "$offline_hours" \
+                    '{
+                        "@timestamp": $ts,
+                        "agent.id": $id,
+                        "agent.hostname": $hostname,
+                        "agent.name": $name,
+                        "agent.status": $status,
+                        "agent.last_checkin": $last_checkin,
+                        "agent.last_checkin_status": $last_checkin_status,
+                        "agent.policy_id": $policy_id,
+                        "agent.offline_duration_hours": ($offline_hours | tonumber)
+                    }')
+
+                echo "$log_entry" >> "$LOG_FILE"
+
+                log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
+            fi
+        done
+
+        processed_agents=$((processed_agents + agents_in_page))
+
+        # Stop on an empty page or once every reported agent has been seen
+        if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then
+            log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents"
+            break
+        fi
+
+        page=$((page + 1))
+
+        # Limit pagination loops incase of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
+        if [ "$page" -gt 100 ]; then
+            log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
+            break
+        fi
+    done
+
+    log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
+}
+
+main "$@"
--- a/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset
+++ b/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset
@@ -15,6 +15,7 @@ require_manager
 echo
 echo "This script will remove the current Elastic Fleet install and all of its data and then rerun Elastic Fleet setup."
 echo "Deployed Elastic Agents will no longer be enrolled and will need to be reinstalled."
+echo "Only the Elastic Fleet instance on the Manager will be reinstalled - dedicated Fleet node config will be removed and will need to be reinstalled."
 echo "This script should only be used as a last resort to reinstall Elastic Fleet." 
 echo
 echo "If you would like to proceed, then type AGREE and press ENTER."
--- a/salt/nginx/etc/nginx.conf
+++ b/salt/nginx/etc/nginx.conf
@@ -196,19 +196,23 @@ http {
 		}

 		location / {
-			auth_request          /auth/sessions/whoami;
-			auth_request_set      $userid $upstream_http_x_kratos_authenticated_identity_id;
-			proxy_set_header      x-user-id $userid;
-			proxy_pass            http://{{ GLOBALS.manager }}:9822/;
-			proxy_read_timeout    300;
-			proxy_connect_timeout 300;
-			proxy_set_header      Host $host;
-			proxy_set_header      X-Real-IP $remote_addr;
-			proxy_set_header      X-Forwarded-For $proxy_add_x_forwarded_for;
-			proxy_set_header      Proxy "";
-			proxy_set_header      Upgrade $http_upgrade;
-			proxy_set_header      Connection "Upgrade";
-			proxy_set_header      X-Forwarded-Proto $scheme;
+			auth_request            /auth/sessions/whoami;
+			auth_request_set        $userid $upstream_http_x_kratos_authenticated_identity_id;
+			proxy_set_header        x-user-id $userid;
+			proxy_pass              http://{{ GLOBALS.manager }}:9822/;
+			proxy_read_timeout      300;
+			proxy_connect_timeout   300;
+			proxy_set_header        Host $host;
+			proxy_set_header        X-Real-IP $remote_addr;
+			proxy_set_header        X-Forwarded-For $proxy_add_x_forwarded_for;
+			proxy_set_header        Proxy "";
+			proxy_set_header        Upgrade $http_upgrade;
+			proxy_set_header        Connection "Upgrade";
+			proxy_set_header        X-Forwarded-Proto $scheme;
+
+			proxy_buffering         off;  # relay upstream responses to the client as they arrive instead of buffering them
+			proxy_cache             off;  # never serve this location from the proxy cache
+			proxy_request_buffering off;  # forward the request body upstream as it is received rather than reading it fully first
 		}

 		location ~ ^/auth/.*?(login|oidc/callback) {
--- a/salt/repo/client/map.jinja
+++ b/salt/repo/client/map.jinja
@@ -26,9 +26,9 @@
         'rocky-devel.repo',
         'rocky-extras.repo',
         'rocky.repo',
-         'oracle-linux-ol9',
-         'uek-ol9',
-         'virt-oll9'
+         'oracle-linux-ol9.repo',
+         'uek-ol9.repo',
+         'virt-ol9.repo'
       ]
  %}
 {%     else %}
--- a/salt/sensoroni/config.sls
+++ b/salt/sensoroni/config.sls
@@ -18,6 +18,7 @@ sensoroniagentconf:
    - group: 939
    - mode: 600
    - template: jinja
+    - show_changes: False

 analyzersdir:
  file.directory:
--- a/salt/soc/defaults.yaml
+++ b/salt/soc/defaults.yaml
@@ -1491,6 +1491,8 @@ soc:
              - repo: file:///nsm/airgap-resources/playbooks/securityonion-resources-playbooks
                branch: main
                folder: securityonion-normalized
+        assistant:
+          apiUrl: https://onionai.securityonion.net  # URL of the AI gateway
        salt:
          queueDir: /opt/sensoroni/queue
          timeoutMs: 45000
@@ -2541,3 +2543,12 @@ soc:
                          - ' -priv'
                  condition: all of selection_*
              level: 'high'  # info | low | medium | high | critical
+        assistant:
+          enabled: false  # Onion AI assistant in SOC is disabled by default
+          investigationPrompt: Investigate Alert ID {socid}  # prompt given to Onion AI when beginning an investigation
+          contextLimitSmall: 200000  # smaller context limit for Onion AI
+          contextLimitLarge: 1000000  # larger context limit for Onion AI
+          thresholdColorRatioLow: 0.5  # lower visual context color change threshold
+          thresholdColorRatioMed: 0.75  # middle visual context color change threshold
+          thresholdColorRatioMax: 1  # max visual context color change threshold
+          lowBalanceColorAlert: 500000  # Onion AI credit amount at which the balance turns red
--- a/salt/soc/soc_soc.yaml
+++ b/salt/soc/soc_soc.yaml
@@ -580,7 +580,43 @@ soc:
              - field: folder
                label: Folder
            airgap: *pbRepos
+        assistant:
+          apiUrl:
+            description: The URL of the AI gateway.
+            advanced: True
+            global: True
      client:
+        assistant:
+          enabled:
+            description: Set to true to enable the Onion AI assistant in SOC.
+            global: True
+          investigationPrompt:
+            description: Prompt given to Onion AI when beginning an investigation.
+            global: True
+          contextLimitSmall:
+            description: Smaller context limit for Onion AI.
+            global: True
+            advanced: True
+          contextLimitLarge:
+            description: Larger context limit for Onion AI.
+            global: True
+            advanced: True
+          thresholdColorRatioLow:
+            description: Lower visual context color change threshold.
+            global: True
+            advanced: True
+          thresholdColorRatioMed:
+            description: Middle visual context color change threshold.
+            global: True
+            advanced: True
+          thresholdColorRatioMax:
+            description: Max visual context color change threshold.
+            global: True
+            advanced: True
+          lowBalanceColorAlert:
+            description: Onion AI credit amount at which balance turns red.
+            global: True
+            advanced: True
        apiTimeoutMs:
          description: Duration (in milliseconds) to wait for a response from the SOC server API before giving up and showing an error on the SOC UI.
          global: True
--- a/setup/so-functions
+++ b/setup/so-functions
@@ -29,46 +29,8 @@ title() {
 }

 fail_setup() {
-	local failure_reason="${1:-Unknown failure}"
-	
-	# Capture call stack information
-	local calling_function="${FUNCNAME[1]:-main}"
-	local calling_line="${BASH_LINENO[0]:-unknown}"
-	local calling_file="${BASH_SOURCE[1]:-unknown}"
-	
-	# Build call stack trace
-	local call_stack=""
-	local i=1
-	while [[ $i -lt ${#FUNCNAME[@]} ]]; do
-		local func="${FUNCNAME[$i]}"
-		local file="${BASH_SOURCE[$i]##*/}"  # Get basename only
-		local line="${BASH_LINENO[$((i-1))]}"
-		
-		if [[ -n "$call_stack" ]]; then
-			call_stack="$call_stack -> "
-		fi
-		call_stack="$call_stack$func($file:$line)"
-		((i++))
-	done
-	
-	# Enhanced error logging with call stack
-	error "FAILURE: Called from $calling_function() at line $calling_line"
-	error "REASON: $failure_reason"
-	error "STACK: $call_stack"
-	error "Setup encountered an unrecoverable failure: $failure_reason"
-	
-	# Create detailed failure file with enhanced information
-	{
-		echo "SETUP_FAILURE_TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M:%S UTC')"
-		echo "SETUP_FAILURE_REASON=$failure_reason"
-		echo "SETUP_CALLING_FUNCTION=$calling_function"
-		echo "SETUP_CALLING_LINE=$calling_line"
-		echo "SETUP_CALLING_FILE=${calling_file##*/}"
-		echo "SETUP_CALL_STACK=$call_stack"
-		echo "SETUP_LOG_LOCATION=$setup_log"
-		echo "SETUP_FAILURE_DETAILS=Check $setup_log for complete error details"
-	} > /root/failure
-	
+	error "Setup encountered an unrecoverable failure, exiting"
+	touch /root/failure  # leave an empty /root/failure marker file (its consumers are not visible here — confirm)
 	exit 1
 }