From e26310d1727807f245e2d94f3caef7689149665b Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Tue, 2 Sep 2025 17:00:03 -0500 Subject: [PATCH 01/27] elastic agent offline alerter Signed-off-by: reyesj2 <94730068+reyesj2@users.noreply.github.com> --- .../elastic-agent-monitor.json | 48 +++++ salt/elasticsearch/defaults.yaml | 64 ++++++ .../so-elastic-agent-monitor.json | 43 ++++ salt/manager/defaults.yaml | 7 + salt/manager/init.sls | 35 ++++ salt/manager/soc_manager.yaml | 30 +++ .../tools/sbin_jinja/so-elastic-agent-monitor | 193 ++++++++++++++++++ 7 files changed, 420 insertions(+) create mode 100644 salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json create mode 100644 salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json create mode 100644 salt/manager/tools/sbin_jinja/so-elastic-agent-monitor diff --git a/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json new file mode 100644 index 000000000..a7d425b39 --- /dev/null +++ b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json @@ -0,0 +1,48 @@ +{ + "package": { + "name": "filestream", + "version": "" + }, + "name": "agent-monitor", + "namespace": "", + "description": "", + "policy_ids": [ + "so-grid-nodes_general" + ], + "output_id": null, + "vars": {}, + "inputs": { + "filestream-filestream": { + "enabled": true, + "streams": { + "filestream.generic": { + "enabled": true, + "vars": { + "paths": [ + "/opt/so/log/agents/agent-monitor-*.log" + ], + "data_stream.dataset": "agent-monitor", + "pipeline": "elasticagent.monitor", + "parsers": "", + "exclude_files": [ + "\\.gz$" + ], + "include_files": [], + "processors": "- decode_json_fields:\n fields: [\"message\"]\n target: \"\"\n- add_fields:\n target: event\n fields:\n module: gridmetrics", + "tags": [], + 
"recursive_glob": true, + "ignore_older": "72h", + "clean_inactive": -1, + "harvester_limit": 0, + "fingerprint": true, + "fingerprint_offset": 0, + "fingerprint_length": 1024, + "file_identity_native": false, + "exclude_lines": [], + "include_lines": [] + } + } + } + } + } +} diff --git a/salt/elasticsearch/defaults.yaml b/salt/elasticsearch/defaults.yaml index e08978e0d..7e3078ccf 100644 --- a/salt/elasticsearch/defaults.yaml +++ b/salt/elasticsearch/defaults.yaml @@ -1243,6 +1243,70 @@ elasticsearch: set_priority: priority: 50 min_age: 30d + so-logs-agent-monitor: + index_sorting: false + index_template: + composed_of: + - event-mappings + - so-elastic-agent-monitor + - so-fleet_integrations.ip_mappings-1 + - so-fleet_globals-1 + - so-fleet_agent_id_verification-1 + data_stream: + allow_custom_routing: false + hidden: false + ignore_missing_component_templates: + - logs-agent-monitor@custom + index_patterns: + - logs-agent-monitor-* + priority: 501 + template: + mappings: + _meta: + managed: true + managed_by: security_onion + package: + name: elastic_agent + settings: + index: + lifecycle: + name: so-logs-agent-monitor-logs + mapping: + total_fields: + limit: 5000 + number_of_replicas: 0 + sort: + field: '@timestamp' + order: desc + policy: + _meta: + managed: true + managed_by: security_onion + package: + name: elastic_agent + phases: + cold: + actions: + set_priority: + priority: 0 + min_age: 60d + delete: + actions: + delete: {} + min_age: 365d + hot: + actions: + rollover: + max_age: 30d + max_primary_shard_size: 50gb + set_priority: + priority: 100 + min_age: 0ms + warm: + actions: + set_priority: + priority: 50 + min_age: 30d so-logs-elastic_agent_x_apm_server: index_sorting: false index_template: diff --git a/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json b/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json new file mode 100644 index 000000000..50440fbed --- /dev/null +++ 
b/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json @@ -0,0 +1,43 @@ +{ + "template": { + "mappings": { + "properties": { + "agent": { + "type": "object", + "properties": { + "hostname": { + "ignore_above": 1024, + "type": "keyword" + }, + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "last_checkin_status": { + "ignore_above": 1024, + "type": "keyword" + }, + "last_checkin": { + "type": "date" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "offline_duration_hours": { + "type": "integer" + }, + "policy_id": { + "ignore_above": 1024, + "type": "keyword" + }, + "status": { + "ignore_above": 1024, + "type": "keyword" + } + } + } + } + } + } +} \ No newline at end of file diff --git a/salt/manager/defaults.yaml b/salt/manager/defaults.yaml index 708900af6..65247d8ff 100644 --- a/salt/manager/defaults.yaml +++ b/salt/manager/defaults.yaml @@ -5,3 +5,10 @@ manager: minute: 0 additionalCA: '' insecureSkipVerify: False + agent_monitoring: + enabled: False + config: + critical_agents: [] + offline_threshold: 5 + page_size: 250 + run_interval: 5 diff --git a/salt/manager/init.sls b/salt/manager/init.sls index 737d753f4..7daaeb8f2 100644 --- a/salt/manager/init.sls +++ b/salt/manager/init.sls @@ -34,6 +34,26 @@ agents_log_dir: - user - group +agents_conf_dir: + file.directory: + - name: /opt/so/conf/agents + - user: root + - group: root + - recurse: + - user + - group + +{% if MANAGERMERGED.agent_monitoring.config.critical_agents | length > 0 %} +critical_agents_patterns: + file.managed: + - name: /opt/so/conf/agents/critical-agents.txt + - contents: {{ MANAGERMERGED.agent_monitoring.config.critical_agents }} +{% else %} +remove_critical_agents_config: + file.absent: + - name: /opt/so/conf/agents/critical-agents.txt +{% endif %} + yara_log_dir: file.directory: - name: /opt/so/log/yarasync @@ -127,6 +147,21 @@ so_fleetagent_status: - month: '*' - dayweek: '*' +so_fleetagent_monitor: +{% if 
MANAGERMERGED.agent_monitoring.enabled %} + cron.present: +{% else %} + cron.absent: +{% endif %} + - name: /usr/sbin/so-elastic-agent-monitor + - identifier: so_fleetagent_monitor + - user: root + - minute: '*/{{ MANAGERMERGED.agent_monitoring.config.run_interval }}' + - hour: '*' + - daymonth: '*' + - month: '*' + - dayweek: '*' + socore_own_saltstack_default: file.directory: - name: /opt/so/saltstack/default diff --git a/salt/manager/soc_manager.yaml b/salt/manager/soc_manager.yaml index cf78658de..f69f3f42a 100644 --- a/salt/manager/soc_manager.yaml +++ b/salt/manager/soc_manager.yaml @@ -37,3 +37,33 @@ manager: forcedType: bool global: True helpLink: proxy.html + agent_monitoring: + enabled: + description: Enable monitoring elastic agents for health issues. Can be used to trigger an alert when a 'critical' agent hasn't checked in with fleet for longer than the configured offline threshold. + global: True + helpLink: elastic-fleet.html + forcedType: bool + config: + critical_agents: + description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold + global: True + multiline: True + helpLink: elastic-fleet.html + forcedType: "[]string" + offline_threshold: + description: The maximum allowed time in hours a 'critical' agent has been offline before being logged. + global: True + helpLink: elastic-fleet.html + forcedType: int + page_size: + description: The amount of agents that can be processed per API request to fleet. + global: True + helpLink: elastic-fleet.html + forcedType: int + advanced: True + run_interval: + description: The time in minutes between checking fleet agent statuses. 
+ global: True + advanced: True + helpLink: elastic-fleet.html + forcedType: int diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor new file mode 100644 index 000000000..572d4de4d --- /dev/null +++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor @@ -0,0 +1,193 @@ +#!/bin/bash + +{% from 'manager/map.jinja' import MANAGERMERGED %} +{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold %} +{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size %} + +set -euo pipefail + +LOG_DIR="/opt/so/log/agents" +LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log" +CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config" +FLEET_API="http://localhost:5601/api/fleet/agents" +CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt" + +OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }} +PAGE_SIZE="{{ PAGE_SIZE }}" + +log_message() { + local level="$1" + local message="$2" + echo "$(date -u +"%Y-%m-%dT%H:%M:%SZ") [$level] $message" >&2 +} + +matches_critical_pattern() { + local hostname="$1" + local pattern_file="$2" + + # If critical agents file doesn't exist or is empty, match all + if [ ! -f "$pattern_file" ] || [ ! 
-s "$pattern_file" ]; then + return 0 + fi + + local hostname_lower=$(echo "$hostname" | tr '[:upper:]' '[:lower:]') + + while IFS= read -r pattern || [ -n "$pattern" ]; do + # empty lines and comments + [[ -z "$pattern" || "$pattern" =~ ^[[:space:]]*# ]] && continue + + # cut whitespace + pattern=$(echo "$pattern" | xargs) + + local pattern_lower=$(echo "$pattern" | tr '[:upper:]' '[:lower:]') + + # Replace * with bash wildcard + local bash_pattern="${pattern_lower//\*/.*}" + + # Check if hostname matches the pattern + if [[ "$hostname_lower" =~ ^${bash_pattern}$ ]]; then + return 0 + fi + done < "$pattern_file" + + return 1 +} + +calculate_offline_hours() { + local last_checkin="$1" + local current_time=$(date +%s) + local checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null || echo "0") + + if [ "$checkin_time" -eq "0" ]; then + echo "0" + return + fi + + local diff=$((current_time - checkin_time)) + echo $((diff / 3600)) +} + +cleanup_old_logs() { + # Find and delete log files older than 7 days + local old_files=$(find "$LOG_DIR" -name "agent-monitor-*.log" -type f -mtime +7 2>/dev/null) + + if [ -n "$old_files" ]; then + local deleted_count=$(echo "$old_files" | wc -l) + echo "$old_files" | xargs rm -f + log_message "INFO" "Cleaned up $deleted_count old log files (>7 days)" + fi +} + +main() { + log_message "INFO" "Starting Fleet agent status check" + + # Check if critical agents file is configured + if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then + log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE" + log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')" + else + log_message "INFO" "No critical agents filter found, monitoring all agents" + fi + + cleanup_old_logs + + log_message "INFO" "Querying Fleet API" + + local page=1 + local total_agents=0 + local processed_agents=0 + local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + + while true; do + 
log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)" + + if ! response_body=$(curl -K "$CURL_CONFIG" \ + -s --fail \ + "${FLEET_API}?perPage=${PAGE_SIZE}&page=${page}&showInactive=true" \ + -H 'kbn-xsrf: true' 2>/dev/null); then + log_message "ERROR" "Failed to query Fleet API (page $page)" + exit 1 + fi + + # pagination info + current_total=$(echo "$response_body" | jq -r '.total // 0') + current_page=$(echo "$response_body" | jq -r '.page // 1') + agents_in_page=$(echo "$response_body" | jq -r '.list | length') + + # Update total + if [ "$page" -eq 1 ]; then + total_agents="$current_total" + log_message "INFO" "Found $total_agents total agents across all pages" + fi + + log_message "INFO" "Processing page $current_page with $agents_in_page agents" + + # Process agents from current page + echo "$response_body" | jq -c '.list[]' | while IFS= read -r agent; do + # Grab agent details + agent_id=$(echo "$agent" | jq -r '.id // "unknown"') + agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"') + agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"') + agent_status=$(echo "$agent" | jq -r '.status // "unknown"') + last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""') + last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"') + policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"') + + # Only log agents that are offline or degraded (skip inactive agents) + # Fleetserver agents can show multiple versions as 'inactive' + if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then + # Check if agent matches critical agent patterns (if configured) + if ! 
matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then + continue # Skip this agent if it doesn't match any critical agent pattern + fi + + offline_hours=$(calculate_offline_hours "$last_checkin") + + log_entry=$(jq -c \ + --arg ts "$current_timestamp" \ + --arg id "$agent_id" \ + --arg hostname "$agent_hostname" \ + --arg name "$agent_name" \ + --arg status "$agent_status" \ + --arg last_checkin "$last_checkin" \ + --arg last_checkin_status "$last_checkin_status" \ + --arg policy_id "$policy_id" \ + --arg offline_hours "$offline_hours" \ + '{ + "@timestamp": $ts, + "agent.id": $id, + "agent.hostname": $hostname, + "agent.name": $name, + "agent.status": $status, + "agent.last_checkin": $last_checkin, + "agent.last_checkin_status": $last_checkin_status, + "agent.policy_id": $policy_id, + "agent.offline_duration_hours": ($offline_hours | tonumber) + }') + + echo "$log_entry" >> "$LOG_FILE" + + log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)" + fi + done + + processed_agents=$((processed_agents + agents_in_page)) + + if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then + log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents" + break + fi + + page=$((page + 1)) + + # Limit pagination loops incase of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size + if [ "$page" -gt 100 ]; then + log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size" + break + fi + done + + log_message "INFO" "Fleet agent status check completed. 
Processed $processed_agents out of $total_agents agents" +} + +main "$@" From dfec29d18e7981efd66df8e5d639bc4c6c1c7b80 Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Thu, 4 Sep 2025 15:37:28 -0500 Subject: [PATCH 02/27] custom kquery --- salt/elasticfleet/defaults.yaml | 1 + .../files/ingest/elasticagent.monitor | 36 ++++++ salt/manager/defaults.yaml | 1 + salt/manager/soc_manager.yaml | 8 +- .../tools/sbin_jinja/so-elastic-agent-monitor | 121 ++++++++++-------- 5 files changed, 113 insertions(+), 54 deletions(-) create mode 100644 salt/elasticsearch/files/ingest/elasticagent.monitor diff --git a/salt/elasticfleet/defaults.yaml b/salt/elasticfleet/defaults.yaml index d6cdd7351..0220428bf 100644 --- a/salt/elasticfleet/defaults.yaml +++ b/salt/elasticfleet/defaults.yaml @@ -38,6 +38,7 @@ elasticfleet: - elasticsearch - endpoint - fleet_server + - filestream - http_endpoint - httpjson - log diff --git a/salt/elasticsearch/files/ingest/elasticagent.monitor b/salt/elasticsearch/files/ingest/elasticagent.monitor new file mode 100644 index 000000000..09d8297c4 --- /dev/null +++ b/salt/elasticsearch/files/ingest/elasticagent.monitor @@ -0,0 +1,36 @@ +{ + "processors": [ + { + "set": { + "field": "event.dataset", + "value": "gridmetrics.agents", + "ignore_failure": true + } + }, + { + "set": { + "field": "event.module", + "value": "gridmetrics", + "ignore_failure": true + } + }, + { + "remove": { + "field": [ + "host", + "elastic_agent", + "agent" + ], + "ignore_missing": true, + "ignore_failure": true + } + }, + { + "json": { + "field": "message", + "add_to_root": true, + "ignore_failure": true + } + } + ] +} \ No newline at end of file diff --git a/salt/manager/defaults.yaml b/salt/manager/defaults.yaml index 65247d8ff..237ac2999 100644 --- a/salt/manager/defaults.yaml +++ b/salt/manager/defaults.yaml @@ -9,6 +9,7 @@ manager: enabled: False config: critical_agents: [] + custom_kquery: offline_threshold: 5 page_size: 250 
run_interval: 5 diff --git a/salt/manager/soc_manager.yaml b/salt/manager/soc_manager.yaml index f69f3f42a..ac06ac2b4 100644 --- a/salt/manager/soc_manager.yaml +++ b/salt/manager/soc_manager.yaml @@ -45,11 +45,17 @@ manager: forcedType: bool config: critical_agents: - description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold + description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold. global: True multiline: True helpLink: elastic-fleet.html forcedType: "[]string" + custom_kquery: + description: For more granular control over what agents to monitor for offline|degraded status add a kquery here. It is recommended to create & test within Elastic Fleet first to ensure your agents are targeted correctly using the query. eg 'status:offline AND tags:INFRA' + global: True + helpLink: elastic-fleet.html + forcedType: string + advanced: True offline_threshold: description: The maximum allowed time in hours a 'critical' agent has been offline before being logged. 
global: True diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor index 572d4de4d..0f3bcac34 100644 --- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor +++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor @@ -1,17 +1,21 @@ +{%- from 'manager/map.jinja' import MANAGERMERGED -%} +{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%} +{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%} +{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%} #!/bin/bash -{% from 'manager/map.jinja' import MANAGERMERGED %} -{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold %} -{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size %} - set -euo pipefail LOG_DIR="/opt/so/log/agents" LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log" CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config" FLEET_API="http://localhost:5601/api/fleet/agents" +{#- When using custom kquery ignore critical agents patterns. 
Since we want all the results of custom query logged #} +{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %} +CRITICAL_AGENTS_FILE="/dev/null" +{%- else %} CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt" - +{%- endif %} OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }} PAGE_SIZE="{{ PAGE_SIZE }}" @@ -80,7 +84,7 @@ cleanup_old_logs() { main() { log_message "INFO" "Starting Fleet agent status check" - + # Check if critical agents file is configured if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE" @@ -98,12 +102,20 @@ main() { local processed_agents=0 local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + {%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %} + log_message "INFO" "Using custom kquery: {{ CUSTOM_KQUERY }}" + FLEET_QUERY="${FLEET_API}?kuery={{ CUSTOM_KQUERY | urlencode }}&perPage=${PAGE_SIZE}&page=${page}" + {%- else %} + log_message "INFO" "Using default query (all offline or degraded agents)" + FLEET_QUERY="${FLEET_API}?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=${PAGE_SIZE}&page=${page}" + {%- endif %} + while true; do log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)" if ! 
response_body=$(curl -K "$CURL_CONFIG" \ -s --fail \ - "${FLEET_API}?perPage=${PAGE_SIZE}&page=${page}&showInactive=true" \ + $FLEET_QUERY \ -H 'kbn-xsrf: true' 2>/dev/null); then log_message "ERROR" "Failed to query Fleet API (page $page)" exit 1 @@ -123,52 +135,55 @@ main() { log_message "INFO" "Processing page $current_page with $agents_in_page agents" # Process agents from current page - echo "$response_body" | jq -c '.list[]' | while IFS= read -r agent; do - # Grab agent details - agent_id=$(echo "$agent" | jq -r '.id // "unknown"') - agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"') - agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"') - agent_status=$(echo "$agent" | jq -r '.status // "unknown"') - last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""') - last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"') - policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"') + mapfile -t agents < <(echo "$response_body" | jq -c '.list[]') - # Only log agents that are offline or degraded (skip inactive agents) - # Fleetserver agents can show multiple versions as 'inactive' - if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then - # Check if agent matches critical agent patterns (if configured) - if ! 
matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then - continue # Skip this agent if it doesn't match any critical agent pattern + for agent in "${agents[@]}"; do + # Grab agent details + agent_id=$(echo "$agent" | jq -r '.id // "unknown"') + agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"') + agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"') + agent_status=$(echo "$agent" | jq -r '.status // "unknown"') + last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""') + last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"') + policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"') + + # Only log agents that are offline or degraded (skip inactive agents) + # Fleetserver agents can show multiple versions as 'inactive' + if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then + # Check if agent matches critical agent patterns (if configured) + if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then + log_message "WARN" "${agent_hostname^^} is ${agent_status^^}, but does not match configured critical agents patterns. 
Not logging ${agent_status^^} agent" + continue # Skip this agent if it doesn't match any critical agent pattern + fi + + offline_hours=$(calculate_offline_hours "$last_checkin") + + log_entry=$(echo 'null' | jq -c \ + --arg ts "$current_timestamp" \ + --arg id "$agent_id" \ + --arg hostname "$agent_hostname" \ + --arg name "$agent_name" \ + --arg status "$agent_status" \ + --arg last_checkin "$last_checkin" \ + --arg last_checkin_status "$last_checkin_status" \ + --arg policy_id "$policy_id" \ + --arg offline_hours "$offline_hours" \ + '{ + "@timestamp": $ts, + "agent.id": $id, + "agent.hostname": $hostname, + "agent.name": $name, + "agent.status": $status, + "agent.last_checkin": $last_checkin, + "agent.last_checkin_status": $last_checkin_status, + "agent.policy_id": $policy_id, + "agent.offline_duration_hours": ($offline_hours | tonumber) + }') + + echo "$log_entry" >> "$LOG_FILE" + + log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)" fi - - offline_hours=$(calculate_offline_hours "$last_checkin") - - log_entry=$(jq -c \ - --arg ts "$current_timestamp" \ - --arg id "$agent_id" \ - --arg hostname "$agent_hostname" \ - --arg name "$agent_name" \ - --arg status "$agent_status" \ - --arg last_checkin "$last_checkin" \ - --arg last_checkin_status "$last_checkin_status" \ - --arg policy_id "$policy_id" \ - --arg offline_hours "$offline_hours" \ - '{ - "@timestamp": $ts, - "agent.id": $id, - "agent.hostname": $hostname, - "agent.name": $name, - "agent.status": $status, - "agent.last_checkin": $last_checkin, - "agent.last_checkin_status": $last_checkin_status, - "agent.policy_id": $policy_id, - "agent.offline_duration_hours": ($offline_hours | tonumber) - }') - - echo "$log_entry" >> "$LOG_FILE" - - log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)" - fi done processed_agents=$((processed_agents + agents_in_page)) @@ -180,13 +195,13 @@ main() { 
page=$((page + 1)) - # Limit pagination loops incase of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size + # Limit pagination loops incase of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size if [ "$page" -gt 100 ]; then log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size" break fi done - + log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents" } From 915b9e7bd7cc8c8a6aa057a6fe084bc574aa1f91 Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Fri, 5 Sep 2025 09:22:44 -0500 Subject: [PATCH 03/27] use logrotate --- salt/logrotate/defaults.yaml | 9 +++++++++ salt/logrotate/soc_logrotate.yaml | 7 +++++++ .../tools/sbin_jinja/so-elastic-agent-monitor | 17 ++--------------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/salt/logrotate/defaults.yaml b/salt/logrotate/defaults.yaml index 2f7247ff2..479b598f5 100644 --- a/salt/logrotate/defaults.yaml +++ b/salt/logrotate/defaults.yaml @@ -268,3 +268,12 @@ logrotate: - nocompress - create - sharedscripts + /opt/so/log/agents/agent-monitor*_x_log: + - daily + - rotate 14 + - missingok + - compress + - create + - extension .log + - dateext + - dateyesterday \ No newline at end of file diff --git a/salt/logrotate/soc_logrotate.yaml b/salt/logrotate/soc_logrotate.yaml index 56f879e4f..6f0272ef0 100644 --- a/salt/logrotate/soc_logrotate.yaml +++ b/salt/logrotate/soc_logrotate.yaml @@ -175,3 +175,10 @@ logrotate: multiline: True global: True forcedType: "[]string" + "/opt/so/log/agents/agent-monitor*_x_log": + description: List of logrotate options for this file. 
+ title: /opt/so/log/agents/agent-monitor*.log + advanced: True + multiline: True + global: True + forcedType: "[]string" diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor index 0f3bcac34..0b40925fd 100644 --- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor +++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor @@ -7,7 +7,7 @@ set -euo pipefail LOG_DIR="/opt/so/log/agents" -LOG_FILE="$LOG_DIR/agent-monitor-$(date -u +"%Y%m%d").log" +LOG_FILE="$LOG_DIR/agent-monitor.log" CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config" FLEET_API="http://localhost:5601/api/fleet/agents" {#- When using custom kquery ignore critical agents patterns. Since we want all the results of custom query logged #} @@ -71,17 +71,6 @@ calculate_offline_hours() { echo $((diff / 3600)) } -cleanup_old_logs() { - # Find and delete log files older than 7 days - local old_files=$(find "$LOG_DIR" -name "agent-monitor-*.log" -type f -mtime +7 2>/dev/null) - - if [ -n "$old_files" ]; then - local deleted_count=$(echo "$old_files" | wc -l) - echo "$old_files" | xargs rm -f - log_message "INFO" "Cleaned up $deleted_count old log files (>7 days)" - fi -} - main() { log_message "INFO" "Starting Fleet agent status check" @@ -92,8 +81,6 @@ main() { else log_message "INFO" "No critical agents filter found, monitoring all agents" fi - - cleanup_old_logs log_message "INFO" "Querying Fleet API" @@ -115,7 +102,7 @@ main() { if ! 
response_body=$(curl -K "$CURL_CONFIG" \ -s --fail \ - $FLEET_QUERY \ + "$FLEET_QUERY" \ -H 'kbn-xsrf: true' 2>/dev/null); then log_message "ERROR" "Failed to query Fleet API (page $page)" exit 1 From 348f9dcaec75bcdbbdfa47dd7d2a86f4d73316df Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Fri, 5 Sep 2025 10:01:24 -0500 Subject: [PATCH 04/27] prevent multiple script instances using file lock --- salt/manager/init.sls | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt/manager/init.sls b/salt/manager/init.sls index 7daaeb8f2..f59c33652 100644 --- a/salt/manager/init.sls +++ b/salt/manager/init.sls @@ -153,7 +153,7 @@ so_fleetagent_monitor: {% else %} cron.absent: {% endif %} - - name: /usr/sbin/so-elastic-agent-monitor + - name: /bin/flock -n /opt/so/log/agents/agent-monitor.lock /usr/sbin/so-elastic-agent-monitor - identifier: so_fleetagent_monitor - user: root - minute: '*/{{ MANAGERMERGED.agent_monitoring.config.run_interval }}' From 4afc986f484789214fd923a9b633c3f06e218f2c Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Fri, 5 Sep 2025 13:14:47 -0400 Subject: [PATCH 05/27] firewall and logstash pipeline for managerhype --- salt/firewall/defaults.yaml | 16 ++++++++++++++++ salt/firewall/map.jinja | 10 +++++----- salt/logstash/map.jinja | 4 ++-- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/salt/firewall/defaults.yaml b/salt/firewall/defaults.yaml index 0c43b8c0b..a11492e88 100644 --- a/salt/firewall/defaults.yaml +++ b/salt/firewall/defaults.yaml @@ -1230,6 +1230,10 @@ firewall: portgroups: - elasticsearch_node - elasticsearch_rest + managerhype: + portgroups: + - elasticsearch_node + - elasticsearch_rest standalone: portgroups: - elasticsearch_node @@ -1377,6 +1381,10 @@ firewall: portgroups: - elasticsearch_node - elasticsearch_rest + managerhype: + portgroups: + - elasticsearch_node + - elasticsearch_rest standalone: portgroups: - elasticsearch_node @@ -1579,6 +1587,9 @@ firewall: 
portgroups: - redis - elastic_agent_data + managerhype: + portgroups: + - elastic_agent_data self: portgroups: - redis @@ -1696,6 +1707,9 @@ firewall: managersearch: portgroups: - openssh + managerhype: + portgroups: + - openssh standalone: portgroups: - openssh @@ -1758,6 +1772,8 @@ firewall: portgroups: [] managersearch: portgroups: [] + managerhype: + portgroups: [] standalone: portgroups: [] customhostgroup0: diff --git a/salt/firewall/map.jinja b/salt/firewall/map.jinja index 4347d2b31..8bd0512ec 100644 --- a/salt/firewall/map.jinja +++ b/salt/firewall/map.jinja @@ -25,7 +25,7 @@ {% set KAFKA_EXTERNAL_ACCESS = salt['pillar.get']('kafka:config:external_access:enabled', default=False) %} {% set kafka_node_type = salt['pillar.get']('kafka:nodes:'+ GLOBALS.hostname + ':role') %} -{% if role in ['manager', 'managersearch', 'standalone'] %} +{% if role.startswith('manager') or role == 'standalone' %} {% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[role].portgroups.append('kafka_controller') %} {% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups.receiver.portgroups.append('kafka_controller') %} {% endif %} @@ -38,8 +38,8 @@ {% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups.receiver.portgroups.append('kafka_controller') %} {% endif %} -{% if role in ['manager', 'managersearch', 'standalone', 'receiver'] %} -{% for r in ['manager', 'managersearch', 'standalone', 'receiver', 'fleet', 'idh', 'sensor', 'searchnode','heavynode', 'elastic_agent_endpoint', 'desktop'] %} +{% if role.startswith('manager') or role in ['standalone', 'receiver'] %} +{% for r in ['manager', 'managersearch', 'managerhype', 'standalone', 'receiver', 'fleet', 'idh', 'sensor', 'searchnode','heavynode', 'elastic_agent_endpoint', 'desktop'] %} {% if FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[r] is defined %} {% do 
FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[r].portgroups.append('kafka_data') %} {% endif %} @@ -48,11 +48,11 @@ {% if KAFKA_EXTERNAL_ACCESS %} {# Kafka external access only applies for Kafka nodes with the broker role. #} -{% if role in ['manager', 'managersearch', 'standalone', 'receiver'] and 'broker' in kafka_node_type %} +{% if role.startswith('manager') or role in ['standalone', 'receiver'] and 'broker' in kafka_node_type %} {% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups.external_kafka.portgroups.append('kafka_external_access') %} {% endif %} {% endif %} {% endif %} -{% set FIREWALL_MERGED = salt['pillar.get']('firewall', FIREWALL_DEFAULT.firewall, merge=True) %} \ No newline at end of file +{% set FIREWALL_MERGED = salt['pillar.get']('firewall', FIREWALL_DEFAULT.firewall, merge=True) %} diff --git a/salt/logstash/map.jinja b/salt/logstash/map.jinja index 95ec6b85d..5aad1daa9 100644 --- a/salt/logstash/map.jinja +++ b/salt/logstash/map.jinja @@ -17,7 +17,7 @@ {% for node_type, node_details in redis_node_data.items() | sort %} {% if GLOBALS.role in ['so-searchnode', 'so-standalone', 'so-managersearch', 'so-fleet'] %} -{% if node_type in ['manager', 'managersearch', 'standalone', 'receiver' ] %} +{% if node_type.startswith('manager') or node_type in ['standalone', 'receiver'] %} {% for hostname in redis_node_data[node_type].keys() %} {% do LOGSTASH_REDIS_NODES.append({hostname:node_details[hostname].ip}) %} {% endfor %} @@ -47,7 +47,7 @@ {% endif %} {# Disable logstash on manager & receiver nodes unless it has an override configured #} {% if not KAFKA_LOGSTASH %} -{% if GLOBALS.role in ['so-manager', 'so-receiver'] and GLOBALS.hostname not in KAFKA_LOGSTASH %} +{% if GLOBALS.role in ['so-manager', 'so-managerhype', 'so-receiver'] and GLOBALS.hostname not in KAFKA_LOGSTASH %} {% do LOGSTASH_MERGED.update({'enabled': False}) %} {% endif %} {% endif %} From 207572f2f94c48fd51fbdf18f7dad47830d849c3 Mon Sep 17 
00:00:00 2001 From: Josh Patterson Date: Fri, 5 Sep 2025 14:16:03 -0400 Subject: [PATCH 06/27] remove debug added to fail_setup --- setup/so-functions | 42 ++---------------------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/setup/so-functions b/setup/so-functions index dbe198958..9ab11a904 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -29,46 +29,8 @@ title() { } fail_setup() { - local failure_reason="${1:-Unknown failure}" - - # Capture call stack information - local calling_function="${FUNCNAME[1]:-main}" - local calling_line="${BASH_LINENO[0]:-unknown}" - local calling_file="${BASH_SOURCE[1]:-unknown}" - - # Build call stack trace - local call_stack="" - local i=1 - while [[ $i -lt ${#FUNCNAME[@]} ]]; do - local func="${FUNCNAME[$i]}" - local file="${BASH_SOURCE[$i]##*/}" # Get basename only - local line="${BASH_LINENO[$((i-1))]}" - - if [[ -n "$call_stack" ]]; then - call_stack="$call_stack -> " - fi - call_stack="$call_stack$func($file:$line)" - ((i++)) - done - - # Enhanced error logging with call stack - error "FAILURE: Called from $calling_function() at line $calling_line" - error "REASON: $failure_reason" - error "STACK: $call_stack" - error "Setup encountered an unrecoverable failure: $failure_reason" - - # Create detailed failure file with enhanced information - { - echo "SETUP_FAILURE_TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" - echo "SETUP_FAILURE_REASON=$failure_reason" - echo "SETUP_CALLING_FUNCTION=$calling_function" - echo "SETUP_CALLING_LINE=$calling_line" - echo "SETUP_CALLING_FILE=${calling_file##*/}" - echo "SETUP_CALL_STACK=$call_stack" - echo "SETUP_LOG_LOCATION=$setup_log" - echo "SETUP_FAILURE_DETAILS=Check $setup_log for complete error details" - } > /root/failure - + error "Setup encountered an unrecoverable failure, exiting" + touch /root/failure exit 1 } From f318a84c1806699f101bc332771bb40567535b62 Mon Sep 17 00:00:00 2001 From: Josh Brower Date: Mon, 8 Sep 2025 09:03:33 -0400 
Subject: [PATCH 07/27] Update so-elastic-fleet-reset --- salt/manager/tools/sbin_jinja/so-elastic-fleet-reset | 1 + 1 file changed, 1 insertion(+) diff --git a/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset b/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset index 0b116564d..1e32268da 100644 --- a/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset +++ b/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset @@ -15,6 +15,7 @@ require_manager echo echo "This script will remove the current Elastic Fleet install and all of its data and then rerun Elastic Fleet setup." echo "Deployed Elastic Agents will no longer be enrolled and will need to be reinstalled." +echo "Only the Elastic Fleet instance on the Manager will be reinstalled - dedicated Fleet node config will be removed and will need to be reinstalled." echo "This script should only be used as a last resort to reinstall Elastic Fleet." echo echo "If you would like to proceed, then type AGREE and press ENTER." From ec27517bdd7a29f2f0f5532b1c9ba84df0d1ac45 Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Fri, 11 Jul 2025 10:37:50 -0600 Subject: [PATCH 08/27] New Config Values New config values with annotations and defaults. Updated Nginx config to allow streaming requests to not be buffered on the way to the client.
--- salt/nginx/etc/nginx.conf | 30 +++++++++++++++++------------- salt/soc/defaults.yaml | 4 ++++ salt/soc/soc_soc.yaml | 11 +++++++++++ 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/salt/nginx/etc/nginx.conf b/salt/nginx/etc/nginx.conf index 742f5d08d..caa05bbff 100644 --- a/salt/nginx/etc/nginx.conf +++ b/salt/nginx/etc/nginx.conf @@ -196,19 +196,23 @@ http { } location / { - auth_request /auth/sessions/whoami; - auth_request_set $userid $upstream_http_x_kratos_authenticated_identity_id; - proxy_set_header x-user-id $userid; - proxy_pass http://{{ GLOBALS.manager }}:9822/; - proxy_read_timeout 300; - proxy_connect_timeout 300; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header Proxy ""; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "Upgrade"; - proxy_set_header X-Forwarded-Proto $scheme; + auth_request /auth/sessions/whoami; + auth_request_set $userid $upstream_http_x_kratos_authenticated_identity_id; + proxy_set_header x-user-id $userid; + proxy_pass http://{{ GLOBALS.manager }}:9822/; + proxy_read_timeout 300; + proxy_connect_timeout 300; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Proxy ""; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_buffering off; + proxy_cache off; + proxy_request_buffering off; } location ~ ^/auth/.*?(login|oidc/callback) { diff --git a/salt/soc/defaults.yaml b/salt/soc/defaults.yaml index 7bb2c1f03..c86889be7 100644 --- a/salt/soc/defaults.yaml +++ b/salt/soc/defaults.yaml @@ -1491,6 +1491,10 @@ soc: - repo: file:///nsm/airgap-resources/playbooks/securityonion-resources-playbooks branch: main folder: securityonion-normalized + assistant: + apiKey: + apiUrl: 
https://onionai-dev.securityonion.net + model: claude-sonnet salt: queueDir: /opt/sensoroni/queue timeoutMs: 45000 diff --git a/salt/soc/soc_soc.yaml b/salt/soc/soc_soc.yaml index 2d0eb3792..b8133999f 100644 --- a/salt/soc/soc_soc.yaml +++ b/salt/soc/soc_soc.yaml @@ -580,6 +580,17 @@ soc: - field: folder label: Folder airgap: *pbRepos + assistant: + apiKey: + description: The auth token to be used when reaching out to the AI Assistant. + global: True + apiUrl: + description: The URL of the AI gateway. + advanced: True + global: True + model: + description: The model to use as the AI Assistant + global: True client: apiTimeoutMs: description: Duration (in milliseconds) to wait for a response from the SOC server API before giving up and showing an error on the SOC UI. From ba601c39b37e8d2aa3158849d903753d4af3653f Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Tue, 29 Jul 2025 11:23:28 -0600 Subject: [PATCH 09/27] Rough Go at New Mappings/Settings --- .../component/so/assistant-mappings.json | 48 +++++++++++++++++++ .../component/so/assistant-settings.json | 7 +++ 2 files changed, 55 insertions(+) create mode 100644 salt/elasticsearch/templates/component/so/assistant-mappings.json create mode 100644 salt/elasticsearch/templates/component/so/assistant-settings.json diff --git a/salt/elasticsearch/templates/component/so/assistant-mappings.json b/salt/elasticsearch/templates/component/so/assistant-mappings.json new file mode 100644 index 000000000..ad17ebd66 --- /dev/null +++ b/salt/elasticsearch/templates/component/so/assistant-mappings.json @@ -0,0 +1,48 @@ +{ + "template": { + "mappings": { + "properties": { + "@timestamp": { + "type": "date" + }, + "so_kind": { + "ignore_above": 1024, + "type": "keyword" + }, + "so_operation": { + "ignore_above": 1024, + "type": "keyword" + }, + "so_chat": { + "properties": { + "role": { + "ignore_above": 1024, + "type": "keyword" + }, + "content": { + "type": "text" + }, + "conversation_id": { + "ignore_above": 1024, + "type": 
"keyword" + }, + "createTime": { + "type": "date" + }, + "tool_use_id": { + "ignore_above": 1024, + "type": "keyword" + }, + "userId": { + "ignore_above": 1024, + "type": "keyword" + } + } + } + } + } + }, + "_meta": { + "ecs_version": "1.12.2" + } +} diff --git a/salt/elasticsearch/templates/component/so/assistant-settings.json b/salt/elasticsearch/templates/component/so/assistant-settings.json new file mode 100644 index 000000000..0281fa0e1 --- /dev/null +++ b/salt/elasticsearch/templates/component/so/assistant-settings.json @@ -0,0 +1,7 @@ +{ + "template": {}, + "version": 1, + "_meta": { + "description": "default settings for common Security Onion Assistant indices" + } +} From 6323fbf46b3b27bfc6bc8bb0caf9152a04892892 Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Wed, 30 Jul 2025 11:48:27 -0600 Subject: [PATCH 10/27] Content Object --- .../templates/component/so/assistant-mappings.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/salt/elasticsearch/templates/component/so/assistant-mappings.json b/salt/elasticsearch/templates/component/so/assistant-mappings.json index ad17ebd66..f6f552465 100644 --- a/salt/elasticsearch/templates/component/so/assistant-mappings.json +++ b/salt/elasticsearch/templates/component/so/assistant-mappings.json @@ -20,7 +20,8 @@ "type": "keyword" }, "content": { - "type": "text" + "type": "object", + "enabled": false }, "conversation_id": { "ignore_above": 1024, From b1753f86f91b8345151f16de7eb9a06d2145fbe9 Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Wed, 30 Jul 2025 13:14:09 -0600 Subject: [PATCH 11/27] New Message Structure --- .../component/so/assistant-mappings.json | 50 ++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/salt/elasticsearch/templates/component/so/assistant-mappings.json b/salt/elasticsearch/templates/component/so/assistant-mappings.json index f6f552465..89a907165 100644 --- a/salt/elasticsearch/templates/component/so/assistant-mappings.json +++ 
b/salt/elasticsearch/templates/component/so/assistant-mappings.json @@ -23,7 +23,7 @@ "type": "object", "enabled": false }, - "conversation_id": { + "sessionId": { "ignore_above": 1024, "type": "keyword" }, @@ -37,6 +37,54 @@ "userId": { "ignore_above": 1024, "type": "keyword" + }, + "message": { + "properties": { + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "type": { + "ignore_above": 1024, + "type": "keyword" + }, + "role": { + "ignore_above": 1024, + "type": "keyword" + }, + "model": { + "ignore_above": 1024, + "type": "keyword" + }, + "contentStr": { + "type": "text" + }, + "contentBlocks": { + "type": "nested", + "enabled": false + }, + "stopReason": { + "ignore_above": 1024, + "type": "keyword" + }, + "stopSequence": { + "ignore_above": 1024, + "type": "keyword" + }, + "usage": { + "properties": { + "input_tokens": { + "type": "long" + }, + "output_tokens": { + "type": "long" + }, + "credits": { + "type": "long" + } + } + } + } } } } From cea4eaf0819ba87e7edc2c16622d5ad8ccc891f6 Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Wed, 6 Aug 2025 09:02:43 -0600 Subject: [PATCH 12/27] Updated Assistant Mapping --- .../templates/component/so/assistant-mappings.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/salt/elasticsearch/templates/component/so/assistant-mappings.json b/salt/elasticsearch/templates/component/so/assistant-mappings.json index 89a907165..3433acbd6 100644 --- a/salt/elasticsearch/templates/component/so/assistant-mappings.json +++ b/salt/elasticsearch/templates/component/so/assistant-mappings.json @@ -30,6 +30,13 @@ "createTime": { "type": "date" }, + "deletedAt": { + "type": "date" + }, + "tags": { + "ignore_above": 1024, + "type": "keyword" + }, "tool_use_id": { "ignore_above": 1024, "type": "keyword" From fc2d450de04c688a1ade7724aad4aa31fe637c5a Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Tue, 26 Aug 2025 09:16:04 -0600 Subject: [PATCH 13/27] Update Settings The apiKey will be built off of the license rather 
than a new setting. The model is hardcoded for now at the AI Gateway level. We're going to use the investigationPrompt as a trigger for the feature being visible in the UI but by default will be blank for now. --- salt/soc/defaults.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/salt/soc/defaults.yaml b/salt/soc/defaults.yaml index c86889be7..f89d9e99f 100644 --- a/salt/soc/defaults.yaml +++ b/salt/soc/defaults.yaml @@ -1492,9 +1492,8 @@ soc: branch: main folder: securityonion-normalized assistant: - apiKey: apiUrl: https://onionai-dev.securityonion.net - model: claude-sonnet + investigationPrompt: salt: queueDir: /opt/sensoroni/queue timeoutMs: 45000 From 120e61e45cce0bd6796f819d4b96b5bbb2b7e9a3 Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Tue, 26 Aug 2025 16:06:14 -0600 Subject: [PATCH 14/27] ClientParams Removed investigation prompt from module settings and moved to client settings, added enabledInSoc. --- salt/soc/defaults.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/salt/soc/defaults.yaml b/salt/soc/defaults.yaml index f89d9e99f..d847d1d1b 100644 --- a/salt/soc/defaults.yaml +++ b/salt/soc/defaults.yaml @@ -1493,7 +1493,6 @@ soc: folder: securityonion-normalized assistant: apiUrl: https://onionai-dev.securityonion.net - investigationPrompt: salt: queueDir: /opt/sensoroni/queue timeoutMs: 45000 @@ -2544,3 +2543,6 @@ soc: - ' -priv' condition: all of selection_* level: 'high' # info | low | medium | high | critical + assistant: + enabledInSoc: false + investigationPrompt: Investigate Alert ID {socid} \ No newline at end of file From 73776f8d11ac14269bf531cc0c6cefbe548413f3 Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Wed, 27 Aug 2025 12:46:19 -0600 Subject: [PATCH 15/27] Cleaning up New ES Indexes --- salt/elasticsearch/defaults.yaml | 74 +++++++++++++++++++ ...ings.json => assistant-chat-mappings.json} | 0 ...ings.json => assistant-chat-settings.json} | 0 .../so/assistant-session-mappings.json | 44 
+++++++++++ .../so/assistant-session-settings.json | 7 ++ 5 files changed, 125 insertions(+) rename salt/elasticsearch/templates/component/so/{assistant-mappings.json => assistant-chat-mappings.json} (100%) rename salt/elasticsearch/templates/component/so/{assistant-settings.json => assistant-chat-settings.json} (100%) create mode 100644 salt/elasticsearch/templates/component/so/assistant-session-mappings.json create mode 100644 salt/elasticsearch/templates/component/so/assistant-session-settings.json diff --git a/salt/elasticsearch/defaults.yaml b/salt/elasticsearch/defaults.yaml index 1200701c9..b5031b9b2 100644 --- a/salt/elasticsearch/defaults.yaml +++ b/salt/elasticsearch/defaults.yaml @@ -284,6 +284,80 @@ elasticsearch: hot: actions: {} min_age: 0ms + so-assistant-chat: + index_sorting: false + index_template: + composed_of: + - assistant-chat-mappings + - assistant-chat-settings + ignore_missing_component_templates: [] + index_patterns: + - so-assistant-chat* + priority: 500 + template: + mappings: + date_detection: false + dynamic_templates: + - strings_as_keyword: + mapping: + ignore_above: 1024 + type: keyword + match_mapping_type: string + settings: + index: + lifecycle: + name: so-assistant-chat-logs + mapping: + total_fields: + limit: 1500 + number_of_replicas: 0 + number_of_shards: 1 + refresh_interval: 1s + sort: + field: '@timestamp' + order: desc + policy: + phases: + hot: + actions: {} + min_age: 0ms + so-assistant-session: + index_sorting: false + index_template: + composed_of: + - assistant-session-mappings + - assistant-session-settings + ignore_missing_component_templates: [] + index_patterns: + - so-assistant-session* + priority: 500 + template: + mappings: + date_detection: false + dynamic_templates: + - strings_as_keyword: + mapping: + ignore_above: 1024 + type: keyword + match_mapping_type: string + settings: + index: + lifecycle: + name: so-assistant-session-logs + mapping: + total_fields: + limit: 1500 + number_of_replicas: 0 + 
number_of_shards: 1 + refresh_interval: 1s + sort: + field: '@timestamp' + order: desc + policy: + phases: + hot: + actions: {} + min_age: 0ms so-endgame: index_sorting: false index_template: diff --git a/salt/elasticsearch/templates/component/so/assistant-mappings.json b/salt/elasticsearch/templates/component/so/assistant-chat-mappings.json similarity index 100% rename from salt/elasticsearch/templates/component/so/assistant-mappings.json rename to salt/elasticsearch/templates/component/so/assistant-chat-mappings.json diff --git a/salt/elasticsearch/templates/component/so/assistant-settings.json b/salt/elasticsearch/templates/component/so/assistant-chat-settings.json similarity index 100% rename from salt/elasticsearch/templates/component/so/assistant-settings.json rename to salt/elasticsearch/templates/component/so/assistant-chat-settings.json diff --git a/salt/elasticsearch/templates/component/so/assistant-session-mappings.json b/salt/elasticsearch/templates/component/so/assistant-session-mappings.json new file mode 100644 index 000000000..b72bbb389 --- /dev/null +++ b/salt/elasticsearch/templates/component/so/assistant-session-mappings.json @@ -0,0 +1,44 @@ +{ + "template": { + "mappings": { + "properties": { + "@timestamp": { + "type": "date" + }, + "so_kind": { + "ignore_above": 1024, + "type": "keyword" + }, + "so_session": { + "properties": { + "title": { + "ignore_above": 1024, + "type": "keyword" + }, + "sessionId": { + "ignore_above": 1024, + "type": "keyword" + }, + "createTime": { + "type": "date" + }, + "deleteTime": { + "type": "date" + }, + "tags": { + "ignore_above": 1024, + "type": "keyword" + }, + "userId": { + "ignore_above": 1024, + "type": "keyword" + } + } + } + } + } + }, + "_meta": { + "ecs_version": "1.12.2" + } +} diff --git a/salt/elasticsearch/templates/component/so/assistant-session-settings.json b/salt/elasticsearch/templates/component/so/assistant-session-settings.json new file mode 100644 index 000000000..0281fa0e1 --- /dev/null +++ 
b/salt/elasticsearch/templates/component/so/assistant-session-settings.json @@ -0,0 +1,7 @@ +{ + "template": {}, + "version": 1, + "_meta": { + "description": "default settings for common Security Onion Assistant indices" + } +} From 834e34128d5b3f8d5e0e89176968e89d8464f0d1 Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Thu, 28 Aug 2025 16:03:35 -0600 Subject: [PATCH 16/27] Non-dev URL --- salt/soc/defaults.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt/soc/defaults.yaml b/salt/soc/defaults.yaml index d847d1d1b..0e7d8ac99 100644 --- a/salt/soc/defaults.yaml +++ b/salt/soc/defaults.yaml @@ -1492,7 +1492,7 @@ soc: branch: main folder: securityonion-normalized assistant: - apiUrl: https://onionai-dev.securityonion.net + apiUrl: https://onionai.securityonion.net salt: queueDir: /opt/sensoroni/queue timeoutMs: 45000 From 0a3ff47008029b369381abf0e4ecc5d7629a7505 Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Wed, 3 Sep 2025 12:12:27 -0600 Subject: [PATCH 17/27] Cleanup Annotations Removed fields no longer need annotations. --- salt/soc/soc_soc.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/salt/soc/soc_soc.yaml b/salt/soc/soc_soc.yaml index b8133999f..251a3c037 100644 --- a/salt/soc/soc_soc.yaml +++ b/salt/soc/soc_soc.yaml @@ -581,16 +581,10 @@ soc: label: Folder airgap: *pbRepos assistant: - apiKey: - description: The auth token to be used when reaching out to the AI Assistant. - global: True apiUrl: description: The URL of the AI gateway. advanced: True global: True - model: - description: The model to use as the AI Assistant - global: True client: apiTimeoutMs: description: Duration (in milliseconds) to wait for a response from the SOC server API before giving up and showing an error on the SOC UI. 
From 673f9cb544682efe5129727e1c490fce03f69748 Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Thu, 4 Sep 2025 09:20:50 -0600 Subject: [PATCH 18/27] Responding to Feedback --- salt/elasticsearch/defaults.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/salt/elasticsearch/defaults.yaml b/salt/elasticsearch/defaults.yaml index b5031b9b2..9eb4b4901 100644 --- a/salt/elasticsearch/defaults.yaml +++ b/salt/elasticsearch/defaults.yaml @@ -292,8 +292,8 @@ elasticsearch: - assistant-chat-settings ignore_missing_component_templates: [] index_patterns: - - so-assistant-chat* - priority: 500 + - so-assistant-chat-* + priority: 501 template: mappings: date_detection: false @@ -329,8 +329,8 @@ elasticsearch: - assistant-session-settings ignore_missing_component_templates: [] index_patterns: - - so-assistant-session* - priority: 500 + - so-assistant-session-* + priority: 501 template: mappings: date_detection: false From 855b489c4b4ead1ca5273136723ba881ed567750 Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Thu, 4 Sep 2025 10:39:57 -0500 Subject: [PATCH 19/27] datastream --- salt/elasticsearch/defaults.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/salt/elasticsearch/defaults.yaml b/salt/elasticsearch/defaults.yaml index 9eb4b4901..e51f5ac4e 100644 --- a/salt/elasticsearch/defaults.yaml +++ b/salt/elasticsearch/defaults.yaml @@ -290,6 +290,9 @@ elasticsearch: composed_of: - assistant-chat-mappings - assistant-chat-settings + data_stream: + allow_custom_routing: false + hidden: false ignore_missing_component_templates: [] index_patterns: - so-assistant-chat-* @@ -327,6 +330,9 @@ elasticsearch: composed_of: - assistant-session-mappings - assistant-session-settings + data_stream: + allow_custom_routing: false + hidden: false ignore_missing_component_templates: [] index_patterns: - so-assistant-session-* From 12959d114c103de5050f33081ea94211f869baad Mon Sep 17 00:00:00 2001 From: Matthew Wright 
Date: Thu, 4 Sep 2025 16:36:51 -0400 Subject: [PATCH 20/27] added threshold config fields for assistant --- salt/soc/defaults.yaml | 8 +++++++- salt/soc/soc_soc.yaml | 30 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/salt/soc/defaults.yaml b/salt/soc/defaults.yaml index 0e7d8ac99..fe50fced7 100644 --- a/salt/soc/defaults.yaml +++ b/salt/soc/defaults.yaml @@ -2545,4 +2545,10 @@ soc: level: 'high' # info | low | medium | high | critical assistant: enabledInSoc: false - investigationPrompt: Investigate Alert ID {socid} \ No newline at end of file + investigationPrompt: Investigate Alert ID {socid} + contextLimitSmall: 200000 + contextLimitLarge: 1000000 + thresholdColorRatioLow: 0.5 + thresholdColorRatioMed: 0.75 + thresholdColorRatioMax: 1 + lowBalanceColorAlert: 500000 \ No newline at end of file diff --git a/salt/soc/soc_soc.yaml b/salt/soc/soc_soc.yaml index 251a3c037..cde5996ee 100644 --- a/salt/soc/soc_soc.yaml +++ b/salt/soc/soc_soc.yaml @@ -586,6 +586,36 @@ soc: advanced: True global: True client: + assistant: + enabledInSoc: + description: Set to true to enable the Onion AI assistant in SOC. + global: True + investigationPrompt: + description: Prompt given to Onion AI when beginning an investigation. + global: True + contextLimitSmall: + description: Smaller context limit for Onion AI. + global: True + advanced: True + contextLimitLarge: + description: Larger context limit for Onion AI. + global: True + advanced: True + thresholdColorRatioLow: + description: Lower visual context color change threshold. + global: True + advanced: True + thresholdColorRatioMed: + description: Middle visual context color change threshold. + global: True + advanced: True + thresholdColorRatioMax: + description: Max visual context color change threshold. + global: True + advanced: True + lowBalanceColorAlert: + description: Onion AI credit amount at which balance turns red. 
+ advanced: True apiTimeoutMs: description: Duration (in milliseconds) to wait for a response from the SOC server API before giving up and showing an error on the SOC UI. global: True From aa43177d8c48af6c4d6170d087689b6dfdfcb24f Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Fri, 5 Sep 2025 11:31:08 -0600 Subject: [PATCH 21/27] Fix Setting Name enabledInSoc => enabled --- salt/soc/defaults.yaml | 2 +- salt/soc/soc_soc.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/salt/soc/defaults.yaml b/salt/soc/defaults.yaml index fe50fced7..58b3a3827 100644 --- a/salt/soc/defaults.yaml +++ b/salt/soc/defaults.yaml @@ -2544,7 +2544,7 @@ soc: condition: all of selection_* level: 'high' # info | low | medium | high | critical assistant: - enabledInSoc: false + enabled: false investigationPrompt: Investigate Alert ID {socid} contextLimitSmall: 200000 contextLimitLarge: 1000000 diff --git a/salt/soc/soc_soc.yaml b/salt/soc/soc_soc.yaml index cde5996ee..4af20d444 100644 --- a/salt/soc/soc_soc.yaml +++ b/salt/soc/soc_soc.yaml @@ -587,7 +587,7 @@ soc: global: True client: assistant: - enabledInSoc: + enabled: description: Set to true to enable the Onion AI assistant in SOC. global: True investigationPrompt: From 2535ae953d517d4eb2bd51b784fe9d21b327a99e Mon Sep 17 00:00:00 2001 From: Corey Ogburn Date: Tue, 9 Sep 2025 14:00:01 -0600 Subject: [PATCH 22/27] Fix Index Patterns so-assistant-chat and so-assistant-session both had templates with a trailing dash that prevented the pattern from applying to the name of the indices. 
--- salt/elasticsearch/defaults.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/salt/elasticsearch/defaults.yaml b/salt/elasticsearch/defaults.yaml index e51f5ac4e..db4fc0515 100644 --- a/salt/elasticsearch/defaults.yaml +++ b/salt/elasticsearch/defaults.yaml @@ -295,7 +295,7 @@ elasticsearch: hidden: false ignore_missing_component_templates: [] index_patterns: - - so-assistant-chat-* + - so-assistant-chat* priority: 501 template: mappings: @@ -335,7 +335,7 @@ elasticsearch: hidden: false ignore_missing_component_templates: [] index_patterns: - - so-assistant-session-* + - so-assistant-session* priority: 501 template: mappings: From 8f36d2ec007877fa8b70c40f422bd621085ed169 Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:38:50 -0500 Subject: [PATCH 23/27] update log file name --- .../integrations/grid-nodes_general/elastic-agent-monitor.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json index a7d425b39..31b004a91 100644 --- a/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json +++ b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json @@ -19,7 +19,7 @@ "enabled": true, "vars": { "paths": [ - "/opt/so/log/agents/agent-monitor-*.log" + "/opt/so/log/agents/agent-monitor.log" ], "data_stream.dataset": "agent-monitor", "pipeline": "elasticagent.monitor", From 29980ea95827a316e360031315594fd9ce3144b7 Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:39:55 -0500 Subject: [PATCH 24/27] offline threshold check --- salt/manager/tools/sbin_jinja/so-elastic-agent-monitor | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor 
b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor index 0b40925fd..3fa3221c2 100644 --- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor +++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor @@ -145,6 +145,11 @@ main() { offline_hours=$(calculate_offline_hours "$last_checkin") + if [ "$offline_hours" -lt "$OFFLINE_THRESHOLD_HOURS" ]; then + log_message "INFO" "${agent_hostname^^} has been offline for ${offline_hours}h (threshold: ${OFFLINE_THRESHOLD_HOURS}h). Not logging ${agent_status^^} agent until it reaches threshold" + continue + fi + log_entry=$(echo 'null' | jq -c \ --arg ts "$current_timestamp" \ --arg id "$agent_id" \ From 4107fa006ff561b8f0d87328c960e1ecac5989ee Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Tue, 9 Sep 2025 16:51:42 -0400 Subject: [PATCH 25/27] fix repo files to remove --- salt/repo/client/map.jinja | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/salt/repo/client/map.jinja b/salt/repo/client/map.jinja index 94228c300..2c040c3c5 100644 --- a/salt/repo/client/map.jinja +++ b/salt/repo/client/map.jinja @@ -26,9 +26,9 @@ 'rocky-devel.repo', 'rocky-extras.repo', 'rocky.repo', - 'oracle-linux-ol9', - 'uek-ol9', - 'virt-oll9' + 'oracle-linux-ol9.repo', + 'uek-ol9.repo', + 'virt-ol9.repo' ] %} {% else %} From f5ec1d4b7c4fce5f04ca18b1c807f555fee7f448 Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Wed, 10 Sep 2025 09:09:02 -0400 Subject: [PATCH 26/27] don't show sensoroni config changes --- salt/sensoroni/config.sls | 1 + 1 file changed, 1 insertion(+) diff --git a/salt/sensoroni/config.sls b/salt/sensoroni/config.sls index 8298209f1..225d0ddb4 100644 --- a/salt/sensoroni/config.sls +++ b/salt/sensoroni/config.sls @@ -18,6 +18,7 @@ sensoroniagentconf: - group: 939 - mode: 600 - template: jinja + - show_changes: False analyzersdir: file.directory: From fbdc0c470570c6cd30a98ad6898e8c8ce4828959 Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Wed, 10 Sep 
2025 10:56:09 -0500 Subject: [PATCH 27/27] add configurable realert threshold per agent --- salt/manager/defaults.yaml | 1 + salt/manager/soc_manager.yaml | 5 ++ .../tools/sbin_jinja/so-elastic-agent-monitor | 54 +++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/salt/manager/defaults.yaml b/salt/manager/defaults.yaml index 237ac2999..239075f74 100644 --- a/salt/manager/defaults.yaml +++ b/salt/manager/defaults.yaml @@ -11,5 +11,6 @@ manager: critical_agents: [] custom_kquery: offline_threshold: 5 + realert_threshold: 5 page_size: 250 run_interval: 5 diff --git a/salt/manager/soc_manager.yaml b/salt/manager/soc_manager.yaml index ac06ac2b4..f0d699f58 100644 --- a/salt/manager/soc_manager.yaml +++ b/salt/manager/soc_manager.yaml @@ -61,6 +61,11 @@ manager: global: True helpLink: elastic-fleet.html forcedType: int + realert_threshold: + description: The time to pass before another alert for an offline agent exceeding the offline_threshold is generated. + global: True + helpLink: elastic-fleet.html + forcedType: int page_size: description: The amount of agents that can be processed per API request to fleet. 
global: True diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor index 3fa3221c2..f8c3162b4 100644 --- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor +++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor @@ -2,6 +2,7 @@ {%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%} {%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%} {%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%} +{%- set REALERT_THRESHOLD = MANAGERMERGED.agent_monitoring.config.realert_threshold -%} #!/bin/bash set -euo pipefail @@ -17,6 +18,7 @@ CRITICAL_AGENTS_FILE="/dev/null" CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt" {%- endif %} OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }} +REALERT_THRESHOLD={{ REALERT_THRESHOLD }} PAGE_SIZE="{{ PAGE_SIZE }}" log_message() { @@ -71,6 +73,52 @@ calculate_offline_hours() { echo $((diff / 3600)) } +check_recent_log_entries() { + local agent_hostname="$1" + + if [ ! 
-f "$LOG_FILE" ]; then + return 1 + fi + + local current_time=$(date +%s) + local threshold_seconds=$((REALERT_THRESHOLD * 3600)) + local agent_hostname_lower=$(echo "$agent_hostname" | tr '[:upper:]' '[:lower:]') + local most_recent_timestamp="" + + while IFS= read -r line; do + [ -z "$line" ] && continue + + local logged_hostname=$(echo "$line" | jq -r '.["agent.hostname"] // empty' 2>/dev/null) + local logged_timestamp=$(echo "$line" | jq -r '.["@timestamp"] // empty' 2>/dev/null) + + [ -z "$logged_hostname" ] || [ -z "$logged_timestamp" ] && continue + + local logged_hostname_lower=$(echo "$logged_hostname" | tr '[:upper:]' '[:lower:]') + + if [ "$logged_hostname_lower" = "$agent_hostname_lower" ]; then + most_recent_timestamp="$logged_timestamp" + fi + done < <(tail -n 1000 "$LOG_FILE" 2>/dev/null) + + # If there is agent entry (within last 1000), check the time difference + if [ -n "$most_recent_timestamp" ]; then + local logged_time=$(date -d "$most_recent_timestamp" +%s 2>/dev/null || echo "0") + + if [ "$logged_time" -ne "0" ]; then + local time_diff=$((current_time - logged_time)) + local hours_diff=$((time_diff / 3600)) + + # Skip if last agent timestamp was more recent than realert threshold + if ((hours_diff < REALERT_THRESHOLD)); then + return 0 + fi + fi + fi + + # Agent has not been logged within realert threshold + return 1 +} + main() { log_message "INFO" "Starting Fleet agent status check" @@ -150,6 +198,12 @@ main() { continue fi + # Check if this agent was already logged within the realert_threshold + if check_recent_log_entries "$agent_hostname"; then + log_message "INFO" "Skipping $agent_hostname (status: $agent_status) - already logged within last ${REALERT_THRESHOLD}h" + continue + fi + log_entry=$(echo 'null' | jq -c \ --arg ts "$current_timestamp" \ --arg id "$agent_id" \