diff --git a/salt/elasticfleet/defaults.yaml b/salt/elasticfleet/defaults.yaml index d6cdd7351..0220428bf 100644 --- a/salt/elasticfleet/defaults.yaml +++ b/salt/elasticfleet/defaults.yaml @@ -38,6 +38,7 @@ elasticfleet: - elasticsearch - endpoint - fleet_server + - filestream - http_endpoint - httpjson - log diff --git a/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json new file mode 100644 index 000000000..31b004a91 --- /dev/null +++ b/salt/elasticfleet/files/integrations/grid-nodes_general/elastic-agent-monitor.json @@ -0,0 +1,48 @@ +{ + "package": { + "name": "filestream", + "version": "" + }, + "name": "agent-monitor", + "namespace": "", + "description": "", + "policy_ids": [ + "so-grid-nodes_general" + ], + "output_id": null, + "vars": {}, + "inputs": { + "filestream-filestream": { + "enabled": true, + "streams": { + "filestream.generic": { + "enabled": true, + "vars": { + "paths": [ + "/opt/so/log/agents/agent-monitor.log" + ], + "data_stream.dataset": "agent-monitor", + "pipeline": "elasticagent.monitor", + "parsers": "", + "exclude_files": [ + "\\.gz$" + ], + "include_files": [], + "processors": "- decode_json_fields:\n fields: [\"message\"]\n target: \"\"\n- add_fields:\n target: event\n fields:\n module: gridmetrics", + "tags": [], + "recursive_glob": true, + "ignore_older": "72h", + "clean_inactive": -1, + "harvester_limit": 0, + "fingerprint": true, + "fingerprint_offset": 0, + "fingerprint_length": 1024, + "file_identity_native": false, + "exclude_lines": [], + "include_lines": [] + } + } + } + } + } +} diff --git a/salt/elasticsearch/defaults.yaml b/salt/elasticsearch/defaults.yaml index 8224a2450..db4fc0515 100644 --- a/salt/elasticsearch/defaults.yaml +++ b/salt/elasticsearch/defaults.yaml @@ -284,6 +284,86 @@ elasticsearch: hot: actions: {} min_age: 0ms + so-assistant-chat: + index_sorting: false + index_template: + composed_of: + - assistant-chat-mappings + - assistant-chat-settings + data_stream: + allow_custom_routing: false + hidden: false + ignore_missing_component_templates: [] + index_patterns: + - so-assistant-chat* + priority: 501 + template: + mappings: + date_detection: false + dynamic_templates: + - strings_as_keyword: + mapping: + ignore_above: 1024 + type: keyword + match_mapping_type: string + settings: + index: + lifecycle: + name: so-assistant-chat-logs + mapping: + total_fields: + limit: 1500 + number_of_replicas: 0 + number_of_shards: 1 + refresh_interval: 1s + sort: + field: '@timestamp' + order: desc + policy: + phases: + hot: + actions: {} + min_age: 0ms + so-assistant-session: + index_sorting: false + index_template: + composed_of: + - assistant-session-mappings + - assistant-session-settings + data_stream: + allow_custom_routing: false + hidden: false + ignore_missing_component_templates: [] + index_patterns: + - so-assistant-session* + priority: 501 + template: + mappings: + date_detection: false + dynamic_templates: + - strings_as_keyword: + mapping: + ignore_above: 1024 + type: keyword + match_mapping_type: string + settings: + index: + lifecycle: + name: so-assistant-session-logs + mapping: + total_fields: + limit: 1500 + number_of_replicas: 0 + number_of_shards: 1 + refresh_interval: 1s + sort: + field: '@timestamp' + order: desc + policy: + phases: + hot: + actions: {} + min_age: 0ms so-endgame: index_sorting: false index_template: @@ -1243,6 +1323,70 @@ elasticsearch: set_priority: priority: 50 min_age: 30d + so-logs-agent-monitor: + index_sorting: false + index_template: + composed_of: + - event-mappings + - so-elastic-agent-monitor + - so-fleet_integrations.ip_mappings-1 + - so-fleet_globals-1 + - so-fleet_agent_id_verification-1 + data_stream: + allow_custom_routing: false + hidden: false + ignore_missing_component_templates: + - logs-agent-monitor@custom + index_patterns: + - logs-agent-monitor-* + priority: 501 + template: + mappings: + _meta: + managed: true + managed_by: security_onion + package: + name: elastic_agent + settings: + index: + lifecycle: + name: so-logs-agent-monitor-logs + mapping: + total_fields: + limit: 5000 + number_of_replicas: 0 + sort: + field: '@timestamp' + order: desc + policy: + _meta: + managed: true + managed_by: security_onion + package: + name: elastic_agent + phases: + cold: + actions: + set_priority: + priority: 0 + min_age: 60d + delete: + actions: + delete: {} + min_age: 365d + hot: + actions: + rollover: + max_age: 30d + max_primary_shard_size: 50gb + set_priority: + priority: 100 + min_age: 0ms + warm: + actions: + set_priority: + priority: 50 + min_age: 30d so-logs-elastic_agent_x_apm_server: index_sorting: false index_template: diff --git a/salt/elasticsearch/files/ingest/elasticagent.monitor b/salt/elasticsearch/files/ingest/elasticagent.monitor new file mode 100644 index 000000000..09d8297c4 --- /dev/null +++ b/salt/elasticsearch/files/ingest/elasticagent.monitor @@ -0,0 +1,36 @@ +{ + "processors": [ + { + "set": { + "field": "event.dataset", + "value": "gridmetrics.agents", + "ignore_failure": true + } + }, + { + "set": { + "field": "event.module", + "value": "gridmetrics", + "ignore_failure": true + } + }, + { + "remove": { + "field": [ + "host", + "elastic_agent", + "agent" + ], + "ignore_missing": true, + "ignore_failure": true + } + }, + { + "json": { + "field": "message", + "add_to_root": true, + "ignore_failure": true + } + } + ] +} \ No newline at end of file diff --git a/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json b/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json new file mode 100644 index 000000000..50440fbed --- /dev/null +++ b/salt/elasticsearch/templates/component/elastic-agent/so-elastic-agent-monitor.json @@ -0,0 +1,43 @@ +{ + "template": { + "mappings": { + "properties": { + "agent": { + "type": "object", + "properties": { + "hostname": { + "ignore_above": 1024, + "type": "keyword" + }, + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "last_checkin_status": { + "ignore_above": 1024, + "type": "keyword" + }, + "last_checkin": { + "type": "date" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "offline_duration_hours": { + "type": "integer" + }, + "policy_id": { + "ignore_above": 1024, + "type": "keyword" + }, + "status": { + "ignore_above": 1024, + "type": "keyword" + } + } + } + } + } + } +} \ No newline at end of file diff --git a/salt/elasticsearch/templates/component/so/assistant-chat-mappings.json b/salt/elasticsearch/templates/component/so/assistant-chat-mappings.json new file mode 100644 index 000000000..3433acbd6 --- /dev/null +++ b/salt/elasticsearch/templates/component/so/assistant-chat-mappings.json @@ -0,0 +1,104 @@ +{ + "template": { + "mappings": { + "properties": { + "@timestamp": { + "type": "date" + }, + "so_kind": { + "ignore_above": 1024, + "type": "keyword" + }, + "so_operation": { + "ignore_above": 1024, + "type": "keyword" + }, + "so_chat": { + "properties": { + "role": { + "ignore_above": 1024, + "type": "keyword" + }, + "content": { + "type": "object", + "enabled": false + }, + "sessionId": { + "ignore_above": 1024, + "type": "keyword" + }, + "createTime": { + "type": "date" + }, + "deletedAt": { + "type": "date" + }, + "tags": { + "ignore_above": 1024, + "type": "keyword" + }, + "tool_use_id": { + "ignore_above": 1024, + "type": "keyword" + }, + "userId": { + "ignore_above": 1024, + "type": "keyword" + }, + "message": { + "properties": { + "id": { + "ignore_above": 1024, + "type": "keyword" + }, + "type": { + "ignore_above": 1024, + "type": "keyword" + }, + "role": { + "ignore_above": 1024, + "type": "keyword" + }, + "model": { + "ignore_above": 1024, + "type": "keyword" + }, + "contentStr": { + "type": "text" + }, + "contentBlocks": { + "type": "nested", + "enabled": false + }, + "stopReason": { + "ignore_above": 1024, + "type": "keyword" + }, + "stopSequence": { + "ignore_above": 1024, + "type": "keyword" + }, + "usage": { + "properties": { + "input_tokens": { + "type": "long" + }, + "output_tokens": { + "type": "long" + }, + "credits": { + "type": "long" + } + } + } + } + } + } + } + } + } + }, + "_meta": { + "ecs_version": "1.12.2" + } +} diff --git a/salt/elasticsearch/templates/component/so/assistant-chat-settings.json b/salt/elasticsearch/templates/component/so/assistant-chat-settings.json new file mode 100644 index 000000000..0281fa0e1 --- /dev/null +++ b/salt/elasticsearch/templates/component/so/assistant-chat-settings.json @@ -0,0 +1,7 @@ +{ + "template": {}, + "version": 1, + "_meta": { + "description": "default settings for common Security Onion Assistant indices" + } +} diff --git a/salt/elasticsearch/templates/component/so/assistant-session-mappings.json b/salt/elasticsearch/templates/component/so/assistant-session-mappings.json new file mode 100644 index 000000000..b72bbb389 --- /dev/null +++ b/salt/elasticsearch/templates/component/so/assistant-session-mappings.json @@ -0,0 +1,44 @@ +{ + "template": { + "mappings": { + "properties": { + "@timestamp": { + "type": "date" + }, + "so_kind": { + "ignore_above": 1024, + "type": "keyword" + }, + "so_session": { + "properties": { + "title": { + "ignore_above": 1024, + "type": "keyword" + }, + "sessionId": { + "ignore_above": 1024, + "type": "keyword" + }, + "createTime": { + "type": "date" + }, + "deleteTime": { + "type": "date" + }, + "tags": { + "ignore_above": 1024, + "type": "keyword" + }, + "userId": { + "ignore_above": 1024, + "type": "keyword" + } + } + } + } + } + }, + "_meta": { + "ecs_version": "1.12.2" + } +} diff --git a/salt/elasticsearch/templates/component/so/assistant-session-settings.json b/salt/elasticsearch/templates/component/so/assistant-session-settings.json new file mode 100644 index 000000000..0281fa0e1 --- /dev/null +++ b/salt/elasticsearch/templates/component/so/assistant-session-settings.json @@ -0,0 +1,7 @@ +{ + "template": {}, + "version": 1, + "_meta": { + "description": "default settings for common Security Onion Assistant indices" + } +} diff --git a/salt/firewall/defaults.yaml b/salt/firewall/defaults.yaml index 0c43b8c0b..a11492e88 100644 --- a/salt/firewall/defaults.yaml +++ b/salt/firewall/defaults.yaml @@ -1230,6 +1230,10 @@ firewall: portgroups: - elasticsearch_node - elasticsearch_rest + managerhype: + portgroups: + - elasticsearch_node + - elasticsearch_rest standalone: portgroups: - elasticsearch_node @@ -1377,6 +1381,10 @@ firewall: portgroups: - elasticsearch_node - elasticsearch_rest + managerhype: + portgroups: + - elasticsearch_node + - elasticsearch_rest standalone: portgroups: - elasticsearch_node @@ -1579,6 +1587,9 @@ firewall: portgroups: - redis - elastic_agent_data + managerhype: + portgroups: + - elastic_agent_data self: portgroups: - redis @@ -1696,6 +1707,9 @@ firewall: managersearch: portgroups: - openssh + managerhype: + portgroups: + - openssh standalone: portgroups: - openssh @@ -1758,6 +1772,8 @@ firewall: portgroups: [] managersearch: portgroups: [] + managerhype: + portgroups: [] standalone: portgroups: [] customhostgroup0: diff --git a/salt/firewall/map.jinja b/salt/firewall/map.jinja index 4347d2b31..8bd0512ec 100644 --- a/salt/firewall/map.jinja +++ b/salt/firewall/map.jinja @@ -25,7 +25,7 @@ {% set KAFKA_EXTERNAL_ACCESS = salt['pillar.get']('kafka:config:external_access:enabled', default=False) %} {% set kafka_node_type = salt['pillar.get']('kafka:nodes:'+ GLOBALS.hostname + ':role') %} -{% if role in ['manager', 'managersearch', 'standalone'] %} +{% if role.startswith('manager') or role == 'standalone' %} {% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[role].portgroups.append('kafka_controller') %} {% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups.receiver.portgroups.append('kafka_controller') %} {% endif %} @@ -38,8 +38,8 @@ {% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups.receiver.portgroups.append('kafka_controller') %} {% endif %} -{% if role in ['manager', 'managersearch', 'standalone', 'receiver'] %} -{% for r in ['manager', 'managersearch', 'standalone', 'receiver', 'fleet', 'idh', 'sensor', 'searchnode','heavynode', 'elastic_agent_endpoint', 'desktop'] %} +{% if role.startswith('manager') or role in ['standalone', 'receiver'] %} +{% for r in ['manager', 'managersearch', 'managerhype', 'standalone', 'receiver', 'fleet', 'idh', 'sensor', 'searchnode','heavynode', 'elastic_agent_endpoint', 'desktop'] %} {% if FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[r] is defined %} {% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[r].portgroups.append('kafka_data') %} {% endif %} @@ -48,11 +48,11 @@ {% if KAFKA_EXTERNAL_ACCESS %} {# Kafka external access only applies for Kafka nodes with the broker role. #} -{% if role in ['manager', 'managersearch', 'standalone', 'receiver'] and 'broker' in kafka_node_type %} +{% if role.startswith('manager') or role in ['standalone', 'receiver'] and 'broker' in kafka_node_type %} {% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups.external_kafka.portgroups.append('kafka_external_access') %} {% endif %} {% endif %} {% endif %} -{% set FIREWALL_MERGED = salt['pillar.get']('firewall', FIREWALL_DEFAULT.firewall, merge=True) %} \ No newline at end of file +{% set FIREWALL_MERGED = salt['pillar.get']('firewall', FIREWALL_DEFAULT.firewall, merge=True) %} diff --git a/salt/logrotate/defaults.yaml b/salt/logrotate/defaults.yaml index 2f7247ff2..479b598f5 100644 --- a/salt/logrotate/defaults.yaml +++ b/salt/logrotate/defaults.yaml @@ -268,3 +268,12 @@ logrotate: - nocompress - create - sharedscripts + /opt/so/log/agents/agent-monitor*_x_log: + - daily + - rotate 14 + - missingok + - compress + - create + - extension .log + - dateext + - dateyesterday \ No newline at end of file diff --git a/salt/logrotate/soc_logrotate.yaml b/salt/logrotate/soc_logrotate.yaml index 56f879e4f..6f0272ef0 100644 --- a/salt/logrotate/soc_logrotate.yaml +++ b/salt/logrotate/soc_logrotate.yaml @@ -175,3 +175,10 @@ logrotate: multiline: True global: True forcedType: "[]string" + "/opt/so/log/agents/agent-monitor*_x_log": + description: List of logrotate options for this file. + title: /opt/so/log/agents/agent-monitor*.log + advanced: True + multiline: True + global: True + forcedType: "[]string" diff --git a/salt/logstash/map.jinja b/salt/logstash/map.jinja index 95ec6b85d..5aad1daa9 100644 --- a/salt/logstash/map.jinja +++ b/salt/logstash/map.jinja @@ -17,7 +17,7 @@ {% for node_type, node_details in redis_node_data.items() | sort %} {% if GLOBALS.role in ['so-searchnode', 'so-standalone', 'so-managersearch', 'so-fleet'] %} -{% if node_type in ['manager', 'managersearch', 'standalone', 'receiver' ] %} +{% if node_type.startswith('manager') or node_type in ['standalone', 'receiver'] %} {% for hostname in redis_node_data[node_type].keys() %} {% do LOGSTASH_REDIS_NODES.append({hostname:node_details[hostname].ip}) %} {% endfor %} @@ -47,7 +47,7 @@ {% endif %} {# Disable logstash on manager & receiver nodes unless it has an override configured #} {% if not KAFKA_LOGSTASH %} -{% if GLOBALS.role in ['so-manager', 'so-receiver'] and GLOBALS.hostname not in KAFKA_LOGSTASH %} +{% if GLOBALS.role in ['so-manager', 'so-managerhype', 'so-receiver'] and GLOBALS.hostname not in KAFKA_LOGSTASH %} {% do LOGSTASH_MERGED.update({'enabled': False}) %} {% endif %} {% endif %} diff --git a/salt/manager/defaults.yaml b/salt/manager/defaults.yaml index 708900af6..239075f74 100644 --- a/salt/manager/defaults.yaml +++ b/salt/manager/defaults.yaml @@ -5,3 +5,12 @@ manager: minute: 0 additionalCA: '' insecureSkipVerify: False + agent_monitoring: + enabled: False + config: + critical_agents: [] + custom_kquery: + offline_threshold: 5 + realert_threshold: 5 + page_size: 250 + run_interval: 5 diff --git a/salt/manager/init.sls b/salt/manager/init.sls index 737d753f4..f59c33652 100644 --- a/salt/manager/init.sls +++ b/salt/manager/init.sls @@ -34,6 +34,26 @@ agents_log_dir: - user - group +agents_conf_dir: + file.directory: + - name: /opt/so/conf/agents + - user: root + - group: root + - recurse: + - user + - group + +{% if MANAGERMERGED.agent_monitoring.config.critical_agents | length > 0 %} +critical_agents_patterns: + file.managed: + - name: /opt/so/conf/agents/critical-agents.txt + - contents: {{ MANAGERMERGED.agent_monitoring.config.critical_agents }} +{% else %} +remove_critical_agents_config: + file.absent: + - name: /opt/so/conf/agents/critical-agents.txt +{% endif %} + yara_log_dir: file.directory: - name: /opt/so/log/yarasync @@ -127,6 +147,21 @@ so_fleetagent_status: - month: '*' - dayweek: '*' +so_fleetagent_monitor: +{% if MANAGERMERGED.agent_monitoring.enabled %} + cron.present: +{% else %} + cron.absent: +{% endif %} + - name: /bin/flock -n /opt/so/log/agents/agent-monitor.lock /usr/sbin/so-elastic-agent-monitor + - identifier: so_fleetagent_monitor + - user: root + - minute: '*/{{ MANAGERMERGED.agent_monitoring.config.run_interval }}' + - hour: '*' + - daymonth: '*' + - month: '*' + - dayweek: '*' + socore_own_saltstack_default: file.directory: - name: /opt/so/saltstack/default diff --git a/salt/manager/soc_manager.yaml b/salt/manager/soc_manager.yaml index cf78658de..f0d699f58 100644 --- a/salt/manager/soc_manager.yaml +++ b/salt/manager/soc_manager.yaml @@ -37,3 +37,44 @@ manager: forcedType: bool global: True helpLink: proxy.html + agent_monitoring: + enabled: + description: Enable monitoring elastic agents for health issues. Can be used to trigger an alert when a 'critical' agent hasn't checked in with fleet for longer than the configured offline threshold. + global: True + helpLink: elastic-fleet.html + forcedType: bool + config: + critical_agents: + description: List of 'critical' agents to log when they haven't checked in longer than the maximum allowed time. If there are no 'critical' agents specified all offline agents will be logged once they reach the offline threshold. + global: True + multiline: True + helpLink: elastic-fleet.html + forcedType: "[]string" + custom_kquery: + description: For more granular control over what agents to monitor for offline|degraded status add a kquery here. It is recommended to create & test within Elastic Fleet first to ensure your agents are targeted correctly using the query. eg 'status:offline AND tags:INFRA' + global: True + helpLink: elastic-fleet.html + forcedType: string + advanced: True + offline_threshold: + description: The maximum allowed time in hours a 'critical' agent has been offline before being logged. + global: True + helpLink: elastic-fleet.html + forcedType: int + realert_threshold: + description: The time to pass before another alert for an offline agent exceeding the offline_threshold is generated. + global: True + helpLink: elastic-fleet.html + forcedType: int + page_size: + description: The amount of agents that can be processed per API request to fleet. + global: True + helpLink: elastic-fleet.html + forcedType: int + advanced: True + run_interval: + description: The time in minutes between checking fleet agent statuses. + global: True + advanced: True + helpLink: elastic-fleet.html + forcedType: int diff --git a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor new file mode 100644 index 000000000..f8c3162b4 --- /dev/null +++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor @@ -0,0 +1,254 @@ +{%- from 'manager/map.jinja' import MANAGERMERGED -%} +{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%} +{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%} +{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%} +{%- set REALERT_THRESHOLD = MANAGERMERGED.agent_monitoring.config.realert_threshold -%} +#!/bin/bash + +set -euo pipefail + +LOG_DIR="/opt/so/log/agents" +LOG_FILE="$LOG_DIR/agent-monitor.log" +CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config" +FLEET_API="http://localhost:5601/api/fleet/agents" +{#- When using custom kquery ignore critical agents patterns. Since we want all the results of custom query logged #} +{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %} +CRITICAL_AGENTS_FILE="/dev/null" +{%- else %} +CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt" +{%- endif %} +OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }} +REALERT_THRESHOLD={{ REALERT_THRESHOLD }} +PAGE_SIZE="{{ PAGE_SIZE }}" + +log_message() { + local level="$1" + local message="$2" + echo "$(date -u +"%Y-%m-%dT%H:%M:%SZ") [$level] $message" >&2 +} + +matches_critical_pattern() { + local hostname="$1" + local pattern_file="$2" + + # If critical agents file doesn't exist or is empty, match all + if [ ! -f "$pattern_file" ] || [ ! -s "$pattern_file" ]; then + return 0 + fi + + local hostname_lower=$(echo "$hostname" | tr '[:upper:]' '[:lower:]') + + while IFS= read -r pattern || [ -n "$pattern" ]; do + # empty lines and comments + [[ -z "$pattern" || "$pattern" =~ ^[[:space:]]*# ]] && continue + + # cut whitespace + pattern=$(echo "$pattern" | xargs) + + local pattern_lower=$(echo "$pattern" | tr '[:upper:]' '[:lower:]') + + # Replace * with bash wildcard + local bash_pattern="${pattern_lower//\*/.*}" + + # Check if hostname matches the pattern + if [[ "$hostname_lower" =~ ^${bash_pattern}$ ]]; then + return 0 + fi + done < "$pattern_file" + + return 1 +} + +calculate_offline_hours() { + local last_checkin="$1" + local current_time=$(date +%s) + local checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null || echo "0") + + if [ "$checkin_time" -eq "0" ]; then + echo "0" + return + fi + + local diff=$((current_time - checkin_time)) + echo $((diff / 3600)) +} + +check_recent_log_entries() { + local agent_hostname="$1" + + if [ ! -f "$LOG_FILE" ]; then + return 1 + fi + + local current_time=$(date +%s) + local threshold_seconds=$((REALERT_THRESHOLD * 3600)) + local agent_hostname_lower=$(echo "$agent_hostname" | tr '[:upper:]' '[:lower:]') + local most_recent_timestamp="" + + while IFS= read -r line; do + [ -z "$line" ] && continue + + local logged_hostname=$(echo "$line" | jq -r '.["agent.hostname"] // empty' 2>/dev/null) + local logged_timestamp=$(echo "$line" | jq -r '.["@timestamp"] // empty' 2>/dev/null) + + [ -z "$logged_hostname" ] || [ -z "$logged_timestamp" ] && continue + + local logged_hostname_lower=$(echo "$logged_hostname" | tr '[:upper:]' '[:lower:]') + + if [ "$logged_hostname_lower" = "$agent_hostname_lower" ]; then + most_recent_timestamp="$logged_timestamp" + fi + done < <(tail -n 1000 "$LOG_FILE" 2>/dev/null) + + # If there is agent entry (within last 1000), check the time difference + if [ -n "$most_recent_timestamp" ]; then + local logged_time=$(date -d "$most_recent_timestamp" +%s 2>/dev/null || echo "0") + + if [ "$logged_time" -ne "0" ]; then + local time_diff=$((current_time - logged_time)) + local hours_diff=$((time_diff / 3600)) + + # Skip if last agent timestamp was more recent than realert threshold + if ((hours_diff < REALERT_THRESHOLD)); then + return 0 + fi + fi + fi + + # Agent has not been logged within realert threshold + return 1 +} + +main() { + log_message "INFO" "Starting Fleet agent status check" + + # Check if critical agents file is configured + if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then + log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE" + log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')" + else + log_message "INFO" "No critical agents filter found, monitoring all agents" + fi + + log_message "INFO" "Querying Fleet API" + + local page=1 + local total_agents=0 + local processed_agents=0 + local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + + {%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %} + log_message "INFO" "Using custom kquery: {{ CUSTOM_KQUERY }}" + FLEET_QUERY="${FLEET_API}?kuery={{ CUSTOM_KQUERY | urlencode }}&perPage=${PAGE_SIZE}&page=${page}" + {%- else %} + log_message "INFO" "Using default query (all offline or degraded agents)" + FLEET_QUERY="${FLEET_API}?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=${PAGE_SIZE}&page=${page}" + {%- endif %} + + while true; do + log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)" + + if ! response_body=$(curl -K "$CURL_CONFIG" \ + -s --fail \ + "$FLEET_QUERY" \ + -H 'kbn-xsrf: true' 2>/dev/null); then + log_message "ERROR" "Failed to query Fleet API (page $page)" + exit 1 + fi + + # pagination info + current_total=$(echo "$response_body" | jq -r '.total // 0') + current_page=$(echo "$response_body" | jq -r '.page // 1') + agents_in_page=$(echo "$response_body" | jq -r '.list | length') + + # Update total + if [ "$page" -eq 1 ]; then + total_agents="$current_total" + log_message "INFO" "Found $total_agents total agents across all pages" + fi + + log_message "INFO" "Processing page $current_page with $agents_in_page agents" + + # Process agents from current page + mapfile -t agents < <(echo "$response_body" | jq -c '.list[]') + + for agent in "${agents[@]}"; do + # Grab agent details + agent_id=$(echo "$agent" | jq -r '.id // "unknown"') + agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"') + agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"') + agent_status=$(echo "$agent" | jq -r '.status // "unknown"') + last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""') + last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"') + policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"') + + # Only log agents that are offline or degraded (skip inactive agents) + # Fleetserver agents can show multiple versions as 'inactive' + if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then + # Check if agent matches critical agent patterns (if configured) + if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then + log_message "WARN" "${agent_hostname^^} is ${agent_status^^}, but does not match configured critical agents patterns. Not logging ${agent_status^^} agent" + continue # Skip this agent if it doesn't match any critical agent pattern + fi + + offline_hours=$(calculate_offline_hours "$last_checkin") + + if [ "$offline_hours" -lt "$OFFLINE_THRESHOLD_HOURS" ]; then + log_message "INFO" "${agent_hostname^^} has been offline for ${offline_hours}h (threshold: ${OFFLINE_THRESHOLD_HOURS}h). Not logging ${agent_status^^} agent until it reaches threshold" + continue + fi + + # Check if this agent was already logged within the realert_threshold + if check_recent_log_entries "$agent_hostname"; then + log_message "INFO" "Skipping $agent_hostname (status: $agent_status) - already logged within last ${REALERT_THRESHOLD}h" + continue + fi + + log_entry=$(echo 'null' | jq -c \ + --arg ts "$current_timestamp" \ + --arg id "$agent_id" \ + --arg hostname "$agent_hostname" \ + --arg name "$agent_name" \ + --arg status "$agent_status" \ + --arg last_checkin "$last_checkin" \ + --arg last_checkin_status "$last_checkin_status" \ + --arg policy_id "$policy_id" \ + --arg offline_hours "$offline_hours" \ + '{ + "@timestamp": $ts, + "agent.id": $id, + "agent.hostname": $hostname, + "agent.name": $name, + "agent.status": $status, + "agent.last_checkin": $last_checkin, + "agent.last_checkin_status": $last_checkin_status, + "agent.policy_id": $policy_id, + "agent.offline_duration_hours": ($offline_hours | tonumber) + }') + + echo "$log_entry" >> "$LOG_FILE" + + log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)" + fi + done + + processed_agents=$((processed_agents + agents_in_page)) + + if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then + log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents" + break + fi + + page=$((page + 1)) + + # Limit pagination loops incase of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size + if [ "$page" -gt 100 ]; then + log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size" + break + fi + done + + log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents" +} + +main "$@" diff --git a/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset b/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset index 0b116564d..1e32268da 100644 --- a/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset +++ b/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset @@ -15,6 +15,7 @@ require_manager echo echo "This script will remove the current Elastic Fleet install and all of its data and then rerun Elastic Fleet setup." echo "Deployed Elastic Agents will no longer be enrolled and will need to be reinstalled." +echo "Only the Elastic Fleet instance on the Manager will be reinstalled - dedicated Fleet node config will removed and will need to be reinstalled." echo "This script should only be used as a last resort to reinstall Elastic Fleet." echo echo "If you would like to proceed, then type AGREE and press ENTER." diff --git a/salt/nginx/etc/nginx.conf b/salt/nginx/etc/nginx.conf index 742f5d08d..caa05bbff 100644 --- a/salt/nginx/etc/nginx.conf +++ b/salt/nginx/etc/nginx.conf @@ -196,19 +196,23 @@ http { } location / { - auth_request /auth/sessions/whoami; - auth_request_set $userid $upstream_http_x_kratos_authenticated_identity_id; - proxy_set_header x-user-id $userid; - proxy_pass http://{{ GLOBALS.manager }}:9822/; - proxy_read_timeout 300; - proxy_connect_timeout 300; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header Proxy ""; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "Upgrade"; - proxy_set_header X-Forwarded-Proto $scheme; + auth_request /auth/sessions/whoami; + auth_request_set $userid $upstream_http_x_kratos_authenticated_identity_id; + proxy_set_header x-user-id $userid; + proxy_pass http://{{ GLOBALS.manager }}:9822/; + proxy_read_timeout 300; + proxy_connect_timeout 300; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Proxy ""; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "Upgrade"; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_buffering off; + proxy_cache off; + proxy_request_buffering off; } location ~ ^/auth/.*?(login|oidc/callback) { diff --git a/salt/repo/client/map.jinja b/salt/repo/client/map.jinja index 94228c300..2c040c3c5 100644 --- a/salt/repo/client/map.jinja +++ b/salt/repo/client/map.jinja @@ -26,9 +26,9 @@ 'rocky-devel.repo', 'rocky-extras.repo', 'rocky.repo', - 'oracle-linux-ol9', - 'uek-ol9', - 'virt-oll9' + 'oracle-linux-ol9.repo', + 'uek-ol9.repo', + 'virt-ol9.repo' ] %} {% else %} diff --git a/salt/sensoroni/config.sls b/salt/sensoroni/config.sls index 8298209f1..225d0ddb4 100644 --- a/salt/sensoroni/config.sls +++ b/salt/sensoroni/config.sls @@ -18,6 +18,7 @@ sensoroniagentconf: - group: 939 - mode: 600 - template: jinja + - show_changes: False analyzersdir: file.directory: diff --git a/salt/soc/defaults.yaml b/salt/soc/defaults.yaml index 7bb2c1f03..58b3a3827 100644 --- a/salt/soc/defaults.yaml +++ b/salt/soc/defaults.yaml @@ -1491,6 +1491,8 @@ soc: - repo: file:///nsm/airgap-resources/playbooks/securityonion-resources-playbooks branch: main folder: securityonion-normalized + assistant: + apiUrl: https://onionai.securityonion.net salt: queueDir: /opt/sensoroni/queue timeoutMs: 45000 @@ -2541,3 +2543,12 @@ soc: - ' -priv' condition: all of selection_* level: 'high' # info | low | medium | high | critical + assistant: + enabled: false + investigationPrompt: Investigate Alert ID {socid} + contextLimitSmall: 200000 + contextLimitLarge: 1000000 + thresholdColorRatioLow: 0.5 + thresholdColorRatioMed: 0.75 + thresholdColorRatioMax: 1 + lowBalanceColorAlert: 500000 \ No newline at end of file diff --git a/salt/soc/soc_soc.yaml b/salt/soc/soc_soc.yaml index 2d0eb3792..4af20d444 100644 --- a/salt/soc/soc_soc.yaml +++ b/salt/soc/soc_soc.yaml @@ -580,7 +580,42 @@ soc: - field: folder label: Folder airgap: *pbRepos + assistant: + apiUrl: + description: The URL of the AI gateway. + advanced: True + global: True client: + assistant: + enabled: + description: Set to true to enable the Onion AI assistant in SOC. + global: True + investigationPrompt: + description: Prompt given to Onion AI when beginning an investigation. + global: True + contextLimitSmall: + description: Smaller context limit for Onion AI. + global: True + advanced: True + contextLimitLarge: + description: Larger context limit for Onion AI. + global: True + advanced: True + thresholdColorRatioLow: + description: Lower visual context color change threshold. + global: True + advanced: True + thresholdColorRatioMed: + description: Middle visual context color change threshold. + global: True + advanced: True + thresholdColorRatioMax: + description: Max visual context color change threshold. + global: True + advanced: True + lowBalanceColorAlert: + description: Onion AI credit amount at which balance turns red. + advanced: True apiTimeoutMs: description: Duration (in milliseconds) to wait for a response from the SOC server API before giving up and showing an error on the SOC UI. global: True diff --git a/setup/so-functions b/setup/so-functions index dbe198958..9ab11a904 100755 --- a/setup/so-functions +++ b/setup/so-functions @@ -29,46 +29,8 @@ title() { } fail_setup() { - local failure_reason="${1:-Unknown failure}" - - # Capture call stack information - local calling_function="${FUNCNAME[1]:-main}" - local calling_line="${BASH_LINENO[0]:-unknown}" - local calling_file="${BASH_SOURCE[1]:-unknown}" - - # Build call stack trace - local call_stack="" - local i=1 - while [[ $i -lt ${#FUNCNAME[@]} ]]; do - local func="${FUNCNAME[$i]}" - local file="${BASH_SOURCE[$i]##*/}" # Get basename only - local line="${BASH_LINENO[$((i-1))]}" - - if [[ -n "$call_stack" ]]; then - call_stack="$call_stack -> " - fi - call_stack="$call_stack$func($file:$line)" - ((i++)) - done - - # Enhanced error logging with call stack - error "FAILURE: Called from $calling_function() at line $calling_line" - error "REASON: $failure_reason" - error "STACK: $call_stack" - error "Setup encountered an unrecoverable failure: $failure_reason" - - # Create detailed failure file with enhanced information - { - echo "SETUP_FAILURE_TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" - echo "SETUP_FAILURE_REASON=$failure_reason" - echo "SETUP_CALLING_FUNCTION=$calling_function" - echo "SETUP_CALLING_LINE=$calling_line" - echo "SETUP_CALLING_FILE=${calling_file##*/}" - echo "SETUP_CALL_STACK=$call_stack" - echo "SETUP_LOG_LOCATION=$setup_log" - echo "SETUP_FAILURE_DETAILS=Check $setup_log for complete error details" - } > /root/failure - + error "Setup encountered an unrecoverable failure, exiting" + touch /root/failure exit 1 }