Merge remote-tracking branch 'origin/2.4/dev' into idstools-refactor

2025-12-22 08:53:06 +01:00 · 2025-09-17 10:42:43 -04:00
parent a77157391c 562b7e54cb
commit ded520c2c1
253 changed files with 183225 additions and 162248 deletions
--- a/salt/manager/tools/sbin/so-minion
+++ b/salt/manager/tools/sbin/so-minion
@@ -454,6 +454,7 @@ function add_sensor_to_minion() {
        echo "sensor:"
        echo "  interface: '$INTERFACE'"
        echo "  mtu: 9000"
+		echo "  channels: 1"
        echo "zeek:"
        echo "  enabled: True"
        echo "  config:"
--- a/salt/manager/tools/sbin/soup
+++ b/salt/manager/tools/sbin/soup
@@ -419,6 +419,7 @@ preupgrade_changes() {
    [[ "$INSTALLEDVERSION" == 2.4.141 ]] && up_to_2.4.150
    [[ "$INSTALLEDVERSION" == 2.4.150 ]] && up_to_2.4.160
    [[ "$INSTALLEDVERSION" == 2.4.160 ]] && up_to_2.4.170
+    [[ "$INSTALLEDVERSION" == 2.4.170 ]] && up_to_2.4.180
    true
 }

@@ -448,6 +449,7 @@ postupgrade_changes() {
    [[ "$POSTVERSION"  == 2.4.141 ]] && post_to_2.4.150
    [[ "$POSTVERSION"  == 2.4.150 ]] && post_to_2.4.160
    [[ "$POSTVERSION"  == 2.4.160 ]] && post_to_2.4.170
+    [[ "$POSTVERSION"  == 2.4.170 ]] && post_to_2.4.180
    true
 }

@@ -588,9 +590,6 @@ post_to_2.4.160() {
 }

 post_to_2.4.170() {
-  echo "Regenerating Elastic Agent Installers"
-  /sbin/so-elastic-agent-gen-installers
-
  # Update kibana default space
  salt-call state.apply kibana.config queue=True
  echo "Updating Kibana default space"
@@ -599,6 +598,16 @@ post_to_2.4.170() {
  POSTVERSION=2.4.170
 }

+# Post-upgrade step dispatched by postupgrade_changes when POSTVERSION is
+# 2.4.170. Regenerates the Elastic Agent installers, forces a refresh of the
+# Kafka Fleet output policy, then records 2.4.180 as the completed version.
+post_to_2.4.180() {
+  echo "Regenerating Elastic Agent Installers"
+  /sbin/so-elastic-agent-gen-installers
+
+  # Force update to Kafka output policy
+  /usr/sbin/so-kafka-fleet-output-policy --force
+
+  # Advance the marker that the version checks in postupgrade_changes compare against.
+  POSTVERSION=2.4.180
+}
+
 repo_sync() {
  echo "Sync the local repo."
  su socore -c '/usr/sbin/so-repo-sync' || fail "Unable to complete so-repo-sync."
@@ -850,10 +859,15 @@ up_to_2.4.170() {
    touch /opt/so/saltstack/local/pillar/$state/adv_$state.sls /opt/so/saltstack/local/pillar/$state/soc_$state.sls
  done

+
+  INSTALLEDVERSION=2.4.170
+}
+
+up_to_2.4.180() {
  # Elastic Update for this release, so download Elastic Agent files
  determine_elastic_agent_upgrade

-  INSTALLEDVERSION=2.4.170
+  INSTALLEDVERSION=2.4.180
 }

 add_hydra_pillars() {
--- a/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
+++ b/salt/manager/tools/sbin_jinja/so-elastic-agent-monitor
@@ -0,0 +1,254 @@
+{%- from 'manager/map.jinja' import MANAGERMERGED -%}
+{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%}
+{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%}
+{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%}
+{%- set REALERT_THRESHOLD = MANAGERMERGED.agent_monitoring.config.realert_threshold -%}
+#!/bin/bash
+
+# Strict mode: stop on command failure, on use of an unset variable, and on a
+# failure in any stage of a pipeline.
+set -euo pipefail
+
+# LOG_FILE receives one JSON line per reported agent (appended by main).
+LOG_DIR="/opt/so/log/agents"
+LOG_FILE="$LOG_DIR/agent-monitor.log"
+# Passed to curl via -K when querying the Fleet API.
+CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
+FLEET_API="http://localhost:5601/api/fleet/agents"
+{#- When using custom kquery ignore critical agents patterns. Since we want all the results of custom query logged #}
+{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+# /dev/null is not a regular file, so matches_critical_pattern takes its
+# "no pattern file" path and accepts every hostname.
+CRITICAL_AGENTS_FILE="/dev/null"
+{%- else %}
+CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
+{%- endif %}
+# Values below are rendered from MANAGERMERGED.agent_monitoring.config.
+# Both thresholds are compared as whole hours.
+OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
+REALERT_THRESHOLD={{ REALERT_THRESHOLD }}
+PAGE_SIZE="{{ PAGE_SIZE }}"
+
+# Write one diagnostic line to stderr, formatted as
+# "<UTC ISO-8601 timestamp> [<level>] <message>".
+#   $1 - severity label
+#   $2 - message text
+log_message() {
+    local severity="$1" text="$2"
+    printf '%s [%s] %s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$severity" "$text" >&2
+}
+
+# Decide whether a hostname is covered by the critical-agent pattern file.
+#   $1 - agent hostname (compared case-insensitively)
+#   $2 - path to the pattern file: one pattern per line, '*' matches any run of
+#        characters, blank lines and lines starting with '#' are ignored
+# Returns 0 when the hostname matches a pattern, or when the pattern file is not
+# a regular, non-empty file (no filter configured => everything matches).
+# Returns 1 when a usable pattern file exists and nothing in it matches.
+# Note: the loop variable `pattern` is not declared local.
+matches_critical_pattern() {
+    local hostname="$1"
+    local pattern_file="$2"
+    
+    # If critical agents file doesn't exist or is empty, match all
+    if [ ! -f "$pattern_file" ] || [ ! -s "$pattern_file" ]; then
+        return 0
+    fi
+    
+    local hostname_lower=$(echo "$hostname" | tr '[:upper:]' '[:lower:]')
+    
+    # `|| [ -n "$pattern" ]` keeps a final line that has no trailing newline.
+    while IFS= read -r pattern || [ -n "$pattern" ]; do
+        # Skip empty lines and comment lines (optionally indented '#')
+        [[ -z "$pattern" || "$pattern" =~ ^[[:space:]]*# ]] && continue
+        
+        # Trim leading/trailing whitespace by round-tripping through xargs
+        pattern=$(echo "$pattern" | xargs)
+        
+        local pattern_lower=$(echo "$pattern" | tr '[:upper:]' '[:lower:]')
+        
+        # Turn each '*' into the regex '.*'; the comparison below is a regex
+        # match (=~), not shell globbing.
+        # NOTE(review): other regex metacharacters in a pattern (for example
+        # the '.' in an FQDN) are not escaped and are interpreted by =~.
+        local bash_pattern="${pattern_lower//\*/.*}"
+        
+        # Anchored at both ends, so the pattern must cover the whole hostname
+        if [[ "$hostname_lower" =~ ^${bash_pattern}$ ]]; then
+            return 0
+        fi
+    done < "$pattern_file"
+    
+    return 1
+}
+
+# Print the whole number of hours elapsed since an agent's last check-in.
+#   $1 - last check-in timestamp, in any format accepted by `date -d`
+# Prints "0" when the timestamp is empty/blank or cannot be parsed.
+calculate_offline_hours() {
+    local last_checkin="$1"
+    local current_time checkin_time
+
+    # GNU date does not reject an empty -d argument; it resolves it to the
+    # start of today, which would yield a bogus duration for agents that have
+    # no check-in recorded. Treat blank input like an unparseable timestamp.
+    if [[ -z "${last_checkin//[[:space:]]/}" ]]; then
+        echo "0"
+        return
+    fi
+
+    current_time=$(date +%s)
+    # Fall back to 0 (the "unknown" sentinel checked below) when parsing fails.
+    checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null || echo "0")
+
+    if [ "$checkin_time" -eq "0" ]; then
+        echo "0"
+        return
+    fi
+
+    echo $(( (current_time - checkin_time) / 3600 ))
+}
+
+# Report whether an agent already has a sufficiently recent entry in $LOG_FILE.
+#   $1 - agent hostname (compared case-insensitively)
+# Reads:   LOG_FILE, REALERT_THRESHOLD (hours)
+# Returns: 0 when the newest matching entry among the last 1000 log lines is
+#          less than REALERT_THRESHOLD hours old (the caller skips the agent);
+#          1 when the log file is missing, no entry matches, the entry's
+#          timestamp cannot be parsed, or the entry is older than the window.
+check_recent_log_entries() {
+    local agent_hostname="$1"
+
+    if [ ! -f "$LOG_FILE" ]; then
+        return 1
+    fi
+
+    local current_time
+    current_time=$(date +%s)
+    local agent_hostname_lower="${agent_hostname,,}"
+    local most_recent_timestamp=""
+    local line logged_hostname logged_timestamp
+
+    while IFS= read -r line; do
+        [ -z "$line" ] && continue
+
+        # jq fails on lines that are not JSON objects; such lines are not entries.
+        logged_hostname=$(jq -r '.["agent.hostname"] // empty' <<<"$line" 2>/dev/null) || continue
+        logged_timestamp=$(jq -r '.["@timestamp"] // empty' <<<"$line" 2>/dev/null) || continue
+
+        # Both fields are required for a line to count as an entry.
+        if [ -z "$logged_hostname" ] || [ -z "$logged_timestamp" ]; then
+            continue
+        fi
+
+        # Later lines overwrite earlier ones, so the last match in the file wins.
+        if [ "${logged_hostname,,}" = "$agent_hostname_lower" ]; then
+            most_recent_timestamp="$logged_timestamp"
+        fi
+    done < <(tail -n 1000 "$LOG_FILE" 2>/dev/null)
+
+    # If there is an agent entry (within the last 1000 lines), check its age
+    if [ -n "$most_recent_timestamp" ]; then
+        local logged_time
+        # 0 is the "could not parse" sentinel.
+        logged_time=$(date -d "$most_recent_timestamp" +%s 2>/dev/null || echo "0")
+
+        if [ "$logged_time" -ne "0" ]; then
+            local hours_diff=$(( (current_time - logged_time) / 3600 ))
+
+            # Skip if last agent timestamp was more recent than realert threshold
+            if ((hours_diff < REALERT_THRESHOLD)); then
+                return 0
+            fi
+        fi
+    fi
+
+    # Agent has not been logged within realert threshold
+    return 1
+}
+
+# Entry point. Queries the Fleet agents API page by page and appends one JSON
+# line to $LOG_FILE for every offline/degraded agent that
+#   - matches the critical-agent patterns (matches_critical_pattern),
+#   - has been out of contact for at least OFFLINE_THRESHOLD_HOURS, and
+#   - was not already logged within REALERT_THRESHOLD hours.
+# Progress and skip reasons go to stderr via log_message.
+# Exits with status 1 when a Fleet API request fails.
+main() {
+    log_message "INFO" "Starting Fleet agent status check"
+
+    # Check if critical agents file is configured
+    if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
+        log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
+        log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')"
+    else
+        log_message "INFO" "No critical agents filter found, monitoring all agents"
+    fi
+
+    log_message "INFO" "Querying Fleet API"
+
+    local page=1
+    local total_agents=0
+    local processed_agents=0
+    local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+    # Query URL without the page number; the page is appended per request.
+    local fleet_query_base
+
+    {%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
+    # NOTE(review): the kquery is rendered verbatim inside a double-quoted shell
+    # string here; quotes, $ or backticks in it would be interpreted by bash.
+    log_message "INFO" "Using custom kquery: {{ CUSTOM_KQUERY }}"
+    fleet_query_base="${FLEET_API}?kuery={{ CUSTOM_KQUERY | urlencode }}&perPage=${PAGE_SIZE}"
+    {%- else %}
+    log_message "INFO" "Using default query (all offline or degraded agents)"
+    fleet_query_base="${FLEET_API}?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=${PAGE_SIZE}"
+    {%- endif %}
+
+    while true; do
+        log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"
+
+        # Build the URL inside the loop: a URL assembled once up front keeps
+        # page=1 forever, so later pages would never be requested.
+        FLEET_QUERY="${fleet_query_base}&page=${page}"
+
+        if ! response_body=$(curl -K "$CURL_CONFIG" \
+            -s --fail \
+            "$FLEET_QUERY" \
+            -H 'kbn-xsrf: true' 2>/dev/null); then
+            log_message "ERROR" "Failed to query Fleet API (page $page)"
+            exit 1
+        fi
+
+        # pagination info
+        current_total=$(echo "$response_body" | jq -r '.total // 0')
+        current_page=$(echo "$response_body" | jq -r '.page // 1')
+        agents_in_page=$(echo "$response_body" | jq -r '.list | length')
+
+        # Update total
+        if [ "$page" -eq 1 ]; then
+            total_agents="$current_total"
+            log_message "INFO" "Found $total_agents total agents across all pages"
+        fi
+
+        log_message "INFO" "Processing page $current_page with $agents_in_page agents"
+
+        # Process agents from current page (one compact JSON object per array element)
+        mapfile -t agents < <(echo "$response_body" | jq -c '.list[]')
+
+        for agent in "${agents[@]}"; do
+            # Grab agent details
+            agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
+            agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
+            agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
+            agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
+            last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
+            last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
+            policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
+
+            # Only log agents that are offline or degraded (skip inactive agents)
+            # Fleetserver agents can show multiple versions as 'inactive'
+            if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
+                # Check if agent matches critical agent patterns (if configured)
+                if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
+                    log_message "WARN" "${agent_hostname^^} is ${agent_status^^}, but does not match configured critical agents patterns. Not logging ${agent_status^^} agent"
+                    continue  # Skip this agent if it doesn't match any critical agent pattern
+                fi
+
+                offline_hours=$(calculate_offline_hours "$last_checkin")
+
+                if [ "$offline_hours" -lt "$OFFLINE_THRESHOLD_HOURS" ]; then
+                    log_message "INFO" "${agent_hostname^^} has been offline for ${offline_hours}h (threshold: ${OFFLINE_THRESHOLD_HOURS}h). Not logging ${agent_status^^} agent until it reaches threshold"
+                    continue
+                fi
+
+                # Check if this agent was already logged within the realert_threshold
+                if check_recent_log_entries "$agent_hostname"; then
+                    log_message "INFO" "Skipping $agent_hostname (status: $agent_status) - already logged within last ${REALERT_THRESHOLD}h"
+                    continue
+                fi
+
+                # Build the entry with jq --arg so every value is JSON-escaped.
+                log_entry=$(echo 'null' | jq -c \
+                    --arg ts "$current_timestamp" \
+                    --arg id "$agent_id" \
+                    --arg hostname "$agent_hostname" \
+                    --arg name "$agent_name" \
+                    --arg status "$agent_status" \
+                    --arg last_checkin "$last_checkin" \
+                    --arg last_checkin_status "$last_checkin_status" \
+                    --arg policy_id "$policy_id" \
+                    --arg offline_hours "$offline_hours" \
+                    '{
+                        "@timestamp": $ts,
+                        "agent.id": $id,
+                        "agent.hostname": $hostname,
+                        "agent.name": $name,
+                        "agent.status": $status,
+                        "agent.last_checkin": $last_checkin,
+                        "agent.last_checkin_status": $last_checkin_status,
+                        "agent.policy_id": $policy_id,
+                        "agent.offline_duration_hours": ($offline_hours | tonumber)
+                    }')
+
+                echo "$log_entry" >> "$LOG_FILE"
+
+                log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
+            fi
+        done
+
+        processed_agents=$((processed_agents + agents_in_page))
+
+        # Stop on an empty page or once every agent reported by page 1 has been seen.
+        if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then
+            log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents"
+            break
+        fi
+
+        page=$((page + 1))
+
+        # Limit pagination loops in case of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
+        if [ "$page" -gt 100 ]; then
+            log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
+            break
+        fi
+    done
+
+    log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
+}
+
+main "$@"
--- a/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset
+++ b/salt/manager/tools/sbin_jinja/so-elastic-fleet-reset
@@ -15,6 +15,7 @@ require_manager
 echo
 echo "This script will remove the current Elastic Fleet install and all of its data and then rerun Elastic Fleet setup."
 echo "Deployed Elastic Agents will no longer be enrolled and will need to be reinstalled."
+echo "Only the Elastic Fleet instance on the Manager will be reinstalled - dedicated Fleet node config will be removed and will need to be reinstalled."
 echo "This script should only be used as a last resort to reinstall Elastic Fleet." 
 echo
 echo "If you would like to proceed, then type AGREE and press ENTER."