Merge remote-tracking branch 'origin/2.4/dev' into idstools-refactor

This commit is contained in:
DefensiveDepth
2025-09-17 10:42:43 -04:00
253 changed files with 183225 additions and 162248 deletions

View File

@@ -454,6 +454,7 @@ function add_sensor_to_minion() {
echo "sensor:"
echo " interface: '$INTERFACE'"
echo " mtu: 9000"
echo " channels: 1"
echo "zeek:"
echo " enabled: True"
echo " config:"

View File

@@ -419,6 +419,7 @@ preupgrade_changes() {
[[ "$INSTALLEDVERSION" == 2.4.141 ]] && up_to_2.4.150
[[ "$INSTALLEDVERSION" == 2.4.150 ]] && up_to_2.4.160
[[ "$INSTALLEDVERSION" == 2.4.160 ]] && up_to_2.4.170
[[ "$INSTALLEDVERSION" == 2.4.170 ]] && up_to_2.4.180
true
}
@@ -448,6 +449,7 @@ postupgrade_changes() {
[[ "$POSTVERSION" == 2.4.141 ]] && post_to_2.4.150
[[ "$POSTVERSION" == 2.4.150 ]] && post_to_2.4.160
[[ "$POSTVERSION" == 2.4.160 ]] && post_to_2.4.170
[[ "$POSTVERSION" == 2.4.170 ]] && post_to_2.4.180
true
}
@@ -588,9 +590,6 @@ post_to_2.4.160() {
}
post_to_2.4.170() {
echo "Regenerating Elastic Agent Installers"
/sbin/so-elastic-agent-gen-installers
# Update kibana default space
salt-call state.apply kibana.config queue=True
echo "Updating Kibana default space"
@@ -599,6 +598,16 @@ post_to_2.4.170() {
POSTVERSION=2.4.170
}
post_to_2.4.180() {
echo "Regenerating Elastic Agent Installers"
/sbin/so-elastic-agent-gen-installers
# Force update to Kafka output policy
/usr/sbin/so-kafka-fleet-output-policy --force
POSTVERSION=2.4.180
}
repo_sync() {
echo "Sync the local repo."
su socore -c '/usr/sbin/so-repo-sync' || fail "Unable to complete so-repo-sync."
@@ -850,10 +859,15 @@ up_to_2.4.170() {
touch /opt/so/saltstack/local/pillar/$state/adv_$state.sls /opt/so/saltstack/local/pillar/$state/soc_$state.sls
done
INSTALLEDVERSION=2.4.170
}
up_to_2.4.180() {
# Elastic Update for this release, so download Elastic Agent files
determine_elastic_agent_upgrade
INSTALLEDVERSION=2.4.170
INSTALLEDVERSION=2.4.180
}
add_hydra_pillars() {

View File

@@ -0,0 +1,254 @@
{%- from 'manager/map.jinja' import MANAGERMERGED -%}
{%- set OFFLINE_THRESHOLD_HOURS = MANAGERMERGED.agent_monitoring.config.offline_threshold -%}
{%- set PAGE_SIZE = MANAGERMERGED.agent_monitoring.config.page_size -%}
{%- set CUSTOM_KQUERY = MANAGERMERGED.agent_monitoring.config.custom_kquery -%}
{%- set REALERT_THRESHOLD = MANAGERMERGED.agent_monitoring.config.realert_threshold -%}
#!/bin/bash
set -euo pipefail
LOG_DIR="/opt/so/log/agents"
LOG_FILE="$LOG_DIR/agent-monitor.log"
CURL_CONFIG="/opt/so/conf/elasticsearch/curl.config"
FLEET_API="http://localhost:5601/api/fleet/agents"
{#- When using custom kquery ignore critical agents patterns. Since we want all the results of custom query logged #}
{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
CRITICAL_AGENTS_FILE="/dev/null"
{%- else %}
CRITICAL_AGENTS_FILE="/opt/so/conf/agents/critical-agents.txt"
{%- endif %}
OFFLINE_THRESHOLD_HOURS={{ OFFLINE_THRESHOLD_HOURS }}
REALERT_THRESHOLD={{ REALERT_THRESHOLD }}
PAGE_SIZE="{{ PAGE_SIZE }}"
log_message() {
local level="$1"
local message="$2"
echo "$(date -u +"%Y-%m-%dT%H:%M:%SZ") [$level] $message" >&2
}
matches_critical_pattern() {
local hostname="$1"
local pattern_file="$2"
# If critical agents file doesn't exist or is empty, match all
if [ ! -f "$pattern_file" ] || [ ! -s "$pattern_file" ]; then
return 0
fi
local hostname_lower=$(echo "$hostname" | tr '[:upper:]' '[:lower:]')
while IFS= read -r pattern || [ -n "$pattern" ]; do
# empty lines and comments
[[ -z "$pattern" || "$pattern" =~ ^[[:space:]]*# ]] && continue
# cut whitespace
pattern=$(echo "$pattern" | xargs)
local pattern_lower=$(echo "$pattern" | tr '[:upper:]' '[:lower:]')
# Replace * with bash wildcard
local bash_pattern="${pattern_lower//\*/.*}"
# Check if hostname matches the pattern
if [[ "$hostname_lower" =~ ^${bash_pattern}$ ]]; then
return 0
fi
done < "$pattern_file"
return 1
}
calculate_offline_hours() {
local last_checkin="$1"
local current_time=$(date +%s)
local checkin_time=$(date -d "$last_checkin" +%s 2>/dev/null || echo "0")
if [ "$checkin_time" -eq "0" ]; then
echo "0"
return
fi
local diff=$((current_time - checkin_time))
echo $((diff / 3600))
}
check_recent_log_entries() {
local agent_hostname="$1"
if [ ! -f "$LOG_FILE" ]; then
return 1
fi
local current_time=$(date +%s)
local threshold_seconds=$((REALERT_THRESHOLD * 3600))
local agent_hostname_lower=$(echo "$agent_hostname" | tr '[:upper:]' '[:lower:]')
local most_recent_timestamp=""
while IFS= read -r line; do
[ -z "$line" ] && continue
local logged_hostname=$(echo "$line" | jq -r '.["agent.hostname"] // empty' 2>/dev/null)
local logged_timestamp=$(echo "$line" | jq -r '.["@timestamp"] // empty' 2>/dev/null)
[ -z "$logged_hostname" ] || [ -z "$logged_timestamp" ] && continue
local logged_hostname_lower=$(echo "$logged_hostname" | tr '[:upper:]' '[:lower:]')
if [ "$logged_hostname_lower" = "$agent_hostname_lower" ]; then
most_recent_timestamp="$logged_timestamp"
fi
done < <(tail -n 1000 "$LOG_FILE" 2>/dev/null)
# If there is agent entry (within last 1000), check the time difference
if [ -n "$most_recent_timestamp" ]; then
local logged_time=$(date -d "$most_recent_timestamp" +%s 2>/dev/null || echo "0")
if [ "$logged_time" -ne "0" ]; then
local time_diff=$((current_time - logged_time))
local hours_diff=$((time_diff / 3600))
# Skip if last agent timestamp was more recent than realert threshold
if ((hours_diff < REALERT_THRESHOLD)); then
return 0
fi
fi
fi
# Agent has not been logged within realert threshold
return 1
}
main() {
log_message "INFO" "Starting Fleet agent status check"
# Check if critical agents file is configured
if [ -f "$CRITICAL_AGENTS_FILE" ] && [ -s "$CRITICAL_AGENTS_FILE" ]; then
log_message "INFO" "Using critical agents filter from: $CRITICAL_AGENTS_FILE"
log_message "INFO" "Patterns: $(grep -v '^#' "$CRITICAL_AGENTS_FILE" 2>/dev/null | xargs | tr ' ' ',')"
else
log_message "INFO" "No critical agents filter found, monitoring all agents"
fi
log_message "INFO" "Querying Fleet API"
local page=1
local total_agents=0
local processed_agents=0
local current_timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
{%- if CUSTOM_KQUERY != None and CUSTOM_KQUERY | length > 0 %}
log_message "INFO" "Using custom kquery: {{ CUSTOM_KQUERY }}"
FLEET_QUERY="${FLEET_API}?kuery={{ CUSTOM_KQUERY | urlencode }}&perPage=${PAGE_SIZE}&page=${page}"
{%- else %}
log_message "INFO" "Using default query (all offline or degraded agents)"
FLEET_QUERY="${FLEET_API}?kuery=status%3Aoffline%20OR%20status%3Adegraded&perPage=${PAGE_SIZE}&page=${page}"
{%- endif %}
while true; do
log_message "INFO" "Fetching page $page (${PAGE_SIZE} agents per page)"
if ! response_body=$(curl -K "$CURL_CONFIG" \
-s --fail \
"$FLEET_QUERY" \
-H 'kbn-xsrf: true' 2>/dev/null); then
log_message "ERROR" "Failed to query Fleet API (page $page)"
exit 1
fi
# pagination info
current_total=$(echo "$response_body" | jq -r '.total // 0')
current_page=$(echo "$response_body" | jq -r '.page // 1')
agents_in_page=$(echo "$response_body" | jq -r '.list | length')
# Update total
if [ "$page" -eq 1 ]; then
total_agents="$current_total"
log_message "INFO" "Found $total_agents total agents across all pages"
fi
log_message "INFO" "Processing page $current_page with $agents_in_page agents"
# Process agents from current page
mapfile -t agents < <(echo "$response_body" | jq -c '.list[]')
for agent in "${agents[@]}"; do
# Grab agent details
agent_id=$(echo "$agent" | jq -r '.id // "unknown"')
agent_hostname=$(echo "$agent" | jq -r '.local_metadata.host.hostname // "unknown"')
agent_name=$(echo "$agent" | jq -r '.local_metadata.host.name // "unknown"')
agent_status=$(echo "$agent" | jq -r '.status // "unknown"')
last_checkin=$(echo "$agent" | jq -r '.last_checkin // ""')
last_checkin_status=$(echo "$agent" | jq -r '.last_checkin_status // "unknown"')
policy_id=$(echo "$agent" | jq -r '.policy_id // "unknown"')
# Only log agents that are offline or degraded (skip inactive agents)
# Fleetserver agents can show multiple versions as 'inactive'
if [ "$agent_status" = "offline" ] || [ "$agent_status" = "degraded" ]; then
# Check if agent matches critical agent patterns (if configured)
if ! matches_critical_pattern "$agent_hostname" "$CRITICAL_AGENTS_FILE"; then
log_message "WARN" "${agent_hostname^^} is ${agent_status^^}, but does not match configured critical agents patterns. Not logging ${agent_status^^} agent"
continue # Skip this agent if it doesn't match any critical agent pattern
fi
offline_hours=$(calculate_offline_hours "$last_checkin")
if [ "$offline_hours" -lt "$OFFLINE_THRESHOLD_HOURS" ]; then
log_message "INFO" "${agent_hostname^^} has been offline for ${offline_hours}h (threshold: ${OFFLINE_THRESHOLD_HOURS}h). Not logging ${agent_status^^} agent until it reaches threshold"
continue
fi
# Check if this agent was already logged within the realert_threshold
if check_recent_log_entries "$agent_hostname"; then
log_message "INFO" "Skipping $agent_hostname (status: $agent_status) - already logged within last ${REALERT_THRESHOLD}h"
continue
fi
log_entry=$(echo 'null' | jq -c \
--arg ts "$current_timestamp" \
--arg id "$agent_id" \
--arg hostname "$agent_hostname" \
--arg name "$agent_name" \
--arg status "$agent_status" \
--arg last_checkin "$last_checkin" \
--arg last_checkin_status "$last_checkin_status" \
--arg policy_id "$policy_id" \
--arg offline_hours "$offline_hours" \
'{
"@timestamp": $ts,
"agent.id": $id,
"agent.hostname": $hostname,
"agent.name": $name,
"agent.status": $status,
"agent.last_checkin": $last_checkin,
"agent.last_checkin_status": $last_checkin_status,
"agent.policy_id": $policy_id,
"agent.offline_duration_hours": ($offline_hours | tonumber)
}')
echo "$log_entry" >> "$LOG_FILE"
log_message "INFO" "Logged offline agent: $agent_hostname (status: $agent_status, offline: ${offline_hours}h)"
fi
done
processed_agents=$((processed_agents + agents_in_page))
if [ "$agents_in_page" -eq 0 ] || [ "$processed_agents" -ge "$total_agents" ]; then
log_message "INFO" "Completed processing all pages. Total processed: $processed_agents agents"
break
fi
page=$((page + 1))
# Limit pagination loops incase of any issues. If agent count is high enough increase page_size in SOC manager.agent_monitoring.config.page_size
if [ "$page" -gt 100 ]; then
log_message "ERROR" "Reached maximum page limit (100). Issue with script or extremely large fleet deployment. Consider increasing page_size in SOC -> manager.agent_monitoring.config.page_size"
break
fi
done
log_message "INFO" "Fleet agent status check completed. Processed $processed_agents out of $total_agents agents"
}
main "$@"

View File

@@ -15,6 +15,7 @@ require_manager
echo
echo "This script will remove the current Elastic Fleet install and all of its data and then rerun Elastic Fleet setup."
echo "Deployed Elastic Agents will no longer be enrolled and will need to be reinstalled."
echo "Only the Elastic Fleet instance on the Manager will be reinstalled - dedicated Fleet node config will removed and will need to be reinstalled."
echo "This script should only be used as a last resort to reinstall Elastic Fleet."
echo
echo "If you would like to proceed, then type AGREE and press ENTER."