Compare commits

..

4 Commits

Author SHA1 Message Date
Jorge Reyes 63a2e20698 Merge pull request #15982 from Security-Onion-Solutions/reyesj2/wip
don't create stack trace when set -e is disabled
2026-06-18 15:25:41 -05:00
reyesj2 22d5c96bd5 don't create stack trace when set -e is disabled 2026-06-18 14:56:29 -05:00
Mike Reeves 28fdd1eb6f Merge pull request #15970 from Security-Onion-Solutions/udev
Pin NIC names by MAC via udev (run-once) from the common state
2026-06-18 14:28:09 -04:00
Mike Reeves 80c39d612c Pin NIC names by MAC via udev (run-once) from the common state
Add so-nic-pin, which writes by-MAC persistent-net udev rules pinning each
physical NIC to its current name so a kernel upgrade can't renumber the
interfaces Security Onion binds by name (host:mainint, sensor:mainint, bond0).

Gated by the drop file /opt/so/state/nic_names_pinned: run-once on highstate,
and an admin can pre-create the marker to opt out. Wired into common/init.sls
as pin_nic_names, guarded by a matching unless.
2026-06-11 18:40:43 -04:00
9 changed files with 387 additions and 381 deletions
+11
View File
@@ -130,6 +130,17 @@ common_sbin:
- so-pcap-import
{% endif %}
# Pin physical NIC names by MAC (run-once) so a kernel upgrade can't renumber the
# interfaces SO binds by name. The marker keeps it a one-time setup; an admin can
# pre-create the marker to opt out.
pin_nic_names:
cmd.run:
- name: /usr/sbin/so-nic-pin
- unless: 'test -e /opt/so/state/nic_names_pinned'
- require:
- file: common_sbin
- file: statedir
common_sbin_jinja:
file.recurse:
- name: /usr/sbin
+76
View File
@@ -0,0 +1,76 @@
#!/bin/bash
#
# so-nic-pin — pin physical NIC names by permanent MAC via classic by-MAC udev
# rules, so a kernel upgrade can't renumber them.
#
# Security Onion binds its management and monitor interfaces BY NAME in pillar
# (host:mainint, sensor:mainint, and bond0 is built on a specific physical NIC).
# A kernel upgrade can change the kernel/systemd-udevd predictable-naming output
# and renumber those NICs (e.g. enp1s0 -> enp2s0), which breaks the grid: the
# pillar references a name that no longer exists and bond/bridge bring-up fails.
#
# This writes /etc/udev/rules.d/70-persistent-net.rules pinning each PHYSICAL NIC
# to its CURRENT name by its PERMANENT MAC, freezing the names across future kernel
# changes. It only writes the rules file; it does NOT live-trigger a rename (the
# rules apply on the next boot/kernel, and a live rename would be disruptive).
#
# Run-once: gated by the drop file /opt/so/state/nic_names_pinned. If the marker is
# present the script does nothing, so an admin can pre-create it to opt out. Invoked
# from the common state on every highstate; the marker keeps it a one-time setup.
NET_RULES_FILE="/etc/udev/rules.d/70-persistent-net.rules"
MARKER="/opt/so/state/nic_names_pinned"
log() { echo -e "[so-nic-pin] $*"; }
# Echo "<name> <permanent-mac>" for every PHYSICAL NIC. A physical NIC is backed by a
# real device (has device/driver), which excludes bond0/sobridge/docker0/veth*/lo whose
# MACs are dynamic and must never be pinned. The PERMANENT MAC is used (ethtool -P, with
# fallbacks), not the current one: an enslaved bond member's current MAC is rewritten to
# the bond's, so matching on it would be wrong/ambiguous.
physical_nics() {
local path n mac
for path in /sys/class/net/*; do
n="${path##*/}"
[ "$n" = "lo" ] && continue
[ -e "${path}/device/driver" ] || continue # real device only
mac="$(ethtool -P "$n" 2>/dev/null | awk '/Permanent address/{print $NF}')"
case "$mac" in ""|00:00:00:00:00:00) mac="$(cat "${path}/bonding_slave/perm_hwaddr" 2>/dev/null)" ;; esac
case "$mac" in ""|00:00:00:00:00:00) mac="$(cat "${path}/address" 2>/dev/null)" ;; esac
case "$mac" in ""|00:00:00:00:00:00) continue ;; esac
echo "$n $mac"
done
}
# Turn "<name> <mac>" lines on stdin into classic by-MAC persistent-net udev rules.
render_net_rules() {
echo "# Generated by so-nic-pin: pin NIC names by MAC so kernel upgrades can't renumber them."
echo "# Security Onion binds its management/monitor interfaces by name; do not hand-edit."
local n mac
while read -r n mac; do
[ -n "$n" ] || continue
printf 'SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="%s", NAME="%s"\n' \
"$mac" "$n"
done
}
[ "$(id -u)" -eq 0 ] || exit 0 # salt runs us as root; bail quietly otherwise
[ -e "${MARKER}" ] && exit 0 # run-once guard (mirrors the state's unless)
nics="$(physical_nics)"
if [ -z "${nics}" ]; then
log "no physical NICs detected — nothing to pin (will retry on next highstate)"
exit 0 # do NOT drop the marker; let it retry later
fi
log "pinning physical NICs by permanent MAC:"
echo "${nics}" | sed 's/^/ /'
[ -f "${NET_RULES_FILE}" ] && cp -f "${NET_RULES_FILE}" "${NET_RULES_FILE}.bak"
echo "${nics}" | render_net_rules > "${NET_RULES_FILE}" || {
log "ERROR: failed to write ${NET_RULES_FILE}"
exit 1
}
mkdir -p "$(dirname "${MARKER}")" && touch "${MARKER}"
log "wrote ${NET_RULES_FILE} ($(grep -c '^SUBSYSTEM' "${NET_RULES_FILE}") NIC(s) pinned); dropped ${MARKER}"
@@ -30,82 +30,6 @@ fleet_api() {
curl -sK /opt/so/conf/elasticsearch/curl.config -L "localhost:5601/api/fleet/${QUERYPATH}" "$@" --retry 3 --retry-delay 10 --fail 2>/dev/null
}
# Max number of concurrent Fleet write jobs (create/update). Override via env if needed.
MAX_FLEET_JOBS=${MAX_FLEET_JOBS:-10}
# Block until fewer than MAX_FLEET_JOBS background jobs are running.
elastic_fleet_throttle() {
while (( $(jobs -rp | wc -l) >= MAX_FLEET_JOBS )); do
wait -n
done
}
# Load every integration JSON in a directory into a single agent policy.
# The agent policy is fetched ONCE (not per file), and the create/update writes
# are dispatched as throttled background jobs.
# $1 AGENT_POLICY - the agent policy id/name to load integrations into
# $2 DIR - directory of integration *.json files
# $3 LABEL - human-readable label for log output
# $4 SKIP_CREATE_NAME - (optional) integration name to skip when creating (still updated if present)
# Returns 1 if any integration failed to create/update.
elastic_fleet_load_integrations_dir() {
local AGENT_POLICY=$1
local DIR=$2
local LABEL=$3
local SKIP_CREATE_NAME=$4
local POLICY_JSON FAIL_FILE OUT_DIR INTEGRATION NAME ID i
FAIL_FILE=$(mktemp)
# Each job buffers its full output (header + API response) into its own file so the
# parent can print them grouped and in submission order after concurrent writes finish.
OUT_DIR=$(mktemp -d)
i=0
# Fetch the agent policy a single time; we look up integration ids locally below.
POLICY_JSON=$(fleet_api "agent_policies/$AGENT_POLICY")
for INTEGRATION in "$DIR"/*.json; do
[ -e "$INTEGRATION" ] || continue
NAME=$(jq -r .name "$INTEGRATION")
ID=$(jq -r --arg n "$NAME" '.item.package_policies[]? | select(.name==$n) | .id' <<<"$POLICY_JSON")
elastic_fleet_throttle
{
local RESP
if [ -n "$ID" ]; then
printf "\n\n%s - Updating integration %s\n" "$LABEL" "$NAME"
if ! RESP=$(elastic_fleet_integration_update "$ID" "@$INTEGRATION"); then
flock 9; echo "update ${INTEGRATION##*/}" >&9
fi
printf '%s\n' "$RESP"
elif [ -n "$SKIP_CREATE_NAME" ] && [ "$NAME" == "$SKIP_CREATE_NAME" ]; then
printf "\n\n%s - Skipping creation of %s\n" "$LABEL" "$NAME"
else
printf "\n\n%s - Creating integration %s\n" "$LABEL" "$NAME"
if ! RESP=$(elastic_fleet_integration_create "@$INTEGRATION"); then
flock 9; echo "create ${INTEGRATION##*/}" >&9
fi
printf '%s\n' "$RESP"
fi
} >"$OUT_DIR/$(printf '%03d' "$i")" 9>>"$FAIL_FILE" &
i=$((i+1))
done
wait
# Emit per-integration output grouped and in submission order (glob sorts numerically).
cat "$OUT_DIR"/* 2>/dev/null
rm -rf "$OUT_DIR"
local rc=0
if [ -s "$FAIL_FILE" ]; then
printf "\n%s: failed integrations:\n" "$LABEL"
cat "$FAIL_FILE"
rc=1
fi
rm -f "$FAIL_FILE"
return $rc
}
elastic_fleet_integration_check() {
AGENT_POLICY=$1
@@ -122,9 +46,7 @@ elastic_fleet_integration_create() {
JSON_STRING=$1
# --retry-all-errors so transient 409 conflicts (concurrent writes to the same agent
# policy) are retried; curl --retry alone does not retry 409.
if ! fleet_api "package_policies" --retry-all-errors -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -XPOST -d "$JSON_STRING"; then
if ! fleet_api "package_policies" -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -XPOST -d "$JSON_STRING"; then
return 1
fi
}
@@ -155,9 +77,7 @@ elastic_fleet_integration_update() {
JSON_STRING=$2
# --retry-all-errors so transient 409 conflicts (concurrent writes to the same agent
# policy) are retried; curl --retry alone does not retry 409.
if ! fleet_api "package_policies/$UPDATE_ID" --retry-all-errors -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -XPUT -d "$JSON_STRING"; then
if ! fleet_api "package_policies/$UPDATE_ID" -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -XPUT -d "$JSON_STRING"; then
return 1
fi
}
@@ -18,26 +18,93 @@ if [ ! -f /opt/so/state/eaintegrations.txt ]; then
# Third, configure Elastic Defend Integration seperately
/usr/sbin/so-elastic-fleet-integration-policy-elastic-defend
# Each group fetches its agent policy once and dispatches create/update writes concurrently.
# Initial Endpoints
elastic_fleet_load_integrations_dir "endpoints-initial" \
/opt/so/conf/elastic-fleet/integrations/endpoints-initial "Initial Endpoints Policy" || RETURN_CODE=1
for INTEGRATION in /opt/so/conf/elastic-fleet/integrations/endpoints-initial/*.json; do
printf "\n\nInitial Endpoints Policy - Loading $INTEGRATION\n"
elastic_fleet_integration_check "endpoints-initial" "$INTEGRATION"
if [ -n "$INTEGRATION_ID" ]; then
printf "\n\nIntegration $NAME exists - Updating integration\n"
if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
RETURN_CODE=1
continue
fi
else
printf "\n\nIntegration does not exist - Creating integration\n"
if ! elastic_fleet_integration_create "@$INTEGRATION"; then
echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
RETURN_CODE=1
continue
fi
fi
done
# Grid Nodes - General
elastic_fleet_load_integrations_dir "so-grid-nodes_general" \
/opt/so/conf/elastic-fleet/integrations/grid-nodes_general "Grid Nodes Policy_General" || RETURN_CODE=1
for INTEGRATION in /opt/so/conf/elastic-fleet/integrations/grid-nodes_general/*.json; do
printf "\n\nGrid Nodes Policy_General - Loading $INTEGRATION\n"
elastic_fleet_integration_check "so-grid-nodes_general" "$INTEGRATION"
if [ -n "$INTEGRATION_ID" ]; then
printf "\n\nIntegration $NAME exists - Updating integration\n"
if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
RETURN_CODE=1
continue
fi
else
printf "\n\nIntegration does not exist - Creating integration\n"
if ! elastic_fleet_integration_create "@$INTEGRATION"; then
echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
RETURN_CODE=1
continue
fi
fi
done
# Grid Nodes - Heavy
elastic_fleet_load_integrations_dir "so-grid-nodes_heavy" \
/opt/so/conf/elastic-fleet/integrations/grid-nodes_heavy "Grid Nodes Policy_Heavy" || RETURN_CODE=1
for INTEGRATION in /opt/so/conf/elastic-fleet/integrations/grid-nodes_heavy/*.json; do
printf "\n\nGrid Nodes Policy_Heavy - Loading $INTEGRATION\n"
elastic_fleet_integration_check "so-grid-nodes_heavy" "$INTEGRATION"
if [ -n "$INTEGRATION_ID" ]; then
printf "\n\nIntegration $NAME exists - Updating integration\n"
if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
RETURN_CODE=1
continue
fi
else
printf "\n\nIntegration does not exist - Creating integration\n"
if ! elastic_fleet_integration_create "@$INTEGRATION"; then
echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
RETURN_CODE=1
continue
fi
fi
done
# Fleet Server - Optional integrations (one agent policy per FleetServer_* directory)
for FLEET_DIR in /opt/so/conf/elastic-fleet/integrations-optional/FleetServer*/; do
[ -d "$FLEET_DIR" ] || continue
FLEET_POLICY=$(basename "$FLEET_DIR")
elastic_fleet_load_integrations_dir "$FLEET_POLICY" \
"${FLEET_DIR%/}" "Fleet Server Policy" "elasticsearch-logs" || RETURN_CODE=1
# Fleet Server - Optional integrations
for INTEGRATION in /opt/so/conf/elastic-fleet/integrations-optional/FleetServer*/*.json; do
if ! [ "$INTEGRATION" == "/opt/so/conf/elastic-fleet/integrations-optional/FleetServer*/*.json" ]; then
FLEET_POLICY=`echo "$INTEGRATION"| cut -d'/' -f7`
printf "\n\nFleet Server Policy - Loading $INTEGRATION\n"
elastic_fleet_integration_check "$FLEET_POLICY" "$INTEGRATION"
if [ -n "$INTEGRATION_ID" ]; then
printf "\n\nIntegration $NAME exists - Updating integration\n"
if ! elastic_fleet_integration_update "$INTEGRATION_ID" "@$INTEGRATION"; then
echo -e "\nFailed to update integration for ${INTEGRATION##*/}"
RETURN_CODE=1
continue
fi
else
printf "\n\nIntegration does not exist - Creating integration\n"
if [ "$NAME" != "elasticsearch-logs" ]; then
if ! elastic_fleet_integration_create "@$INTEGRATION"; then
echo -e "\nFailed to create integration for ${INTEGRATION##*/}"
RETURN_CODE=1
continue
fi
fi
fi
fi
done
# Only create the state file if all policies were created/updated successfully
@@ -23,90 +23,73 @@ if [ $? -ne 0 ]; then
fi
default_packages=({% for pkg in SUPPORTED_PACKAGES %}"{{ pkg }}"{% if not loop.last %} {% endif %}{% endfor %})
# JSON array of the default packages, used by the jq filter below.
default_packages_json=$(printf '%s\n' "${default_packages[@]}" | jq -R . | jq -s '.')
# Output lock (serializes concurrent job output) and failure file (one marker line per
# failed integration). Mirrors the pattern used by elastic_fleet_load_integrations_dir.
OUTPUT_LOCK=$(mktemp)
FAIL_FILE=$(mktemp)
trap 'rm -f "$OUTPUT_LOCK" "$FAIL_FILE"' EXIT
# Cache of package name -> latest available version, so the same package is only looked up
# once instead of once per (policy, integration).
declare -A LATEST_VERSION_CACHE
ERROR=false
for AGENT_POLICY in $agent_policies; do
# Fetch the agent policy a single time; package name/version and integration id are all
# extracted locally below instead of re-fetching the same policy per integration.
if ! POLICY_JSON=$(fleet_api "agent_policies/$AGENT_POLICY"); then
if ! integrations=$(elastic_fleet_integration_policy_names "$AGENT_POLICY"); then
# this script upgrades default integration packages, exit 1 and let salt handle retrying
exit 1
fi
# One jq pass emits name/package.name/package.version/id for every eligible integration.
# The endpoint/fleet_server skips and the default-package gate are applied here in jq.
# $defaults (not $def, a jq reserved keyword) holds the default package list.
while IFS=$'\t' read -r INTEGRATION PACKAGE_NAME PACKAGE_VERSION INTEGRATION_ID; do
[ -n "$INTEGRATION" ] || continue
# Look up the latest available version once per package, then memoize it.
if [[ -z "${LATEST_VERSION_CACHE[$PACKAGE_NAME]+set}" ]]; then
if ! AVAILABLE_VERSION=$(elastic_fleet_package_latest_version_check "$PACKAGE_NAME"); then
echo "Error: Failed getting latest version for $PACKAGE_NAME"
for INTEGRATION in $integrations; do
if ! [[ "$INTEGRATION" == "elastic-defend-endpoints" ]] && ! [[ "$INTEGRATION" == "fleet_server-"* ]]; then
# Get package name so we know what package to look for when checking the current and latest available version
if ! PACKAGE_NAME=$(elastic_fleet_integration_policy_package_name "$AGENT_POLICY" "$INTEGRATION"); then
exit 1
fi
LATEST_VERSION_CACHE[$PACKAGE_NAME]=$AVAILABLE_VERSION
fi
AVAILABLE_VERSION=${LATEST_VERSION_CACHE[$PACKAGE_NAME]}
if [[ "$PACKAGE_VERSION" != "$AVAILABLE_VERSION" ]]; then
# Dry run, then (if clean) the actual upgrade, dispatched as a throttled background
# job. Each job builds its full log into one block, then flushes it under a single
# shared lock (OUTPUT_LOCK) so concurrent jobs never interleave on stdout; a failed
# job also appends a marker line to FAIL_FILE while holding that same lock.
elastic_fleet_throttle
{
block=$'\n'"Current $PACKAGE_NAME package version ($PACKAGE_VERSION) is not the same as the latest available package ($AVAILABLE_VERSION)..."$'\n'
block+="Upgrading $INTEGRATION..."$'\n'"Starting dry run..."$'\n'
fail=""
if ! DRYRUN_OUTPUT=$(elastic_fleet_integration_policy_dryrun_upgrade "$INTEGRATION_ID"); then
block+="Error: Failed to complete dry run for '$INTEGRATION_ID'."$'\n'
fail="dryrun $INTEGRATION"
elif [[ "$(jq .[].hasErrors <<<"$DRYRUN_OUTPUT")" == "false" ]]; then
block+="No errors detected. Proceeding with upgrade..."$'\n'
if ! elastic_fleet_integration_policy_upgrade "$INTEGRATION_ID"; then
block+="Error: Upgrade failed for $PACKAGE_NAME with integration ID '$INTEGRATION_ID'."$'\n'
fail="upgrade $INTEGRATION"
{%- if not AUTO_UPGRADE_INTEGRATIONS %}
if [[ " ${default_packages[@]} " =~ " $PACKAGE_NAME " ]]; then
{%- endif %}
# Get currently installed version of package
attempt=0
max_attempts=3
while [ $attempt -lt $max_attempts ]; do
if PACKAGE_VERSION=$(elastic_fleet_integration_policy_package_version "$AGENT_POLICY" "$INTEGRATION") && AVAILABLE_VERSION=$(elastic_fleet_package_latest_version_check "$PACKAGE_NAME"); then
break
fi
else
block+="Errors detected during dry run for $PACKAGE_NAME policy upgrade..."$'\n'
fail="dryrun-errors $INTEGRATION"
attempt=$((attempt + 1))
done
if [ $attempt -eq $max_attempts ]; then
echo "Error: Failed getting $PACKAGE_VERSION or $AVAILABLE_VERSION"
exit 1
fi
{
flock 9
printf '%s' "$block"
[ -n "$fail" ] && printf '%s\n' "$fail" >>"$FAIL_FILE"
} 9>>"$OUTPUT_LOCK"
} &
# Get integration ID
if ! INTEGRATION_ID=$(elastic_fleet_integration_id "$AGENT_POLICY" "$INTEGRATION"); then
exit 1
fi
if [[ "$PACKAGE_VERSION" != "$AVAILABLE_VERSION" ]]; then
# Dry run of the upgrade
echo ""
echo "Current $PACKAGE_NAME package version ($PACKAGE_VERSION) is not the same as the latest available package ($AVAILABLE_VERSION)..."
echo "Upgrading $INTEGRATION..."
echo "Starting dry run..."
if ! DRYRUN_OUTPUT=$(elastic_fleet_integration_policy_dryrun_upgrade "$INTEGRATION_ID"); then
exit 1
fi
DRYRUN_ERRORS=$(echo "$DRYRUN_OUTPUT" | jq .[].hasErrors)
# If no errors with dry run, proceed with actual upgrade
if [[ "$DRYRUN_ERRORS" == "false" ]]; then
echo "No errors detected. Proceeding with upgrade..."
if ! elastic_fleet_integration_policy_upgrade "$INTEGRATION_ID"; then
echo "Error: Upgrade failed for $PACKAGE_NAME with integration ID '$INTEGRATION_ID'."
ERROR=true
continue
fi
else
echo "Errors detected during dry run for $PACKAGE_NAME policy upgrade..."
ERROR=true
continue
fi
fi
{%- if not AUTO_UPGRADE_INTEGRATIONS %}
fi
{%- endif %}
fi
done < <(jq -r --argjson defaults "$default_packages_json" '
.item.package_policies[]
| select(.name != "elastic-defend-endpoints")
| select(.name | startswith("fleet_server-") | not)
{%- if not AUTO_UPGRADE_INTEGRATIONS %}
| select(.package.name | IN($defaults[]))
{%- endif %}
| [.name, .package.name, .package.version, .id] | @tsv
' <<<"$POLICY_JSON")
done
done
# Barrier: wait for every dispatched dry-run/upgrade job to finish.
wait
if [ -s "$FAIL_FILE" ]; then
printf '\nFailed integration upgrades:\n'
cat "$FAIL_FILE"
if [[ "$ERROR" == "true" ]]; then
exit 1
fi
echo
@@ -16,6 +16,7 @@
STATE_FILE_SUCCESS=/opt/so/state/estemplates.txt
INSTALLED_PACKAGE_LIST=/tmp/esfleet_installed_packages.json
BULK_INSTALL_PACKAGE_LIST=/tmp/esfleet_bulk_install.json
BULK_INSTALL_PACKAGE_TMP=/tmp/esfleet_bulk_install_tmp.json
BULK_INSTALL_OUTPUT=/opt/so/state/esfleet_bulk_install_results.json
INTEGRATION_PACKAGE_COMPONENTS=/opt/so/state/esfleet_package_components.json
INPUT_PACKAGE_COMPONENTS=/opt/so/state/esfleet_input_package_components.json
@@ -28,6 +29,29 @@ PENDING_UPDATE=false
# Requiring some level of manual Elastic Stack configuration before installation
EXCLUDED_INTEGRATIONS=('apm')
version_conversion(){
version=$1
echo "$version" | awk -F '.' '{ printf("%d%03d%03d\n", $1, $2, $3); }'
}
compare_versions() {
version1=$1
version2=$2
# Convert versions to numbers
num1=$(version_conversion "$version1")
num2=$(version_conversion "$version2")
# Compare using bc
if (( $(echo "$num1 < $num2" | bc -l) )); then
echo "less"
elif (( $(echo "$num1 > $num2" | bc -l) )); then
echo "greater"
else
echo "equal"
fi
}
IFS=$'\n'
agent_policies=$(elastic_fleet_agent_policy_ids)
if [ $? -ne 0 ]; then
@@ -39,23 +63,23 @@ default_packages=({% for pkg in SUPPORTED_PACKAGES %}"{{ pkg }}"{% if not loop.l
in_use_integrations=()
# Fetch each agent policy once; its package_policies[] already contain both the integration name
# and the .package.name, so extract all non-default package names locally in a single jq instead
# of re-fetching the same policy per integration.
default_packages_json=$(printf '%s\n' "${default_packages[@]}" | jq -R . | jq -s '.')
for AGENT_POLICY in $agent_policies; do
if ! policy_json=$(fleet_api "agent_policies/$AGENT_POLICY"); then
if ! integrations=$(elastic_fleet_integration_policy_names "$AGENT_POLICY"); then
# skip the agent policy if we can't get required info, let salt retry. Integrations loaded by this script are non-default integrations.
echo "Skipping $AGENT_POLICY.. "
continue
fi
# non-default integrations that are in-use in any policy
while IFS= read -r PACKAGE_NAME; do
[ -n "$PACKAGE_NAME" ] && in_use_integrations+=("$PACKAGE_NAME")
done < <(jq -r --argjson defaults "$default_packages_json" \
'.item.package_policies[].package.name | select(. as $n | ($defaults | index($n)) | not)' \
<<<"$policy_json")
for INTEGRATION in $integrations; do
if ! PACKAGE_NAME=$(elastic_fleet_integration_policy_package_name "$AGENT_POLICY" "$INTEGRATION"); then
echo "Not adding $INTEGRATION, couldn't get package name"
continue
fi
# non-default integrations that are in-use in any policy
if ! [[ " ${default_packages[@]} " =~ " $PACKAGE_NAME " ]]; then
in_use_integrations+=("$PACKAGE_NAME")
fi
done
done
if [[ -f $STATE_FILE_SUCCESS ]]; then
@@ -66,55 +90,72 @@ if [[ -f $STATE_FILE_SUCCESS ]]; then
rm -f $INSTALLED_PACKAGE_LIST
echo $latest_package_list | jq '{packages: [.items[] | {name: .name, latest_version: .version, installed_version: .installationInfo.version, subscription: .conditions.elastic.subscription }]}' >> $INSTALLED_PACKAGE_LIST
# Build the bulk install list and the per-package status messages with two jq passes
# instead of a per-package bash loop. The old loop forked ~10 processes per package
# (5 jq + awk/bc for the version compare) and re-parsed/rewrote a growing JSON file on
# every add (O(n^2)). Selection and messages below are identical to that logic.
SUB={% if SUB %}true{% else %}false{% endif %}
AUTOUP={% if AUTO_UPGRADE_INTEGRATIONS %}true{% else %}false{% endif %}
EXCLUDED_JSON=$(printf '%s\n' "${EXCLUDED_INTEGRATIONS[@]}" | jq -R 'select(length>0)' | jq -s '.')
INUSE_JSON=$(printf '%s\n' "${in_use_integrations[@]}" | jq -R 'select(length>0)' | jq -s 'unique')
while read -r package; do
# get package details
package_name=$(echo "$package" | jq -r '.name')
latest_version=$(echo "$package" | jq -r '.latest_version')
installed_version=$(echo "$package" | jq -r '.installed_version')
subscription=$(echo "$package" | jq -r '.subscription')
bulk_package=$(echo "$package" | jq '{name: .name, version: .latest_version}' )
# vnum replicates the previous version_conversion (%d%03d%03d of the first three dotted
# fields); needs() replicates the excluded/subscription/installed/upgrade/in-use logic.
JQ_DECISION='
def vnum:
[ (split(".")|.[0:3][] | gsub("[^0-9].*";"") | (if .=="" then "0" else . end) | tonumber) ]
| (.[0]//0)*1000000 + (.[1]//0)*1000 + (.[2]//0);
def needs($sub;$autoup;$excluded;$inuse):
.name as $n
| ($n | IN($excluded[]) | not)
and ( $sub or (.subscription==null or .subscription=="basic" or .subscription=="") )
and ( (.installed_version==null or .installed_version=="")
or ( ((.latest_version|vnum) > (.installed_version|vnum))
and ( $autoup or ($n | IN($inuse[]) | not) ) ) );'
if [[ ! "${EXCLUDED_INTEGRATIONS[@]}" =~ "$package_name" ]]; then
{% if not SUB %}
if [[ "$subscription" != "basic" && "$subscription" != "null" && -n "$subscription" ]]; then
# pass over integrations that require non-basic elastic license
echo "$package_name integration requires an Elastic license of $subscription or greater... skipping"
continue
else
if [[ "$installed_version" == "null" || -z "$installed_version" ]]; then
echo "$package_name is not installed... Adding to next update."
jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
JQ_ARGS=(--argjson sub "$SUB" --argjson autoup "$AUTOUP" --argjson excluded "$EXCLUDED_JSON" --argjson inuse "$INUSE_JSON")
PENDING_UPDATE=true
else
results=$(compare_versions "$latest_version" "$installed_version")
if [ $results == "greater" ]; then
{#- When auto_upgrade_integrations is false, skip upgrading in_use_integrations #}
{%- if not AUTO_UPGRADE_INTEGRATIONS %}
if ! [[ " ${in_use_integrations[@]} " =~ " $package_name " ]]; then
{%- endif %}
echo "$package_name is at version $installed_version latest version is $latest_version... Adding to next update."
jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
# (a) Per-package status messages (parity with the previous echo output).
jq -r "${JQ_ARGS[@]}" "$JQ_DECISION"'
.packages[]
| .name as $n
| if ($n|IN($excluded[])) then "Skipping \($n)..."
elif (($sub|not) and (.subscription!=null and .subscription!="basic" and .subscription!="")) then
"\($n) integration requires an Elastic license of \(.subscription) or greater... skipping"
elif (.installed_version==null or .installed_version=="") then
"\($n) is not installed... Adding to next update."
elif ((.latest_version|vnum) > (.installed_version|vnum)) then
(if ($autoup or ($n|IN($inuse[])|not))
then "\($n) is at version \(.installed_version) latest version is \(.latest_version)... Adding to next update."
else "skipping available upgrade for in use integration - \($n)." end)
else empty end
' "$INSTALLED_PACKAGE_LIST"
# (b) The bulk install list, built in a single pass.
jq "${JQ_ARGS[@]}" "$JQ_DECISION"'
{packages: [ .packages[] | select(needs($sub;$autoup;$excluded;$inuse)) | {name, version: .latest_version} ]}
' "$INSTALLED_PACKAGE_LIST" > "$BULK_INSTALL_PACKAGE_LIST"
if jq -e '.packages | length > 0' "$BULK_INSTALL_PACKAGE_LIST" >/dev/null; then
PENDING_UPDATE=true
fi
PENDING_UPDATE=true
{%- if not AUTO_UPGRADE_INTEGRATIONS %}
else
echo "skipping available upgrade for in use integration - $package_name."
fi
{%- endif %}
fi
fi
fi
{% else %}
if [[ "$installed_version" == "null" || -z "$installed_version" ]]; then
echo "$package_name is not installed... Adding to next update."
jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
PENDING_UPDATE=true
else
results=$(compare_versions "$latest_version" "$installed_version")
if [ $results == "greater" ]; then
{#- When auto_upgrade_integrations is false, skip upgrading in_use_integrations #}
{%- if not AUTO_UPGRADE_INTEGRATIONS %}
if ! [[ " ${in_use_integrations[@]} " =~ " $package_name " ]]; then
{%- endif %}
echo "$package_name is at version $installed_version latest version is $latest_version... Adding to next update."
jq --argjson package "$bulk_package" '.packages += [$package]' $BULK_INSTALL_PACKAGE_LIST > $BULK_INSTALL_PACKAGE_TMP && mv $BULK_INSTALL_PACKAGE_TMP $BULK_INSTALL_PACKAGE_LIST
PENDING_UPDATE=true
{%- if not AUTO_UPGRADE_INTEGRATIONS %}
else
echo "skipping available upgrade for in use integration - $package_name."
fi
{%- endif %}
fi
fi
{% endif %}
else
echo "Skipping $package_name..."
fi
done <<< "$(jq -c '.packages[]' "$INSTALLED_PACKAGE_LIST")"
if [ "$PENDING_UPDATE" = true ]; then
# Run chunked install of packages
@@ -11,8 +11,10 @@ ADDON_STATEFILE_SUCCESS=/opt/so/state/addon_estemplates.txt
ELASTICSEARCH_TEMPLATES_DIR="/opt/so/conf/elasticsearch/templates"
SO_TEMPLATES_DIR="${ELASTICSEARCH_TEMPLATES_DIR}/index"
ADDON_TEMPLATES_DIR="${ELASTICSEARCH_TEMPLATES_DIR}/addon-index"
FAILED_NAMES=()
FAILED_COUNT=0
SO_LOAD_FAILURES=0
ADDON_LOAD_FAILURES=0
SO_LOAD_FAILURES_NAMES=()
ADDON_LOAD_FAILURES_NAMES=()
IS_HEAVYNODE="false"
FORCE="false"
VERBOSE="false"
@@ -44,86 +46,20 @@ while [[ $# -gt 0 ]]; do
shift
done
# Max number of concurrent template PUT jobs. Override via env if needed.
MAX_TEMPLATE_JOBS=${MAX_TEMPLATE_JOBS:-10}
# Block until fewer than MAX_TEMPLATE_JOBS background jobs are running.
template_throttle() {
while (( $(jobs -rp | wc -l) >= MAX_TEMPLATE_JOBS )); do
wait -n
done
}
# Per-job failure markers and an output lock for serializing parallel job output.
# Each failed load drops one file (named after the template) into FAIL_DIR; the
# output of each job is flushed as a single block under flock so concurrent jobs
# never interleave their (chatty) retry output.
FAIL_DIR=$(mktemp -d)
OUTPUT_LOCK="${FAIL_DIR}/.output.lock"
: > "$OUTPUT_LOCK"
trap 'rm -rf "$FAIL_DIR"' EXIT
# Record a failure: $1 = the template name/path to report later. Slashes are
# encoded so the path becomes a safe single filename.
record_failure() {
local marker="${1//\//__}"
: > "${FAIL_DIR}/fail.${marker}"
}
# Populate FAILED_NAMES and FAILED_COUNT from the current phase's markers.
# Must run in the current shell (not a command substitution) so the array sticks.
collect_failures() {
FAILED_NAMES=()
FAILED_COUNT=0
local f name
shopt -s nullglob
for f in "${FAIL_DIR}"/fail.*; do
name="${f##*/fail.}"
name="${name//__//}"
FAILED_NAMES+=("$name")
FAILED_COUNT=$((FAILED_COUNT + 1))
done
shopt -u nullglob
}
# Clear markers and names between phases so SO and addon counts stay independent.
reset_failures() {
shopt -s nullglob
rm -f "${FAIL_DIR}"/fail.*
shopt -u nullglob
FAILED_NAMES=()
FAILED_COUNT=0
}
# Print a block of text atomically (under the shared output lock) so the output
# of concurrent background jobs is not interleaved.
locked_echo() {
{ flock 9; printf '%s\n' "$1"; } 9>>"$OUTPUT_LOCK"
}
# Loads one template file via PUT. Intended to be dispatched as a background job.
# $1 uri - e.g. _component_template/foo or _index_template/foo
# $2 file - path to the template JSON
# $3 report_name - name/path to record if this load fails
load_template() {
local uri="$1"
local file="$2"
local report_name="$3"
local out rc=0 block
# Capture everything (including retry's diagnostic chatter) into one block so
# concurrent jobs never interleave; the whole block is flushed under one flock.
block="Loading template file $file"$'\n'
if ! out=$(retry 3 3 "so-elasticsearch-query $uri -d@$file -XPUT" "{\"acknowledged\":true}" 2>&1); then
block+="$out"$'\n'
rc=1
echo "Loading template file $file"
if ! output=$(retry 3 3 "so-elasticsearch-query $uri -d@$file -XPUT" "{\"acknowledged\":true}"); then
echo "$output"
return 1
elif [[ "$VERBOSE" == "true" ]]; then
block+="$out"$'\n'
echo "$output"
fi
{ flock 9; printf '%s' "$block"; } 9>>"$OUTPUT_LOCK"
(( rc != 0 )) && record_failure "$report_name"
}
check_required_component_template_exists() {
@@ -174,9 +110,6 @@ load_component_templates() {
return
fi
# Dispatch loads as throttled background jobs. The barrier (wait) happens in
# the caller after all component groups have been dispatched, since index
# templates must not load until every component template is in place.
for component in "$pattern"/*.json; do
tmpl_name=$(basename "${component%.json}")
@@ -185,8 +118,10 @@ load_component_templates() {
tmpl_name="${tmpl_name%-mappings}-mappings"
fi
template_throttle
load_template "_component_template/${tmpl_name}" "$component" "$component" &
if ! load_template "_component_template/${tmpl_name}" "$component"; then
SO_LOAD_FAILURES=$((SO_LOAD_FAILURES + 1))
SO_LOAD_FAILURES_NAMES+=("$component")
fi
done
}
@@ -237,9 +172,6 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
load_component_templates "Elastic Agent" "elastic-agent"
load_component_templates "Security Onion" "so"
# Barrier: every component template PUT must complete before we snapshot the
# component template list and start loading index templates that depend on them.
wait
component_templates=$(so-elasticsearch-component-templates-list)
echo -e "Loading Security Onion index templates...\n"
for so_idx_tmpl in "${SO_TEMPLATES_DIR}"/*.json; do
@@ -249,7 +181,7 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
# TODO: Better way to load only heavynode specific templates
if ! check_heavynode_compatiable_index_template "$tmpl_name"; then
if [[ "$VERBOSE" == "true" ]]; then
locked_echo "Skipping over $so_idx_tmpl, template is not a heavynode specific index template."
echo "Skipping over $so_idx_tmpl, template is not a heavynode specific index template."
fi
continue
@@ -257,34 +189,32 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_e
fi
if check_required_component_template_exists "$so_idx_tmpl"; then
template_throttle
load_template "_index_template/$tmpl_name" "$so_idx_tmpl" "$so_idx_tmpl" &
if ! load_template "_index_template/$tmpl_name" "$so_idx_tmpl"; then
SO_LOAD_FAILURES=$((SO_LOAD_FAILURES + 1))
SO_LOAD_FAILURES_NAMES+=("$so_idx_tmpl")
fi
else
locked_echo "Skipping over $so_idx_tmpl due to missing required component template(s)."
record_failure "$so_idx_tmpl"
echo "Skipping over $so_idx_tmpl due to missing required component template(s)."
SO_LOAD_FAILURES=$((SO_LOAD_FAILURES + 1))
SO_LOAD_FAILURES_NAMES+=("$so_idx_tmpl")
continue
fi
done
# Barrier: all SO index template PUTs must finish before tallying failures.
wait
collect_failures
if [[ $FAILED_COUNT -eq 0 ]]; then
if [[ $SO_LOAD_FAILURES -eq 0 ]]; then
echo "All Security Onion core templates loaded successfully."
touch "$SO_STATEFILE_SUCCESS"
else
echo "Encountered $FAILED_COUNT failure(s) loading templates:"
for failed_template in "${FAILED_NAMES[@]}"; do
echo "Encountered $SO_LOAD_FAILURES failure(s) loading templates:"
for failed_template in "${SO_LOAD_FAILURES_NAMES[@]}"; do
echo " - $failed_template"
done
if [[ "$SHOULD_EXIT_ON_FAILURE" == "true" ]]; then
fail "Failed to load all Security Onion core templates successfully."
fi
fi
reset_failures
elif ! index_templates_exist "$SO_TEMPLATES_DIR"; then
echo "No Security Onion core index templates found in ${SO_TEMPLATES_DIR}, skipping."
elif [[ -f "$SO_STATEFILE_SUCCESS" ]]; then
@@ -303,27 +233,26 @@ if should_load_addon_templates; then
tmpl_name=$(basename "${addon_idx_tmpl%-template.json}")
if check_required_component_template_exists "$addon_idx_tmpl"; then
template_throttle
load_template "_index_template/${tmpl_name}" "$addon_idx_tmpl" "$addon_idx_tmpl" &
if ! load_template "_index_template/${tmpl_name}" "$addon_idx_tmpl"; then
ADDON_LOAD_FAILURES=$((ADDON_LOAD_FAILURES + 1))
ADDON_LOAD_FAILURES_NAMES+=("$addon_idx_tmpl")
fi
else
locked_echo "Skipping over $addon_idx_tmpl due to missing required component template(s)."
record_failure "$addon_idx_tmpl"
echo "Skipping over $addon_idx_tmpl due to missing required component template(s)."
ADDON_LOAD_FAILURES=$((ADDON_LOAD_FAILURES + 1))
ADDON_LOAD_FAILURES_NAMES+=("$addon_idx_tmpl")
continue
fi
done
# Barrier: all addon index template PUTs must finish before tallying failures.
wait
collect_failures
if [[ $FAILED_COUNT -eq 0 ]]; then
if [[ $ADDON_LOAD_FAILURES -eq 0 ]]; then
echo "All addon integration templates loaded successfully."
touch "$ADDON_STATEFILE_SUCCESS"
else
echo "Encountered $FAILED_COUNT failure(s) loading addon integration templates:"
for failed_template in "${FAILED_NAMES[@]}"; do
echo "Encountered $ADDON_LOAD_FAILURES failure(s) loading addon integration templates:"
for failed_template in "${ADDON_LOAD_FAILURES_NAMES[@]}"; do
echo " - $failed_template"
done
if [[ "$SHOULD_EXIT_ON_FAILURE" == "true" ]]; then
@@ -6,37 +6,6 @@
. /usr/sbin/so-common
MAX_JOBS=10
# Lock used to serialize block writes so concurrent jobs never interleave their output.
ILM_OUTPUT_LOCK=$(mktemp)
trap 'rm -f "$ILM_OUTPUT_LOCK"' EXIT
# Policies are loaded concurrently (up to MAX_JOBS at a time) for speed. Each policy's block is
# printed the moment its curl returns, so output appears in COMPLETION ORDER, not the order
# policies are defined in configuration.
echo "Loading ILM policies concurrently; output below appears in completion order, not configuration order."
echo
put_policy() {
local desc="$1" policyname="$2" data="$3" result
result=$(curl -K /opt/so/conf/elasticsearch/curl.config -s -k -L \
-X PUT "https://localhost:9200/_ilm/policy/${policyname}" \
-H 'Content-Type: application/json' -d"${data}")
# curl above ran in parallel; serialize just this block write so concurrent jobs never interleave.
{
flock 200
printf 'Setting up %s policy...\n%s\n\n' "${desc}" "${result}"
} 200>>"${ILM_OUTPUT_LOCK}"
}
# Block until fewer than MAX_JOBS background curls are running.
throttle() {
while (( $(jobs -rp | wc -l) >= MAX_JOBS )); do
wait -n
done
}
{%- from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS %}
{%- if GLOBALS.role != "so-heavynode" %}
{%- from 'elasticsearch/template.map.jinja' import ALL_ADDON_SETTINGS %}
@@ -45,26 +14,35 @@ throttle() {
{%- for index, settings in ES_INDEX_SETTINGS.items() %}
{%- if settings.policy is defined %}
{%- if index == 'so-logs-detections.alerts' %}
throttle
put_policy "so-logs-detections.alerts-so" "{{ index }}-so" '{ "policy": {{ settings.policy | tojson(true) }} }' &
echo
echo "Setting up so-logs-detections.alerts-so policy..."
curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-so" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
echo
{%- elif index == 'so-logs-soc' %}
throttle
put_policy "so-soc-logs" "so-soc-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
throttle
put_policy "{{ index }}-logs" "{{ index }}-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
echo
echo "Setting up so-soc-logs policy..."
curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/so-soc-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
echo
echo
echo "Setting up {{ index }}-logs policy..."
curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
echo
{%- else %}
throttle
put_policy "{{ index }}-logs" "{{ index }}-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
echo
echo "Setting up {{ index }}-logs policy..."
curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
echo
{%- endif %}
{%- endif %}
{%- endfor %}
echo
{%- if GLOBALS.role != "so-heavynode" %}
{%- for index, settings in ALL_ADDON_SETTINGS.items() %}
{%- if settings.policy is defined %}
throttle
put_policy "{{ index }}-logs" "{{ index }}-logs" '{ "policy": {{ settings.policy | tojson(true) }} }' &
echo
echo "Setting up {{ index }}-logs policy..."
curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }'
echo
{%- endif %}
{%- endfor %}
{%- endif %}
wait
+6 -5
View File
@@ -131,6 +131,8 @@ check_err() {
# Collect bash error context before passing off to check_err()
on_err() {
local exit_code=$?
# Ignore failures in blocks that explicitly disabled errexit with `set +e`.
[[ $- == *e* ]] || return $exit_code
# turn off xtrace to prevent added noise in debug log
set +x 2>/dev/null || true
@@ -384,11 +386,10 @@ highstate() {
masterlock() {
echo "Locking Salt Master"
mv -v $TOPFILE $BACKUPTOPFILE
# Render the real top file only for the host running soup; every other
# minion gets an empty top (no states) while the master is upgrading.
echo "{% if grains['id'] == '$MINIONID' %}" > $TOPFILE
cat $BACKUPTOPFILE >> $TOPFILE
echo "{% endif %}" >> $TOPFILE
echo "base:" > $TOPFILE
echo " $MINIONID:" >> $TOPFILE
echo " - ca" >> $TOPFILE
echo " - elasticsearch" >> $TOPFILE
}
masterunlock() {