diff --git a/.github/DISCUSSION_TEMPLATE/3-0.yml b/.github/DISCUSSION_TEMPLATE/3-0.yml index 3fb9e5b30..8f74145c4 100644 --- a/.github/DISCUSSION_TEMPLATE/3-0.yml +++ b/.github/DISCUSSION_TEMPLATE/3-0.yml @@ -11,6 +11,7 @@ body: - - 3.0.0 - 3.1.0 + - 3.2.0 - Other (please provide detail below) validations: required: true diff --git a/DOWNLOAD_AND_VERIFY_ISO.md b/DOWNLOAD_AND_VERIFY_ISO.md index 47937c1b9..bae49c4ac 100644 --- a/DOWNLOAD_AND_VERIFY_ISO.md +++ b/DOWNLOAD_AND_VERIFY_ISO.md @@ -1,17 +1,17 @@ -### 3.0.0-20260331 ISO image released on 2026/03/31 +### 3.1.0-20260528 ISO image released on 2026/05/28 ### Download and Verify -3.0.0-20260331 ISO image: -https://download.securityonion.net/file/securityonion/securityonion-3.0.0-20260331.iso +3.1.0-20260528 ISO image: +https://download.securityonion.net/file/securityonion/securityonion-3.1.0-20260528.iso -MD5: ECD318A1662A6FDE0EF213F5A9BD4B07 -SHA1: E55BE314440CCF3392DC0B06BC5E270B43176D9C -SHA256: 7FC47405E335CBE5C2B6C51FE7AC60248F35CBE504907B8B5A33822B23F8F4D5 +MD5: 9D6FF58DEEE24089D722C73169765B3E +SHA1: 2B8B816B6CEC3B7F96B3C5E040EBF502DD2C412F +SHA256: 62FAB57E247C843D6A04F0796D8162C732B65D82FC3E4A59D087135B9FD32912 Signature for ISO image: -https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.0.0-20260331.iso.sig +https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.1.0-20260528.iso.sig Signing key: https://raw.githubusercontent.com/Security-Onion-Solutions/securityonion/3/main/KEYS @@ -25,22 +25,22 @@ wget https://raw.githubusercontent.com/Security-Onion-Solutions/securityonion/3/ Download the signature file for the ISO: ``` -wget https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.0.0-20260331.iso.sig +wget https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.1.0-20260528.iso.sig ``` Download the ISO image: ``` -wget https://download.securityonion.net/file/securityonion/securityonion-3.0.0-20260331.iso +wget https://download.securityonion.net/file/securityonion/securityonion-3.1.0-20260528.iso ``` Verify the downloaded ISO image using the signature file: ``` -gpg --verify securityonion-3.0.0-20260331.iso.sig securityonion-3.0.0-20260331.iso +gpg --verify securityonion-3.1.0-20260528.iso.sig securityonion-3.1.0-20260528.iso ``` The output should show "Good signature" and the Primary key fingerprint should match what's shown below: ``` -gpg: Signature made Mon 30 Mar 2026 06:22:14 PM EDT using RSA key ID FE507013 +gpg: Signature made Wed 27 May 2026 03:03:59 PM EDT using RSA key ID FE507013 gpg: Good signature from "Security Onion Solutions, LLC " gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. diff --git a/HOTFIX b/HOTFIX index e69de29bb..8b1378917 100644 --- a/HOTFIX +++ b/HOTFIX @@ -0,0 +1 @@ + diff --git a/VERSION b/VERSION index fd2a01863..944880fa1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.1.0 +3.2.0 diff --git a/salt/common/tools/sbin/so-log-check b/salt/common/tools/sbin/so-log-check index 94fdd7229..65b1041fe 100755 --- a/salt/common/tools/sbin/so-log-check +++ b/salt/common/tools/sbin/so-log-check @@ -166,6 +166,7 @@ if [[ $EXCLUDE_FALSE_POSITIVE_ERRORS == 'Y' ]]; then EXCLUDED_ERRORS="$EXCLUDED_ERRORS|upgrading composable template" # false positive (elasticsearch composable template names contain 'error') EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Error while parsing document for index \[.ds-logs-kratos-so-.*object mapping for \[file\]" # false positive (mapping error occuring BEFORE kratos index has rolled over in 2.4.210) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|No such container" # false positive (telegraf trying to run stats on an old container) + EXCLUDED_ERRORS="$EXCLUDED_ERRORS|passwords do not match" # false positive (automated hydra test) fi if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then diff --git a/salt/elasticfleet/enabled.sls b/salt/elasticfleet/enabled.sls index fb0420109..2bebf3339 100644 --- a/salt/elasticfleet/enabled.sls +++ b/salt/elasticfleet/enabled.sls @@ -26,7 +26,9 @@ include: wait_for_elasticsearch_elasticfleet: cmd.run: - name: so-elasticsearch-wait +{% endif %} +{% if GLOBALS.role == "so-fleet" %} # Sync Elastic Agent artifacts to Fleet Node elasticagent_syncartifacts: file.recurse: diff --git a/salt/manager/tools/sbin/soup b/salt/manager/tools/sbin/soup index bd3048019..135c51276 100755 --- a/salt/manager/tools/sbin/soup +++ b/salt/manager/tools/sbin/soup @@ -370,8 +370,9 @@ preupgrade_changes() { # This function is to add any new pillar items if needed. echo "Checking to see if changes are needed." - [[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0 + [[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0 [[ "$INSTALLEDVERSION" == "3.0.0" ]] && up_to_3.1.0 + [[ "$INSTALLEDVERSION" == "3.1.0" ]] && up_to_3.2.0 true } @@ -381,6 +382,7 @@ postupgrade_changes() { [[ "$POSTVERSION" =~ ^2\.4\.21[0-9]+$ ]] && post_to_3.0.0 [[ "$POSTVERSION" == "3.0.0" ]] && post_to_3.1.0 + [[ "$POSTVERSION" == "3.1.0" ]] && post_to_3.2.0 true } @@ -533,6 +535,23 @@ elasticfleet_set_agent_logging_level_warn() { done <<< "$policies_to_update" } +update_logstash_pipeline_name() { + local original_pipeline_name="$1" + local new_pipeline_name="$2" + + echo "Checking for conflicting logstash defined_pipelines pillar value." + local LOGSTASH_FILE=/opt/so/saltstack/local/pillar/logstash/soc_logstash.sls + local MINIONDIR=/opt/so/saltstack/local/pillar/minions + for pillar_file in "$LOGSTASH_FILE" "$MINIONDIR"/*.sls; do + [[ -f "$pillar_file" ]] || continue + if grep -q "$original_pipeline_name$" "$pillar_file"; then + echo "Found conflicting defined_pipeline pillar value in $pillar_file. Updating to use the new logstash pipeline name." + sed -i "s#$original_pipeline_name\$#$new_pipeline_name#g" "$pillar_file" + chown socore:socore "$pillar_file" + fi + done +} + check_transform_health_and_reauthorize() { . /usr/sbin/so-elastic-fleet-common @@ -676,6 +695,10 @@ rename_strelka_scan_lnk() { rm -f "$TMP_VALUE_FILE" } +fix_logstash_0013_lumberjack_pipeline_name() { + update_logstash_pipeline_name "so/0013_input_lumberjack_fleet.conf" "so/0013_input_lumberjack_fleet.conf.jinja" +} + up_to_3.1.0() { ensure_postgres_local_pillar ensure_postgres_secret @@ -684,6 +707,7 @@ up_to_3.1.0() { # Clear existing component template state file. rm -f /opt/so/state/esfleet_component_templates.json rename_strelka_scan_lnk + fix_logstash_0013_lumberjack_pipeline_name INSTALLEDVERSION=3.1.0 } @@ -720,6 +744,42 @@ post_to_3.1.0() { ### 3.1.0 End ### +### 3.2.0 Scripts ### + +bootstrap_so_soc_database() { + # init-db.sh is mounted into so-postgres at /docker-entrypoint-initdb.d/init-db.sh + # and runs automatically only on a fresh data directory. Hosts upgrading from + # 3.1.0 already have /nsm/postgres populated, so the so_soc bootstrap block + # added in 3.2 never fires. Re-run the script explicitly; it's idempotent. + echo "Bootstrapping so_soc database via init-db.sh." + # The postgres image has no USER directive, so `docker exec` defaults to + # root, and the container env intentionally omits POSTGRES_USER (the upstream + # entrypoint defaults it transiently during first-init only). Recreate both + # so psql inside init-db.sh resolves the connect user correctly. + local exec_cmd="docker exec -u postgres -e POSTGRES_USER=postgres so-postgres bash /docker-entrypoint-initdb.d/init-db.sh" + if ! /usr/sbin/so-postgres-wait; then + FINAL_MESSAGE_QUEUE+=("WARNING: so-postgres was not ready during the 3.2.0 upgrade; the so_soc database may not have been bootstrapped. Re-run manually: $exec_cmd") + return 0 + fi + if ! $exec_cmd; then + FINAL_MESSAGE_QUEUE+=("WARNING: init-db.sh failed inside so-postgres during the 3.2.0 upgrade; the so_soc database may not have been bootstrapped. Re-run manually: $exec_cmd") + return 0 + fi + echo "so_soc bootstrap complete." +} + +up_to_3.2.0() { + INSTALLEDVERSION=3.2.0 +} + +post_to_3.2.0() { + bootstrap_so_soc_database + + POSTVERSION=3.2.0 +} + +### 3.2.0 End ### + repo_sync() { echo "Sync the local repo." @@ -971,6 +1031,9 @@ verify_es_version_compatibility() { local is_active_intermediate_upgrade=1 # supported upgrade paths for SO-ES versions declare -A es_upgrade_map=( + ["8.18.4"]="8.18.6 8.18.8 9.0.8" + ["8.18.6"]="8.18.8 9.0.8" + ["8.18.8"]="9.0.8" ["9.0.8"]="9.3.3" ) @@ -994,6 +1057,171 @@ verify_es_version_compatibility() { exit 160 fi + compatible_es_versions="$target_es_version" + for current_version in "${!es_upgrade_map[@]}"; do + # shellcheck disable=SC2076 + if [[ " ${es_upgrade_map[$current_version]} " =~ " $target_es_version " ]]; then + compatible_es_versions+=" $current_version" + fi + done + + # Check if the given ES version can directly upgrade to the target ES version. Used to assist with catching lagging nodes during the upgrade process + es_version_can_upgrade_to_target() { + local current_version="$1" + # shellcheck disable=SC2076 + if [[ -n "$current_version" && " $compatible_es_versions " =~ " $current_version " ]]; then + return 0 + fi + + return 1 + } + + # Gather Elasticsearch cluster version info and verify that each node in the cluster is running a version compatible with the target ES version. + verify_searchnodes_es_target_compatibility() { + local retries=20 + local retry_count=0 + local delay=180 + local expected_es_nodes searchnode_minions attempt + local searchnode_discovery_success=false + SEARCHNODE_ES_VERSIONS="" + + for attempt in {1..3}; do + if searchnode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("searchnode"))'); then + searchnode_discovery_success=true + break + fi + + echo "Failed to retrieve grid searchnodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3." + sleep 30 + done + + if [[ "$searchnode_discovery_success" != "true" ]]; then + echo "Failed to retrieve grid searchnodes via salt-key." + return 1 + fi + + # Always add node running soup to expected es nodes + expected_es_nodes="${MINIONID%_*}" + while IFS= read -r searchnode_minion; do + [[ -z "$searchnode_minion" ]] && continue + expected_es_nodes+=$'\n'"${searchnode_minion%_searchnode}" + done <<< "$searchnode_minions" + + while [[ $retry_count -lt $retries ]]; do + SEARCHNODE_ES_VERSIONS=$(so-elasticsearch-query _nodes/_all/version --retry 5 --retry-delay 10 --fail 2>&1) + local exit_status=$? + + if [[ $exit_status -ne 0 ]]; then + echo "Failed to retrieve Elasticsearch versions from searchnodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + continue + fi + + local all_searchnodes_compatible=true + while IFS=$'\t' read -r node current_version; do + [[ -z "$node" ]] && continue + if ! es_version_can_upgrade_to_target "$current_version"; then + echo "Searchnode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version." + all_searchnodes_compatible=false + fi + done < <(echo "$SEARCHNODE_ES_VERSIONS" | jq -r '.nodes | to_entries[] | [.value.name, .value.version] | @tsv') + + while IFS= read -r expected_es_node; do + [[ -z "$expected_es_node" ]] && continue + if ! echo "$SEARCHNODE_ES_VERSIONS" | jq -e --arg node "$expected_es_node" '.nodes | to_entries | any(.value.name == $node)' > /dev/null; then + echo "Searchnode $expected_es_node did not report an Elasticsearch version. It may be offline or still upgrading." + all_searchnodes_compatible=false + fi + done <<< "$expected_es_nodes" + + if [[ "$all_searchnodes_compatible" == true ]]; then + echo "All Searchnodes are upgradable to Elasticsearch $target_es_version." + return 0 + fi + + echo "One or more Searchnodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + done + + return 1 + } + + # Gather heavynode version info and verify that each node is running a version compatible with the target ES version. + verify_heavynodes_es_target_compatibility() { + local heavynode_minions attempt + local retries=20 + local retry_count=0 + local delay=180 + local heavynode_discovery_success=false + HEAVYNODE_ES_VERSIONS="" + + for attempt in {1..3}; do + if heavynode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("heavynode"))'); then + heavynode_discovery_success=true + break + fi + + echo "Failed to retrieve grid heavynodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3." + sleep 30 + done + + if [[ "$heavynode_discovery_success" != "true" ]]; then + echo "Failed to retrieve grid heavynodes via salt-key." + return 1 + fi + + if [[ -z "$heavynode_minions" ]]; then + echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check." + return 0 + fi + + while [[ $retry_count -lt $retries ]]; do + HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'set -o pipefail; so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -er ".version.number"' shell=/bin/bash --out=json 2> /dev/null) + local exit_status=$? + + if [[ $exit_status -ne 0 ]]; then + echo "Failed to retrieve Elasticsearch version from one or more heavynodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + continue + fi + + local all_heavynodes_compatible=true + while IFS=$'\t' read -r node current_version; do + [[ -z "$node" ]] && continue + if ! es_version_can_upgrade_to_target "$current_version"; then + echo "Heavynode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version." + all_heavynodes_compatible=false + fi + done < <(echo "$HEAVYNODE_ES_VERSIONS" | jq -r 'to_entries[] | [.key, .value] | @tsv') + + while IFS= read -r heavynode_minion; do + [[ -z "$heavynode_minion" ]] && continue + if ! echo "$HEAVYNODE_ES_VERSIONS" | jq -se --arg minion "$heavynode_minion" 'add | has($minion)' > /dev/null; then + echo "Heavynode $heavynode_minion did not report an Elasticsearch version. It may be offline or still upgrading." + all_heavynodes_compatible=false + fi + done <<< "$heavynode_minions" + + if [[ "$all_heavynodes_compatible" == true ]]; then + echo -e "\nAll heavynodes can upgrade to Elasticsearch $target_es_version." + return 0 + fi + + echo "One or more heavynodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries." + ((retry_count++)) + sleep $delay + done + + return 1 + } + + if [[ ! -f "$es_verification_script" ]]; then + create_intermediate_upgrade_verification_script "$es_verification_script" + fi + for statefile in "${es_required_version_statefile_base}"-*; do [[ -f $statefile ]] || continue @@ -1012,10 +1240,6 @@ verify_es_version_compatibility() { continue fi - if [[ ! -f "$es_verification_script" ]]; then - create_intermediate_upgrade_verification_script "$es_verification_script" - fi - echo -e "\n##############################################################################################################################\n" echo "A previously required intermediate Elasticsearch upgrade was detected. Verifying that all Searchnodes/Heavynodes have successfully upgraded Elasticsearch to $es_required_version_statefile_value before proceeding with soup to avoid potential data loss! This command can take up to an hour to complete." if ! timeout --foreground 4000 bash "$es_verification_script" "$es_required_version_statefile_value" "$statefile"; then @@ -1037,6 +1261,26 @@ verify_es_version_compatibility() { # shellcheck disable=SC2076 # Do not want a regex here eg usage " 8.18.8 9.0.8 " =~ " 9.0.8 " if [[ " ${es_upgrade_map[$es_version]} " =~ " $target_es_version " || "$es_version" == "$target_es_version" ]]; then + if ! verify_searchnodes_es_target_compatibility || ! verify_heavynodes_es_target_compatibility; then + echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" + + echo "One or more Searchnode(s)/Heavynode(s) cannot upgrade directly to Elasticsearch $target_es_version. This can happen with soups that include Elasticsearch upgrades being run in quick succession. Typically, this will resolve itself as the grid synchronizes. Please allow time for all Searchnodes/Heavynodes to have upgraded Elasticsearch to a compatible version with $target_es_version before running soup again to avoid potential data loss!" + + if [[ -n "$HEAVYNODE_ES_VERSIONS" ]]; then + echo "Current heavynode Elasticsearch versions:" + echo "$HEAVYNODE_ES_VERSIONS" | jq '.' + fi + + if [[ -n "$SEARCHNODE_ES_VERSIONS" ]]; then + echo "Current searchnode Elasticsearch versions:" + echo "$SEARCHNODE_ES_VERSIONS" | jq '.nodes | to_entries | map({(.value.name): .value.version}) | sort | add' + fi + + echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" + + exit 161 + fi + # supported upgrade return 0 else @@ -1322,7 +1566,13 @@ EOF # Keeping this block in case we need to do a hotfix that requires salt update apply_hotfix() { - echo "No actions required. ($INSTALLEDVERSION/$HOTFIXVERSION)" + if [[ "$INSTALLEDVERSION" == "3.1.0" ]] ; then + # Do not remove this fix_logstash_0013_lumberjack_pipeline_name in future hotfixes without first validating older + # installs referencing "so/0013_input_lumberjack_fleet.conf" via pillar are upgradable + fix_logstash_0013_lumberjack_pipeline_name + else + echo "No actions required. ($INSTALLEDVERSION/$HOTFIXVERSION)" + fi } failed_soup_restore_items() { @@ -1394,7 +1644,7 @@ main() { echo "Verifying we have the latest soup script." verify_latest_update_script - echo "Verifying Elasticsearch version compatibility before upgrading." + echo "Verifying Elasticsearch version compatibility across the grid before upgrading." verify_es_version_compatibility echo "Let's see if we need to update Security Onion." diff --git a/salt/postgres/telegraf_users.sls b/salt/postgres/telegraf_users.sls index 28d9d6247..5e3566a95 100644 --- a/salt/postgres/telegraf_users.sls +++ b/salt/postgres/telegraf_users.sls @@ -18,26 +18,12 @@ include: {% set TG_OUT = TELEGRAFMERGED.output | upper %} {% if TG_OUT in ['POSTGRES', 'BOTH'] %} -# docker_container.running returns as soon as the container starts, but on -# first-init docker-entrypoint.sh starts a temporary postgres with -# `listen_addresses=''` to run /docker-entrypoint-initdb.d scripts, then -# shuts it down before exec'ing the real CMD. A default pg_isready check -# (Unix socket) passes during that ephemeral phase and races the shutdown -# with "the database system is shutting down". Checking TCP readiness on -# 127.0.0.1 only succeeds after the final postgres binds the port. postgres_wait_ready: cmd.run: - - name: | - for i in $(seq 1 60); do - if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then - exit 0 - fi - sleep 2 - done - echo "so-postgres did not accept TCP connections within 120s" >&2 - exit 1 + - name: /usr/sbin/so-postgres-wait - require: - docker_container: so-postgres + - file: postgres_sbin # Ensure the shared Telegraf database exists. init-db.sh only runs on a # fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume diff --git a/salt/postgres/tools/sbin/so-postgres-backup b/salt/postgres/tools/sbin/so-postgres-backup index 9db522336..08a73e3a4 100644 --- a/salt/postgres/tools/sbin/so-postgres-backup +++ b/salt/postgres/tools/sbin/so-postgres-backup @@ -7,15 +7,29 @@ . /usr/sbin/so-common +# Without pipefail, a pipeline's exit status is gzip's. A failed pg_dumpall would +# otherwise be masked by a successful gzip, silently producing a valid .gz that +# holds a truncated dump. +set -o pipefail + # Backups contain role password hashes and full chat data; keep them 0600. umask 0077 TODAY=$(date '+%Y_%m_%d') BACKUPDIR=/nsm/backup BACKUPFILE="$BACKUPDIR/so-postgres-backup-$TODAY.sql.gz" +TMPFILE="$BACKUPFILE.tmp" MAXBACKUPS=7 +LOGFILE=/opt/so/log/postgres/backup.log -mkdir -p $BACKUPDIR +log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') $*" >> "$LOGFILE" +} + +mkdir -p "$BACKUPDIR" + +# Remove any temp files left behind by a previously crashed run +rm -f "$BACKUPDIR"/so-postgres-backup-*.sql.gz.tmp # Skip if already backed up today if [ -f "$BACKUPFILE" ]; then @@ -27,13 +41,33 @@ if ! docker ps --format '{{.Names}}' | grep -q '^so-postgres$'; then exit 0 fi -# Dump all databases and roles, compress -docker exec so-postgres pg_dumpall -U postgres | gzip > "$BACKUPFILE" +# Always clean up the temp file on exit; the success path clears this trap +# after the atomic rename so the finished backup is not deleted. +trap 'rm -f "$TMPFILE"' EXIT -# Retention cleanup -NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l) +# Dump all databases and roles, compress. Write to a temp file so the final +# filename only ever appears for a complete, verified backup. +if ! docker exec so-postgres pg_dumpall -U postgres | gzip > "$TMPFILE"; then + log "ERROR: pg_dumpall/gzip failed; backup aborted" + exit 1 +fi + +# Verify the compressed stream is intact before publishing it +if ! gzip -t "$TMPFILE"; then + log "ERROR: backup failed gzip integrity check; backup aborted" + exit 1 +fi + +# Atomically publish the verified backup +mv "$TMPFILE" "$BACKUPFILE" +trap - EXIT +log "OK: wrote $BACKUPFILE" + +# Retention cleanup (only reached after a successful backup). The glob is +# restricted to finished backups so an in-progress .tmp can never be counted. +NUMBACKUPS=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" | wc -l) while [ "$NUMBACKUPS" -gt "$MAXBACKUPS" ]; do - OLDEST=$(find $BACKUPDIR -type f -name "so-postgres-backup*" -printf '%T+ %p\n' | sort | head -n 1 | awk -F" " '{print $2}') + OLDEST=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" -printf '%T+ %p\n' | sort | head -n 1 | awk -F" " '{print $2}') rm -f "$OLDEST" - NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l) + NUMBACKUPS=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" | wc -l) done diff --git a/salt/postgres/tools/sbin/so-postgres-wait b/salt/postgres/tools/sbin/so-postgres-wait new file mode 100644 index 000000000..7c4c8ce92 --- /dev/null +++ b/salt/postgres/tools/sbin/so-postgres-wait @@ -0,0 +1,32 @@ +#!/bin/bash + +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +# Wait for the so-postgres container to accept TCP connections. +# +# docker_container.running returns as soon as the container starts, but on +# first-init docker-entrypoint.sh starts a temporary postgres with +# `listen_addresses=''` to run /docker-entrypoint-initdb.d scripts, then +# shuts it down before exec'ing the real CMD. A default pg_isready check +# (Unix socket) passes during that ephemeral phase and races the shutdown +# with "the database system is shutting down". Checking TCP readiness on +# 127.0.0.1 only succeeds after the final postgres binds the port. +# +# Usage: so-postgres-wait [iterations] [sleep_seconds] +# Default: 60 iterations, 2s sleep (~120s total). + +ITERATIONS=${1:-60} +SLEEP_SECONDS=${2:-2} + +for i in $(seq 1 "$ITERATIONS"); do + if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then + exit 0 + fi + sleep "$SLEEP_SECONDS" +done + +echo "so-postgres did not accept TCP connections within $((ITERATIONS * SLEEP_SECONDS))s" >&2 +exit 1 diff --git a/salt/soc/defaults.yaml b/salt/soc/defaults.yaml index cc80758fc..62b451bec 100644 --- a/salt/soc/defaults.yaml +++ b/salt/soc/defaults.yaml @@ -1519,6 +1519,12 @@ soc: serviceAccountJSON: "" serviceAccountLocation: "" healthTimeoutSeconds: 5 + onionconfig: + saltstackDir: /opt/so/saltstack + bypassEnabled: false + postgres: + host: + password: salt: queueDir: /opt/sensoroni/queue timeoutMs: 45000 diff --git a/salt/soc/merged.map.jinja b/salt/soc/merged.map.jinja index 349937983..b34efb11d 100644 --- a/salt/soc/merged.map.jinja +++ b/salt/soc/merged.map.jinja @@ -16,6 +16,13 @@ {% do SOCMERGED.config.server.update({'additionalCA': MANAGERMERGED.additionalCA}) %} {% do SOCMERGED.config.server.update({'insecureSkipVerify': MANAGERMERGED.insecureSkipVerify}) %} +{% if not SOCMERGED.config.server.modules.postgres.host %} +{% do SOCMERGED.config.server.modules.postgres.update({'host': GLOBALS.manager}) %} +{% endif %} +{% if not SOCMERGED.config.server.modules.postgres.password %} +{% do SOCMERGED.config.server.modules.postgres.update({'password': salt['pillar.get']('secrets:postgres_pass', '')}) %} +{% endif %} + {# if SOCMERGED.config.server.modules.cases == httpcase details come from the soc pillar #} {% if SOCMERGED.config.server.modules.cases != 'soc' %} {% do SOCMERGED.config.server.modules.elastic.update({'casesEnabled': false}) %} diff --git a/salt/soc/soc_soc.yaml b/salt/soc/soc_soc.yaml index 647bdd778..3cb244eed 100644 --- a/salt/soc/soc_soc.yaml +++ b/salt/soc/soc_soc.yaml @@ -453,6 +453,26 @@ soc: description: Duration (in milliseconds) that must elapse after a grid node fails to check-in before the node will be marked offline (fault). global: True advanced: True + onionconfig: + saltstackDir: + description: Root directory containing the SaltStack tree that SOC reads and writes configuration from. Should not be changed under normal circumstances. + global: True + advanced: True + bypassEnabled: + description: When enabled, errors encountered while reading the SaltStack pillar tree (missing files, unreadable directories, etc.) are logged but do not prevent SOC from starting or serving settings. Intended for advanced troubleshooting and recovery scenarios when the pillar tree is partially unreadable. + global: True + advanced: True + forcedType: bool + postgres: + host: + description: Hostname or IP address of the PostgreSQL server used by SOC. Defaults to the manager hostname. + global: True + advanced: True + password: + description: Password used by SOC to authenticate to the PostgreSQL server. Defaults to the postgres superuser password seeded in the secrets pillar. + global: True + sensitive: True + advanced: True salt: longRelayTimeoutMs: description: Duration (in milliseconds) to wait for a response from the Salt API when executing tasks known for being long running before giving up and showing an error on the SOC UI. diff --git a/sigs/securityonion-3.1.0-20260521.iso.sig b/sigs/securityonion-3.1.0-20260521.iso.sig new file mode 100644 index 000000000..af7564315 Binary files /dev/null and b/sigs/securityonion-3.1.0-20260521.iso.sig differ diff --git a/sigs/securityonion-3.1.0-20260528.iso.sig b/sigs/securityonion-3.1.0-20260528.iso.sig new file mode 100644 index 000000000..e4bead44d Binary files /dev/null and b/sigs/securityonion-3.1.0-20260528.iso.sig differ