Merge remote-tracking branch 'origin/3/dev' into nostartupstates

This commit is contained in:
Josh Patterson
2026-06-03 15:44:41 -04:00
22 changed files with 535 additions and 133 deletions
+1
View File
@@ -11,6 +11,7 @@ body:
-
- 3.0.0
- 3.1.0
- 3.2.0
- Other (please provide detail below)
validations:
required: true
+11 -11
View File
@@ -1,17 +1,17 @@
### 3.0.0-20260331 ISO image released on 2026/03/31
### 3.1.0-20260528 ISO image released on 2026/05/28
### Download and Verify
3.0.0-20260331 ISO image:
https://download.securityonion.net/file/securityonion/securityonion-3.0.0-20260331.iso
3.1.0-20260528 ISO image:
https://download.securityonion.net/file/securityonion/securityonion-3.1.0-20260528.iso
MD5: ECD318A1662A6FDE0EF213F5A9BD4B07
SHA1: E55BE314440CCF3392DC0B06BC5E270B43176D9C
SHA256: 7FC47405E335CBE5C2B6C51FE7AC60248F35CBE504907B8B5A33822B23F8F4D5
MD5: 9D6FF58DEEE24089D722C73169765B3E
SHA1: 2B8B816B6CEC3B7F96B3C5E040EBF502DD2C412F
SHA256: 62FAB57E247C843D6A04F0796D8162C732B65D82FC3E4A59D087135B9FD32912
Signature for ISO image:
https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.0.0-20260331.iso.sig
https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.1.0-20260528.iso.sig
Signing key:
https://raw.githubusercontent.com/Security-Onion-Solutions/securityonion/3/main/KEYS
@@ -25,22 +25,22 @@ wget https://raw.githubusercontent.com/Security-Onion-Solutions/securityonion/3/
Download the signature file for the ISO:
```
wget https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.0.0-20260331.iso.sig
wget https://github.com/Security-Onion-Solutions/securityonion/raw/3/main/sigs/securityonion-3.1.0-20260528.iso.sig
```
Download the ISO image:
```
wget https://download.securityonion.net/file/securityonion/securityonion-3.0.0-20260331.iso
wget https://download.securityonion.net/file/securityonion/securityonion-3.1.0-20260528.iso
```
Verify the downloaded ISO image using the signature file:
```
gpg --verify securityonion-3.0.0-20260331.iso.sig securityonion-3.0.0-20260331.iso
gpg --verify securityonion-3.1.0-20260528.iso.sig securityonion-3.1.0-20260528.iso
```
The output should show "Good signature" and the Primary key fingerprint should match what's shown below:
```
gpg: Signature made Mon 30 Mar 2026 06:22:14 PM EDT using RSA key ID FE507013
gpg: Signature made Wed 27 May 2026 03:03:59 PM EDT using RSA key ID FE507013
gpg: Good signature from "Security Onion Solutions, LLC <info@securityonionsolutions.com>"
gpg: WARNING: This key is not certified with a trusted signature!
gpg: There is no indication that the signature belongs to the owner.
+1
View File
@@ -0,0 +1 @@
+1 -1
View File
@@ -1 +1 @@
3.1.0
3.2.0
@@ -25,9 +25,11 @@ if [ ! -f $BACKUPFILE ]; then
# Create empty backup file
tar -cf $BACKUPFILE -T /dev/null
# Loop through all paths defined in global.sls, and append them to backup file
# Loop through all paths defined in global.sls, and append them to backup file if they exist
{%- for LOCATION in BACKUPLOCATIONS %}
tar -rf $BACKUPFILE "${EXCLUSIONS[@]}" {{ LOCATION }}
if [[ -d {{ LOCATION }} || -f {{ LOCATION }} ]]; then
tar -rf $BACKUPFILE "${EXCLUSIONS[@]}" {{ LOCATION }}
fi
{%- endfor %}
fi
+2
View File
@@ -165,6 +165,8 @@ if [[ $EXCLUDE_FALSE_POSITIVE_ERRORS == 'Y' ]]; then
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|upgrading component template" # false positive (elasticsearch index or template names contain 'error')
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|upgrading composable template" # false positive (elasticsearch composable template names contain 'error')
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Error while parsing document for index \[.ds-logs-kratos-so-.*object mapping for \[file\]" # false positive (mapping error occuring BEFORE kratos index has rolled over in 2.4.210)
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|No such container" # false positive (telegraf trying to run stats on an old container)
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|passwords do not match" # false positive (automated hydra test)
fi
if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then
+2
View File
@@ -26,7 +26,9 @@ include:
wait_for_elasticsearch_elasticfleet:
cmd.run:
- name: so-elasticsearch-wait
{% endif %}
{% if GLOBALS.role == "so-fleet" %}
# Sync Elastic Agent artifacts to Fleet Node
elasticagent_syncartifacts:
file.recurse:
+1 -1
View File
@@ -103,7 +103,7 @@ kratos:
config:
session:
lifespan:
description: Defines the length of a login session.
description: Defines the length of a login session before it will timeout, and require a new login.
global: True
helpLink: kratos
whoami:
+263 -19
View File
@@ -188,13 +188,6 @@ airgap_update_dockers() {
fi
}
# Snapshot the default and local Salt state/pillar trees into timestamped
# tarballs under /nsm/backup.
# Globals:  INSTALLEDVERSION (read) - used as the archive name prefix.
# Returns:  exit status of the last tar invocation.
backup_old_states_pillars() {
  local tree
  for tree in default local; do
    # Expand INSTALLEDVERSION directly (no `$(echo ...)` subshell) and quote the
    # whole path so the version string can never be word-split or globbed into
    # extra tar arguments. The timestamp is taken per archive.
    tar czf "/nsm/backup/${INSTALLEDVERSION}_$(date +%Y%m%d-%H%M%S)_soup_${tree}_states_pillars.tar.gz" "/opt/so/saltstack/${tree}/"
  done
}
update_registry() {
docker stop so-dockerregistry
docker rm so-dockerregistry
@@ -370,8 +363,9 @@ preupgrade_changes() {
# This function is to add any new pillar items if needed.
echo "Checking to see if changes are needed."
[[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0
[[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0
[[ "$INSTALLEDVERSION" == "3.0.0" ]] && up_to_3.1.0
[[ "$INSTALLEDVERSION" == "3.1.0" ]] && up_to_3.2.0
true
}
@@ -381,6 +375,7 @@ postupgrade_changes() {
[[ "$POSTVERSION" =~ ^2\.4\.21[0-9]+$ ]] && post_to_3.0.0
[[ "$POSTVERSION" == "3.0.0" ]] && post_to_3.1.0
[[ "$POSTVERSION" == "3.1.0" ]] && post_to_3.2.0
true
}
@@ -533,6 +528,23 @@ elasticfleet_set_agent_logging_level_warn() {
done <<< "$policies_to_update"
}
# Rewrite a logstash pipeline name in the local pillar overrides.
# Scans the SOC logstash pillar and every minion pillar under
# /opt/so/saltstack/local/pillar; any file containing a line that ends with the
# original name has that trailing occurrence replaced and is chowned to socore.
# Arguments: $1 - pipeline name to look for (matched at end of line)
#            $2 - pipeline name to write in its place
# Outputs:   progress messages on stdout
update_logstash_pipeline_name() {
  local original_pipeline_name="$1"
  local new_pipeline_name="$2"
  local logstash_pillar=/opt/so/saltstack/local/pillar/logstash/soc_logstash.sls
  local minion_pillar_dir=/opt/so/saltstack/local/pillar/minions
  # Keep the loop variable function-scoped so it cannot clobber a caller's global.
  local pillar_file

  echo "Checking for conflicting logstash defined_pipelines pillar value."

  for pillar_file in "$logstash_pillar" "$minion_pillar_dir"/*.sls; do
    # An unmatched glob stays literal, and the SOC pillar may not exist; skip both.
    [[ -f "$pillar_file" ]] || continue
    # Anchored at end of line so an already-renamed entry is not matched again.
    grep -q "$original_pipeline_name$" "$pillar_file" || continue

    echo "Found conflicting defined_pipeline pillar value in $pillar_file. Updating to use the new logstash pipeline name."
    sed -i "s#$original_pipeline_name\$#$new_pipeline_name#g" "$pillar_file"
    chown socore:socore "$pillar_file"
  done
}
check_transform_health_and_reauthorize() {
. /usr/sbin/so-elastic-fleet-common
@@ -676,6 +688,10 @@ rename_strelka_scan_lnk() {
rm -f "$TMP_VALUE_FILE"
}
# Point local pillar overrides that still reference the 0013 lumberjack fleet
# input pipeline at its .jinja name.
fix_logstash_0013_lumberjack_pipeline_name() {
  local old_name="so/0013_input_lumberjack_fleet.conf"
  local new_name="so/0013_input_lumberjack_fleet.conf.jinja"

  update_logstash_pipeline_name "$old_name" "$new_name"
}
up_to_3.1.0() {
ensure_postgres_local_pillar
ensure_postgres_secret
@@ -684,6 +700,7 @@ up_to_3.1.0() {
# Clear existing component template state file.
rm -f /opt/so/state/esfleet_component_templates.json
rename_strelka_scan_lnk
fix_logstash_0013_lumberjack_pipeline_name
INSTALLEDVERSION=3.1.0
}
@@ -720,6 +737,48 @@ post_to_3.1.0() {
### 3.1.0 End ###
### 3.2.0 Scripts ###
# Re-run init-db.sh inside so-postgres so the so_soc database exists on upgraded hosts.
#
# init-db.sh is mounted at /docker-entrypoint-initdb.d/init-db.sh and only runs
# automatically on a fresh data directory. A host coming from 3.1.0 already has
# /nsm/postgres populated, so the so_soc bootstrap added in 3.2 would never fire
# by itself. The script is idempotent, so running it again is safe.
#
# Globals:  FINAL_MESSAGE_QUEUE (appended to when the bootstrap cannot be done)
# Returns:  always 0; problems are reported through FINAL_MESSAGE_QUEUE so the
#           upgrade carries on.
bootstrap_so_soc_database() {
  # The postgres image has no USER directive, so `docker exec` would default to
  # root, and the container env intentionally omits POSTGRES_USER (the upstream
  # entrypoint only defaults it transiently during first-init). Supply both so
  # psql inside init-db.sh resolves the connect user correctly.
  local exec_cmd="docker exec -u postgres -e POSTGRES_USER=postgres so-postgres bash /docker-entrypoint-initdb.d/init-db.sh"

  echo "Bootstrapping so_soc database via init-db.sh."

  if /usr/sbin/so-postgres-wait; then
    # Deliberately unquoted: the string is word-split into the command and its arguments.
    if $exec_cmd; then
      echo "so_soc bootstrap complete."
    else
      FINAL_MESSAGE_QUEUE+=("WARNING: init-db.sh failed inside so-postgres during the 3.2.0 upgrade; the so_soc database may not have been bootstrapped. Re-run manually: $exec_cmd")
    fi
  else
    FINAL_MESSAGE_QUEUE+=("WARNING: so-postgres was not ready during the 3.2.0 upgrade; the so_soc database may not have been bootstrapped. Re-run manually: $exec_cmd")
  fi

  return 0
}
# Pre-upgrade steps for moving an installation to 3.2.0.
# Globals: INSTALLEDVERSION (written) - set to 3.2.0 once the steps have run.
up_to_3.2.0() {
  # Rename any local pillar references to the 0013 lumberjack pipeline.
  fix_logstash_0013_lumberjack_pipeline_name

  INSTALLEDVERSION="3.2.0"
}
# Post-upgrade steps for 3.2.0.
# Globals: POSTVERSION (written) - set to 3.2.0 once the steps have run.
post_to_3.2.0() {
  # Make sure the so_soc database exists on hosts with an existing postgres data dir.
  bootstrap_so_soc_database

  # The agent installer regeneration was missed in post_to_3.1.0, so it is done here.
  printf '%s\n' "Regenerating Elastic Agent Installers"
  /sbin/so-elastic-agent-gen-installers

  POSTVERSION="3.2.0"
}
### 3.2.0 End ###
repo_sync() {
echo "Sync the local repo."
@@ -971,6 +1030,9 @@ verify_es_version_compatibility() {
local is_active_intermediate_upgrade=1
# supported upgrade paths for SO-ES versions
declare -A es_upgrade_map=(
["8.18.4"]="8.18.6 8.18.8 9.0.8"
["8.18.6"]="8.18.8 9.0.8"
["8.18.8"]="9.0.8"
["9.0.8"]="9.3.3"
)
@@ -994,6 +1056,171 @@ verify_es_version_compatibility() {
exit 160
fi
compatible_es_versions="$target_es_version"
for current_version in "${!es_upgrade_map[@]}"; do
# shellcheck disable=SC2076
if [[ " ${es_upgrade_map[$current_version]} " =~ " $target_es_version " ]]; then
compatible_es_versions+=" $current_version"
fi
done
# Report whether an Elasticsearch version is one that can move straight to the
# target version. Used to catch nodes that are lagging behind during an upgrade.
# Globals:   compatible_es_versions (read) - space-separated list of accepted versions
# Arguments: $1 - version to test; an empty value is never accepted
# Returns:   0 if $1 appears as a whole entry in the list, 1 otherwise
es_version_can_upgrade_to_target() {
  local candidate="$1"

  [[ -n "$candidate" ]] || return 1

  # Pad both sides with spaces so only complete list entries match; the quoted
  # pattern is compared literally.
  case " $compatible_es_versions " in
    *" $candidate "*) return 0 ;;
  esac

  return 1
}
# Gather Elasticsearch cluster version info and verify that each node in the cluster is running a version compatible with the target ES version.
#
# Steps:
#   1. List accepted salt minions whose id ends in "searchnode" (up to 3 attempts, 30 seconds apart).
#   2. Build the list of expected node names: this host (MINIONID with its last "_suffix" removed)
#      plus every discovered searchnode id with "_searchnode" removed.
#   3. Poll `so-elasticsearch-query _nodes/_all/version` (up to $retries passes, $delay seconds apart)
#      until every reported node passes es_version_can_upgrade_to_target AND every expected node
#      name appears in the response.
#
# Globals:  MINIONID, target_es_version (read)
#           SEARCHNODE_ES_VERSIONS (written; holds the output of the most recent query)
# Returns:  0 once all nodes are compatible; 1 if minion discovery fails or the retries run out.
# NOTE(review): searchnode_minion, node, current_version and expected_es_node are not declared
# local, so they are assigned in the caller's scope.
verify_searchnodes_es_target_compatibility() {
local retries=20
local retry_count=0
local delay=180
local expected_es_nodes searchnode_minions attempt
local searchnode_discovery_success=false
SEARCHNODE_ES_VERSIONS=""
# Discover searchnodes. pipefail is enabled only inside the command substitution so a
# salt-key failure is not hidden by jq's exit status.
for attempt in {1..3}; do
if searchnode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("searchnode"))'); then
searchnode_discovery_success=true
break
fi
echo "Failed to retrieve grid searchnodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3."
sleep 30
done
if [[ "$searchnode_discovery_success" != "true" ]]; then
echo "Failed to retrieve grid searchnodes via salt-key."
return 1
fi
# Always add node running soup to expected es nodes
expected_es_nodes="${MINIONID%_*}"
# Append one expected node name per line, skipping blank lines (e.g. when no searchnodes exist).
while IFS= read -r searchnode_minion; do
[[ -z "$searchnode_minion" ]] && continue
expected_es_nodes+=$'\n'"${searchnode_minion%_searchnode}"
done <<< "$searchnode_minions"
while [[ $retry_count -lt $retries ]]; do
# NOTE(review): 2>&1 folds stderr into the text that jq parses below — confirm that
# so-elasticsearch-query writes nothing to stderr on success.
SEARCHNODE_ES_VERSIONS=$(so-elasticsearch-query _nodes/_all/version --retry 5 --retry-delay 10 --fail 2>&1)
# $? is expanded before `local` runs, so this captures the query's exit status.
local exit_status=$?
if [[ $exit_status -ne 0 ]]; then
echo "Failed to retrieve Elasticsearch versions from searchnodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
((retry_count++))
sleep $delay
continue
fi
local all_searchnodes_compatible=true
# Pass 1: every node that reported must be on a version that can upgrade directly.
while IFS=$'\t' read -r node current_version; do
[[ -z "$node" ]] && continue
if ! es_version_can_upgrade_to_target "$current_version"; then
echo "Searchnode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version."
all_searchnodes_compatible=false
fi
done < <(echo "$SEARCHNODE_ES_VERSIONS" | jq -r '.nodes | to_entries[] | [.value.name, .value.version] | @tsv')
# Pass 2: every expected node must be present in the response; a missing node fails the check.
while IFS= read -r expected_es_node; do
[[ -z "$expected_es_node" ]] && continue
if ! echo "$SEARCHNODE_ES_VERSIONS" | jq -e --arg node "$expected_es_node" '.nodes | to_entries | any(.value.name == $node)' > /dev/null; then
echo "Searchnode $expected_es_node did not report an Elasticsearch version. It may be offline or still upgrading."
all_searchnodes_compatible=false
fi
done <<< "$expected_es_nodes"
if [[ "$all_searchnodes_compatible" == true ]]; then
echo "All Searchnodes are upgradable to Elasticsearch $target_es_version."
return 0
fi
echo "One or more Searchnodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries."
((retry_count++))
sleep $delay
done
# Retries exhausted without a fully compatible view of the cluster.
return 1
}
# Gather heavynode version info and verify that each node is running a version compatible with the target ES version.
#
# Steps:
#   1. List accepted salt minions whose id ends in "heavynode" (up to 3 attempts, 30 seconds apart).
#      If none are found the check is skipped and the function succeeds.
#   2. Ask every minion matching G@role:so-heavynode for its Elasticsearch version via salt cmd.run
#      (up to $retries passes, $delay seconds apart) until every reported version passes
#      es_version_can_upgrade_to_target AND every discovered heavynode id appears in the output.
#
# Globals:  target_es_version (read)
#           HEAVYNODE_ES_VERSIONS (written; holds the output of the most recent salt call)
# Returns:  0 if there are no heavynodes or all are compatible; 1 if minion discovery fails or
#           the retries run out.
# NOTE(review): node, current_version and heavynode_minion are not declared local, so they are
# assigned in the caller's scope.
verify_heavynodes_es_target_compatibility() {
local heavynode_minions attempt
local retries=20
local retry_count=0
local delay=180
local heavynode_discovery_success=false
HEAVYNODE_ES_VERSIONS=""
# Discover heavynodes. pipefail is enabled only inside the command substitution so a
# salt-key failure is not hidden by jq's exit status.
for attempt in {1..3}; do
if heavynode_minions=$(set -o pipefail; salt-key --out=json --list=accepted 2> /dev/null | jq -r '.minions[]? | select(endswith("heavynode"))'); then
heavynode_discovery_success=true
break
fi
echo "Failed to retrieve grid heavynodes via salt-key... Retrying in 30 seconds. Attempt $attempt of 3."
sleep 30
done
if [[ "$heavynode_discovery_success" != "true" ]]; then
echo "Failed to retrieve grid heavynodes via salt-key."
return 1
fi
if [[ -z "$heavynode_minions" ]]; then
echo "No heavynodes detected. Skipping heavynode Elasticsearch version compatibility check."
return 0
fi
while [[ $retry_count -lt $retries ]]; do
# Each heavynode runs the query locally; jq -e makes the remote command fail when
# .version.number is missing. Salt's stderr is discarded.
HEAVYNODE_ES_VERSIONS=$(salt -C 'G@role:so-heavynode' cmd.run 'set -o pipefail; so-elasticsearch-query / --retry 5 --retry-delay 10 | jq -er ".version.number"' shell=/bin/bash --out=json 2> /dev/null)
# $? is expanded before `local` runs, so this captures salt's exit status.
local exit_status=$?
if [[ $exit_status -ne 0 ]]; then
echo "Failed to retrieve Elasticsearch version from one or more heavynodes... Retrying in $delay seconds. Attempt $((retry_count + 1)) of $retries."
((retry_count++))
sleep $delay
continue
fi
local all_heavynodes_compatible=true
# Pass 1: each key/value pair in the salt output is treated as minion id and reported version.
# NOTE(review): presumably salt emits one {"<minion>": "<version>"} object per minion — confirm.
while IFS=$'\t' read -r node current_version; do
[[ -z "$node" ]] && continue
if ! es_version_can_upgrade_to_target "$current_version"; then
echo "Heavynode $node is running Elasticsearch $current_version, which is not directly upgradable to Elasticsearch $target_es_version."
all_heavynodes_compatible=false
fi
done < <(echo "$HEAVYNODE_ES_VERSIONS" | jq -r 'to_entries[] | [.key, .value] | @tsv')
# Pass 2: every discovered heavynode must have an entry; the output is slurped and merged
# into a single object before the key lookup.
while IFS= read -r heavynode_minion; do
[[ -z "$heavynode_minion" ]] && continue
if ! echo "$HEAVYNODE_ES_VERSIONS" | jq -se --arg minion "$heavynode_minion" 'add | has($minion)' > /dev/null; then
echo "Heavynode $heavynode_minion did not report an Elasticsearch version. It may be offline or still upgrading."
all_heavynodes_compatible=false
fi
done <<< "$heavynode_minions"
if [[ "$all_heavynodes_compatible" == true ]]; then
echo -e "\nAll heavynodes can upgrade to Elasticsearch $target_es_version."
return 0
fi
echo "One or more heavynodes cannot upgrade directly to Elasticsearch $target_es_version. Rechecking in $delay seconds. Attempt $((retry_count + 1)) of $retries."
((retry_count++))
sleep $delay
done
# Retries exhausted without every heavynode reporting a compatible version.
return 1
}
if [[ ! -f "$es_verification_script" ]]; then
create_intermediate_upgrade_verification_script "$es_verification_script"
fi
for statefile in "${es_required_version_statefile_base}"-*; do
[[ -f $statefile ]] || continue
@@ -1012,10 +1239,6 @@ verify_es_version_compatibility() {
continue
fi
if [[ ! -f "$es_verification_script" ]]; then
create_intermediate_upgrade_verification_script "$es_verification_script"
fi
echo -e "\n##############################################################################################################################\n"
echo "A previously required intermediate Elasticsearch upgrade was detected. Verifying that all Searchnodes/Heavynodes have successfully upgraded Elasticsearch to $es_required_version_statefile_value before proceeding with soup to avoid potential data loss! This command can take up to an hour to complete."
if ! timeout --foreground 4000 bash "$es_verification_script" "$es_required_version_statefile_value" "$statefile"; then
@@ -1037,6 +1260,26 @@ verify_es_version_compatibility() {
# shellcheck disable=SC2076 # Do not want a regex here eg usage " 8.18.8 9.0.8 " =~ " 9.0.8 "
if [[ " ${es_upgrade_map[$es_version]} " =~ " $target_es_version " || "$es_version" == "$target_es_version" ]]; then
if ! verify_searchnodes_es_target_compatibility || ! verify_heavynodes_es_target_compatibility; then
echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
echo "One or more Searchnode(s)/Heavynode(s) cannot upgrade directly to Elasticsearch $target_es_version. This can happen with soups that include Elasticsearch upgrades being run in quick succession. Typically, this will resolve itself as the grid synchronizes. Please allow time for all Searchnodes/Heavynodes to have upgraded Elasticsearch to a compatible version with $target_es_version before running soup again to avoid potential data loss!"
if [[ -n "$HEAVYNODE_ES_VERSIONS" ]]; then
echo "Current heavynode Elasticsearch versions:"
echo "$HEAVYNODE_ES_VERSIONS" | jq '.'
fi
if [[ -n "$SEARCHNODE_ES_VERSIONS" ]]; then
echo "Current searchnode Elasticsearch versions:"
echo "$SEARCHNODE_ES_VERSIONS" | jq '.nodes | to_entries | map({(.value.name): .value.version}) | sort | add'
fi
echo -e "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
exit 161
fi
# supported upgrade
return 0
else
@@ -1322,7 +1565,7 @@ EOF
# Keeping this block in case we need to do a hotfix that requires salt update
apply_hotfix() {
echo "No actions required. ($INSTALLEDVERSION/$HOTFIXVERSION)"
echo "No actions required. ($INSTALLEDVERSION/$HOTFIXVERSION)"
}
failed_soup_restore_items() {
@@ -1394,13 +1637,13 @@ main() {
echo "Verifying we have the latest soup script."
verify_latest_update_script
echo "Verifying Elasticsearch version compatibility before upgrading."
verify_es_version_compatibility
echo "Let's see if we need to update Security Onion."
upgrade_check
upgrade_space
echo "Verifying Elasticsearch version compatibility across the grid before upgrading."
verify_es_version_compatibility
echo "Checking for Salt Master and Minion updates."
upgrade_check_salt
set -e
@@ -1420,7 +1663,8 @@ main() {
echo "Applying $HOTFIXVERSION hotfix"
# Since we don't run the backup.config_backup state on import, we won't snapshot the previous version's states and pillars.
if [[ ! "$MINION_ROLE" == "import" ]]; then
backup_old_states_pillars
echo "Running so-config-backup script."
/sbin/so-config-backup
fi
copy_new_files
create_local_directories "/opt/so/saltstack/default"
@@ -1476,8 +1720,8 @@ main() {
# Since we don't run the backup.config_backup state on import, we won't snapshot the previous version's states and pillars.
if [[ ! "$MINION_ROLE" == "import" ]]; then
echo ""
echo "Creating snapshots of default and local Salt states and pillars and saving to /nsm/backup/"
backup_old_states_pillars
echo "Running so-config-backup script."
/sbin/so-config-backup
fi
echo ""
+2
View File
@@ -225,6 +225,7 @@ http {
limit_req zone=auth_throttle burst={{ NGINXMERGED.config.throttle_login_burst }} nodelay;
limit_req_status 429;
proxy_pass http://{{ GLOBALS.manager }}:4433;
proxy_set_header Connection "Close";
proxy_read_timeout 90;
proxy_connect_timeout 90;
proxy_set_header Host $host;
@@ -237,6 +238,7 @@ http {
location ~ ^/auth/.*?(whoami|logout|settings|errors|webauthn.js) {
rewrite /auth/(.*) /$1 break;
proxy_pass http://{{ GLOBALS.manager }}:4433;
proxy_set_header Connection "Close";
proxy_read_timeout 90;
proxy_connect_timeout 90;
proxy_set_header Host $host;
+3 -3
View File
@@ -46,10 +46,10 @@ postgresinitdir:
- require:
- file: postgresconfdir
postgresinitusers:
postgresinitdb:
file.managed:
- name: /opt/so/conf/postgres/init/init-users.sh
- source: salt://postgres/files/init-users.sh
- name: /opt/so/conf/postgres/init/init-db.sh
- source: salt://postgres/files/init-db.sh
- user: 939
- group: 939
- mode: 755
+4 -4
View File
@@ -31,7 +31,7 @@ so-postgres:
- POSTGRES_DB=securityonion
# Passwords are delivered via mounted 0600 secret files, not plaintext env vars.
# The upstream postgres image resolves POSTGRES_PASSWORD_FILE; entrypoint.sh and
# init-users.sh resolve SO_POSTGRES_PASS_FILE the same way.
# init-db.sh resolve SO_POSTGRES_PASS_FILE the same way.
- POSTGRES_PASSWORD_FILE=/run/secrets/postgres_password
- SO_POSTGRES_USER={{ SO_POSTGRES_USER }}
- SO_POSTGRES_PASS_FILE=/run/secrets/so_postgres_pass
@@ -46,7 +46,7 @@ so-postgres:
- /opt/so/conf/postgres/postgresql.conf:/conf/postgresql.conf:ro
- /opt/so/conf/postgres/pg_hba.conf:/conf/pg_hba.conf:ro
- /opt/so/conf/postgres/secrets:/run/secrets:ro
- /opt/so/conf/postgres/init/init-users.sh:/docker-entrypoint-initdb.d/init-users.sh:ro
- /opt/so/conf/postgres/init/init-db.sh:/docker-entrypoint-initdb.d/init-db.sh:ro
- /etc/pki/postgres.crt:/conf/postgres.crt:ro
- /etc/pki/postgres.key:/conf/postgres.key:ro
- /etc/pki/tls/certs/intca.crt:/conf/ca.crt:ro
@@ -70,7 +70,7 @@ so-postgres:
- watch:
- file: postgresconf
- file: postgreshba
- file: postgresinitusers
- file: postgresinitdb
- file: postgres_super_secret
- file: postgres_app_secret
- x509: postgres_crt
@@ -78,7 +78,7 @@ so-postgres:
- require:
- file: postgresconf
- file: postgreshba
- file: postgresinitusers
- file: postgresinitdb
- file: postgres_super_secret
- file: postgres_app_secret
- x509: postgres_crt
@@ -32,3 +32,8 @@ EOSQL
if ! psql -U "$POSTGRES_USER" -tAc "SELECT 1 FROM pg_database WHERE datname='so_telegraf'" | grep -q 1; then
psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -c "CREATE DATABASE so_telegraf"
fi
# Bootstrap the SOC database.
if ! psql -U "$POSTGRES_USER" -tAc "SELECT 1 FROM pg_database WHERE datname='so_soc'" | grep -q 1; then
psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -c "CREATE DATABASE so_soc"
fi
+18 -85
View File
@@ -18,38 +18,22 @@ include:
{% set TG_OUT = TELEGRAFMERGED.output | upper %}
{% if TG_OUT in ['POSTGRES', 'BOTH'] %}
# docker_container.running returns as soon as the container starts, but on
# first-init docker-entrypoint.sh starts a temporary postgres with
# `listen_addresses=''` to run /docker-entrypoint-initdb.d scripts, then
# shuts it down before exec'ing the real CMD. A default pg_isready check
# (Unix socket) passes during that ephemeral phase and races the shutdown
# with "the database system is shutting down". Checking TCP readiness on
# 127.0.0.1 only succeeds after the final postgres binds the port.
postgres_wait_ready:
cmd.run:
- name: |
for i in $(seq 1 60); do
if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then
exit 0
fi
sleep 2
done
echo "so-postgres did not accept TCP connections within 120s" >&2
exit 1
- name: /usr/sbin/so-postgres-wait
- require:
- docker_container: so-postgres
- file: postgres_sbin
# Ensure the shared Telegraf database exists. init-users.sh only runs on a
# Ensure the shared Telegraf database exists. init-db.sh only runs on a
# fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume
# would otherwise never get so_telegraf.
postgres_create_telegraf_db:
cmd.run:
- name: |
if ! docker exec so-postgres psql -U postgres -tAc "SELECT 1 FROM pg_database WHERE datname='so_telegraf'" | grep -q 1; then
docker exec so-postgres psql -v ON_ERROR_STOP=1 -U postgres -c "CREATE DATABASE so_telegraf"
fi
- name: /usr/sbin/so-telegraf-postgres create_db
- require:
- cmd: postgres_wait_ready
- file: postgres_sbin
# Provision the shared group role and schema once. Every per-minion role is a
# member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf
@@ -57,68 +41,26 @@ postgres_create_telegraf_db:
# on first write are owned by the group role and every member can INSERT/SELECT.
postgres_telegraf_group_role:
cmd.run:
- name: |
docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'so_telegraf') THEN
CREATE ROLE so_telegraf NOLOGIN;
END IF;
END
$$;
GRANT CONNECT ON DATABASE so_telegraf TO so_telegraf;
CREATE SCHEMA IF NOT EXISTS telegraf AUTHORIZATION so_telegraf;
GRANT USAGE, CREATE ON SCHEMA telegraf TO so_telegraf;
CREATE SCHEMA IF NOT EXISTS partman;
CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman;
CREATE EXTENSION IF NOT EXISTS pg_cron;
-- Telegraf (running as so_telegraf) calls partman.create_parent()
-- on first write of each metric, which needs USAGE on the partman
-- schema, EXECUTE on its functions/procedures, and write access to
-- partman.part_config so it can register new partitioned parents.
GRANT USAGE, CREATE ON SCHEMA partman TO so_telegraf;
GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA partman TO so_telegraf;
GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA partman TO so_telegraf;
GRANT EXECUTE ON ALL PROCEDURES IN SCHEMA partman TO so_telegraf;
-- partman creates per-parent template tables (partman.template_*) at
-- runtime; default privileges extend DML/sequence access to them.
ALTER DEFAULT PRIVILEGES IN SCHEMA partman
GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO so_telegraf;
ALTER DEFAULT PRIVILEGES IN SCHEMA partman
GRANT USAGE, SELECT, UPDATE ON SEQUENCES TO so_telegraf;
-- Hourly partman maintenance. cron.schedule is idempotent by jobname.
SELECT cron.schedule(
'telegraf-partman-maintenance',
'17 * * * *',
'CALL partman.run_maintenance_proc()'
);
EOSQL
- name: /usr/sbin/so-telegraf-postgres group_role
- require:
- cmd: postgres_create_telegraf_db
- file: postgres_sbin
{% set creds = salt['pillar.get']('telegraf:postgres_creds', {}) %}
{% for mid, entry in creds.items() %}
{% if entry.get('user') and entry.get('pass') %}
{% set u = entry.user %}
{% set p = entry.pass | replace("'", "''") %}
{% set p = entry.pass %}
postgres_telegraf_role_{{ u }}:
cmd.run:
- name: |
docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{{ u }}') THEN
EXECUTE format('CREATE ROLE %I WITH LOGIN PASSWORD %L', '{{ u }}', '{{ p }}');
ELSE
EXECUTE format('ALTER ROLE %I WITH PASSWORD %L', '{{ u }}', '{{ p }}');
END IF;
END
$$;
GRANT CONNECT ON DATABASE so_telegraf TO "{{ u }}";
GRANT so_telegraf TO "{{ u }}";
EOSQL
- name: /usr/sbin/so-telegraf-postgres user
- env:
- ROLE_USER: {{ u | tojson }}
- ROLE_PASS: {{ p | tojson }}
- hide_output: True
- require:
- file: postgres_sbin
- cmd: postgres_telegraf_group_role
{% endif %}
@@ -130,21 +72,12 @@ postgres_telegraf_role_{{ u }}:
{% set retention = salt['pillar.get']('postgres:telegraf:retention_days', 14) | int %}
postgres_telegraf_retention_reconcile:
cmd.run:
- name: |
docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
DO $$
BEGIN
IF EXISTS (SELECT 1 FROM pg_catalog.pg_extension WHERE extname = 'pg_partman') THEN
UPDATE partman.part_config
SET retention = '{{ retention }} days',
retention_keep_table = false
WHERE parent_table LIKE 'telegraf.%';
END IF;
END
$$;
EOSQL
- name: /usr/sbin/so-telegraf-postgres retention
- env:
- RETENTION_DAYS: {{ retention }}
- require:
- cmd: postgres_telegraf_group_role
- file: postgres_sbin
{% endif %}
+41 -7
View File
@@ -7,15 +7,29 @@
. /usr/sbin/so-common
# Without pipefail, a pipeline's exit status is gzip's. A failed pg_dumpall would
# otherwise be masked by a successful gzip, silently producing a valid .gz that
# holds a truncated dump.
set -o pipefail
# Backups contain role password hashes and full chat data; keep them 0600.
umask 0077
TODAY=$(date '+%Y_%m_%d')
BACKUPDIR=/nsm/backup
BACKUPFILE="$BACKUPDIR/so-postgres-backup-$TODAY.sql.gz"
TMPFILE="$BACKUPFILE.tmp"
MAXBACKUPS=7
LOGFILE=/opt/so/log/postgres/backup.log
mkdir -p $BACKUPDIR
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') $*" >> "$LOGFILE"
}
mkdir -p "$BACKUPDIR"
# Remove any temp files left behind by a previously crashed run
rm -f "$BACKUPDIR"/so-postgres-backup-*.sql.gz.tmp
# Skip if already backed up today
if [ -f "$BACKUPFILE" ]; then
@@ -27,13 +41,33 @@ if ! docker ps --format '{{.Names}}' | grep -q '^so-postgres$'; then
exit 0
fi
# Dump all databases and roles, compress
docker exec so-postgres pg_dumpall -U postgres | gzip > "$BACKUPFILE"
# Always clean up the temp file on exit; the success path clears this trap
# after the atomic rename so the finished backup is not deleted.
trap 'rm -f "$TMPFILE"' EXIT
# Retention cleanup
NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l)
# Dump all databases and roles, compress. Write to a temp file so the final
# filename only ever appears for a complete, verified backup.
if ! docker exec so-postgres pg_dumpall -U postgres | gzip > "$TMPFILE"; then
log "ERROR: pg_dumpall/gzip failed; backup aborted"
exit 1
fi
# Verify the compressed stream is intact before publishing it
if ! gzip -t "$TMPFILE"; then
log "ERROR: backup failed gzip integrity check; backup aborted"
exit 1
fi
# Atomically publish the verified backup
mv "$TMPFILE" "$BACKUPFILE"
trap - EXIT
log "OK: wrote $BACKUPFILE"
# Retention cleanup (only reached after a successful backup). The glob is
# restricted to finished backups so an in-progress .tmp can never be counted.
NUMBACKUPS=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" | wc -l)
while [ "$NUMBACKUPS" -gt "$MAXBACKUPS" ]; do
OLDEST=$(find $BACKUPDIR -type f -name "so-postgres-backup*" -printf '%T+ %p\n' | sort | head -n 1 | awk -F" " '{print $2}')
OLDEST=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" -printf '%T+ %p\n' | sort | head -n 1 | awk -F" " '{print $2}')
rm -f "$OLDEST"
NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l)
NUMBACKUPS=$(find "$BACKUPDIR" -type f -name "so-postgres-backup-*.sql.gz" | wc -l)
done
+32
View File
@@ -0,0 +1,32 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
# Wait for the so-postgres container to accept TCP connections.
#
# docker_container.running returns as soon as the container starts, but on
# first-init docker-entrypoint.sh starts a temporary postgres with
# `listen_addresses=''` to run /docker-entrypoint-initdb.d scripts, then
# shuts it down before exec'ing the real CMD. A default pg_isready check
# (Unix socket) passes during that ephemeral phase and races the shutdown
# with "the database system is shutting down". Checking TCP readiness on
# 127.0.0.1 only succeeds after the final postgres binds the port.
#
# Usage: so-postgres-wait [iterations] [sleep_seconds]
# Default: 60 iterations, 2s sleep (~120s total).
# Optional positional overrides: $1 = number of probes, $2 = seconds to pause between probes.
max_probes="${1:-60}"
probe_pause="${2:-2}"

for attempt in $(seq 1 "$max_probes"); do
  # -h 127.0.0.1 makes pg_isready probe over TCP; leave as soon as one probe succeeds.
  docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null && exit 0
  sleep "$probe_pause"
done

# Every probe failed: report the total time waited on stderr and signal failure.
>&2 echo "so-postgres did not accept TCP connections within $((max_probes * probe_pause))s"
exit 1
@@ -0,0 +1,110 @@
#!/bin/bash
set -e
# Provision Telegraf state inside the so-postgres container.
#
# Every subcommand runs psql as the postgres superuser inside the so-postgres
# container via `docker exec`. psql is started with ON_ERROR_STOP=1 so the
# first failing statement makes psql exit non-zero, and `set -e` then aborts
# this script with a failure status.
#
# Usage: so-telegraf-postgres <subcommand>
# create_db Ensure the so_telegraf database exists.
# group_role Provision the so_telegraf group role, telegraf/partman schemas,
# pg_partman, pg_cron, and the hourly partman maintenance job.
# user Create or update a per-minion login role granted to so_telegraf.
# Env: ROLE_USER, ROLE_PASS.
# retention Reconcile partman retention on telegraf parents.
# Env: RETENTION_DAYS (whole number of days).
#
# Exit status: 0 on success; non-zero when the subcommand is missing or
# unknown, a required environment variable is unset/empty or invalid, or
# docker/psql fails.
cmd="${1:?subcommand required}"
case "$cmd" in
  create_db)
    # CREATE DATABASE has no IF NOT EXISTS form, so probe pg_database first
    # and only create the database when the probe finds no row.
    if ! docker exec so-postgres psql -U postgres -tAc \
      "SELECT 1 FROM pg_database WHERE datname='so_telegraf'" | grep -q 1; then
      docker exec so-postgres psql -v ON_ERROR_STOP=1 -U postgres \
        -c "CREATE DATABASE so_telegraf"
    fi
    ;;
  group_role)
    # The heredoc delimiter is quoted, so the shell passes the SQL through
    # untouched (the $$ dollar-quotes are not expanded).
    docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'so_telegraf') THEN
CREATE ROLE so_telegraf NOLOGIN;
END IF;
END
$$;
GRANT CONNECT ON DATABASE so_telegraf TO so_telegraf;
CREATE SCHEMA IF NOT EXISTS telegraf AUTHORIZATION so_telegraf;
GRANT USAGE, CREATE ON SCHEMA telegraf TO so_telegraf;
CREATE SCHEMA IF NOT EXISTS partman;
CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman;
CREATE EXTENSION IF NOT EXISTS pg_cron;
-- Telegraf (running as so_telegraf) calls partman.create_parent()
-- on first write of each metric, which needs USAGE on the partman
-- schema, EXECUTE on its functions/procedures, and write access to
-- partman.part_config so it can register new partitioned parents.
GRANT USAGE, CREATE ON SCHEMA partman TO so_telegraf;
GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA partman TO so_telegraf;
GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA partman TO so_telegraf;
GRANT EXECUTE ON ALL PROCEDURES IN SCHEMA partman TO so_telegraf;
-- partman creates per-parent template tables (partman.template_*) at
-- runtime; default privileges extend DML/sequence access to them.
ALTER DEFAULT PRIVILEGES IN SCHEMA partman
GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO so_telegraf;
ALTER DEFAULT PRIVILEGES IN SCHEMA partman
GRANT USAGE, SELECT, UPDATE ON SEQUENCES TO so_telegraf;
-- Hourly partman maintenance. cron.schedule is idempotent by jobname.
SELECT cron.schedule(
'telegraf-partman-maintenance',
'17 * * * *',
'CALL partman.run_maintenance_proc()'
);
EOSQL
    ;;
  user)
    : "${ROLE_USER:?ROLE_USER is required}"
    : "${ROLE_PASS:?ROLE_PASS is required}"
    # psql does not substitute :vars inside dollar-quoted strings, so the
    # conditional CREATE/ALTER is built outside any DO block and dispatched
    # with \gexec. format() handles identifier/literal quoting.
    #
    # NOTE(review): role_pass is handed to psql as a command-line argument,
    # so the password is visible in the process list while this runs.
    # Consider delivering it over stdin or the environment instead.
    docker exec -i so-postgres psql \
      -v ON_ERROR_STOP=1 \
      -v role_user="$ROLE_USER" \
      -v role_pass="$ROLE_PASS" \
      -U postgres -d so_telegraf <<'EOSQL'
SELECT format(
CASE WHEN EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = :'role_user')
THEN 'ALTER ROLE %I WITH LOGIN PASSWORD %L'
ELSE 'CREATE ROLE %I WITH LOGIN PASSWORD %L'
END,
:'role_user',
:'role_pass'
) \gexec
GRANT CONNECT ON DATABASE so_telegraf TO :"role_user";
GRANT so_telegraf TO :"role_user";
EOSQL
    ;;
  retention)
    : "${RETENTION_DAYS:?RETENTION_DAYS is required}"
    # The value is concatenated into '<n> days' and written to
    # partman.part_config as-is, so a malformed value would be persisted and
    # only surface later when it is interpreted. Reject it up front.
    if ! [[ "$RETENTION_DAYS" =~ ^[0-9]+$ ]]; then
      echo "RETENTION_DAYS must be a non-negative integer (got '$RETENTION_DAYS')" >&2
      exit 1
    fi
    # \gset + \if guards against a missing pg_partman without using a DO
    # block (psql :var substitution doesn't reach into dollar-quoted code).
    docker exec -i so-postgres psql \
      -v ON_ERROR_STOP=1 \
      -v retention_days="$RETENTION_DAYS" \
      -U postgres -d so_telegraf <<'EOSQL'
SELECT CASE WHEN EXISTS (SELECT 1 FROM pg_catalog.pg_extension WHERE extname = 'pg_partman')
THEN 'true' ELSE 'false' END AS has_partman \gset
\if :has_partman
UPDATE partman.part_config
SET retention = :'retention_days' || ' days',
retention_keep_table = false
WHERE parent_table LIKE 'telegraf.%';
\endif
EOSQL
    ;;
  *)
    echo "Unknown subcommand: $cmd" >&2
    exit 1
    ;;
esac
+6
View File
@@ -1519,6 +1519,12 @@ soc:
serviceAccountJSON: ""
serviceAccountLocation: ""
healthTimeoutSeconds: 5
onionconfig:
saltstackDir: /opt/so/saltstack
bypassEnabled: false
postgres:
# host and password ship empty (null) in these defaults.
# NOTE(review): the SOC Jinja merge logic appears to substitute the manager
# hostname and the secrets:postgres_pass pillar value when these are left
# empty -- confirm before hard-coding values here.
host:
password:
salt:
queueDir: /opt/sensoroni/queue
timeoutMs: 45000
+7
View File
@@ -16,6 +16,13 @@
{% do SOCMERGED.config.server.update({'additionalCA': MANAGERMERGED.additionalCA}) %}
{% do SOCMERGED.config.server.update({'insecureSkipVerify': MANAGERMERGED.insecureSkipVerify}) %}
{# When no postgres host is configured, fall back to the manager (GLOBALS.manager). #}
{% if not SOCMERGED.config.server.modules.postgres.host %}
{% do SOCMERGED.config.server.modules.postgres.update({'host': GLOBALS.manager}) %}
{% endif %}
{# When no postgres password is configured, fall back to the secrets:postgres_pass pillar value (empty string if that pillar key is missing). #}
{% if not SOCMERGED.config.server.modules.postgres.password %}
{% do SOCMERGED.config.server.modules.postgres.update({'password': salt['pillar.get']('secrets:postgres_pass', '')}) %}
{% endif %}
{# if SOCMERGED.config.server.modules.cases == httpcase details come from the soc pillar #}
{% if SOCMERGED.config.server.modules.cases != 'soc' %}
{% do SOCMERGED.config.server.modules.elastic.update({'casesEnabled': false}) %}
+21
View File
@@ -453,6 +453,26 @@ soc:
description: Duration (in milliseconds) that must elapse after a grid node fails to check-in before the node will be marked offline (fault).
global: True
advanced: True
onionconfig:
saltstackDir:
description: Root directory containing the SaltStack tree that SOC reads and writes configuration from. Should not be changed under normal circumstances.
global: True
advanced: True
bypassEnabled:
description: When enabled, errors encountered while reading the SaltStack pillar tree (missing files, unreadable directories, etc.) are logged but do not prevent SOC from starting or serving settings. Intended for advanced troubleshooting and recovery scenarios when the pillar tree is partially unreadable.
global: True
advanced: True
forcedType: bool
postgres:
host:
description: Hostname or IP address of the PostgreSQL server used by SOC. Defaults to the manager hostname.
global: True
advanced: True
password:
description: Password used by SOC to authenticate to the PostgreSQL server. Defaults to the postgres superuser password seeded in the secrets pillar.
global: True
sensitive: True
advanced: True
salt:
longRelayTimeoutMs:
description: Duration (in milliseconds) to wait for a response from the Salt API when executing tasks known for being long running before giving up and showing an error on the SOC UI.
@@ -818,6 +838,7 @@ soc:
description: List of available external tools visible in the SOC UI. Each tool is defined in JSON object notation, and must include the "name" key and "link" key, where the link is the tool's URL.
global: True
advanced: True
multiline: True
forcedType: "[]{}"
exportNodeId:
description: The node ID on which export jobs will be executed.
Binary file not shown.
Binary file not shown.