Compare commits

..

129 Commits

Author SHA1 Message Date
Mike Reeves 3e49322220 Allow preconfigured management bond in requirements 2026-05-05 15:35:12 -04:00
Mike Reeves ecb92d43fc Limit management bond setup to ISO installs 2026-05-05 15:30:09 -04:00
Mike Reeves 3b714db0bf Show management bond option consistently 2026-05-05 15:22:40 -04:00
Mike Reeves f17da4e68b Add management bond setup option 2026-05-05 15:13:24 -04:00
Jorge Reyes 77a4ad877e Merge pull request #15851 from Security-Onion-Solutions/reyesj2/integration-transforms 2026-05-01 14:11:12 -05:00
reyesj2 702b3585cc excluding additional integration transform job failures 2026-05-01 12:57:59 -05:00
reyesj2 86966d2778 reauthorize unhealthy transform jobs using kibana 9.3.3 auth flow 2026-05-01 12:44:08 -05:00
Jorge Reyes ce3ad3a895 Merge pull request #15844 from Security-Onion-Solutions/reyesj2/elastic-agent-warning
update default elastic agent logging level to warning
2026-04-30 09:46:28 -05:00
reyesj2 39d0947102 update default elastic agent logging level to warning 2026-04-29 17:38:40 -05:00
Jorge Reyes 0085d9a353 Merge pull request #15842 from Security-Onion-Solutions/reyesj2-patch-1
so-elastic-fleet-outputs-update now checks for cert drift. Remove run…
2026-04-29 12:37:04 -05:00
Jorge Reyes 2f01ce3b23 so-elastic-fleet-outputs-update now checks for cert drift. Remove running --cert arg on cert change to prevent highstate from running outputs-update 2x 2026-04-29 12:33:28 -05:00
Mike Reeves 71b19c1b5f Merge pull request #15840 from Security-Onion-Solutions/fix/import-postgres-firewall
Open postgres in DOCKER-USER firewall everywhere influxdb is open
2026-04-29 09:20:03 -04:00
Mike Reeves 82e55ae87f Open postgres on every hostgroup that opens influxdb
The static defaults only listed postgres on each role's self-hostgroup,
leaving sensor/searchnode/heavynode/receiver/fleet/idh/desktop/hypervisor
hostgroups unable to reach the manager's so-postgres in distributed
grids. A dynamic block in firewall/map.jinja added postgres to those
hostgroups only when telegraf.output was switched to POSTGRES/BOTH,
which left postgres unreachable by default.

Statically mirror postgres alongside influxdb across manager/managerhype/managersearch/
standalone for every hostgroup that already lists influxdb, and drop
the now-redundant telegraf-gated dynamic block from firewall/map.jinja.
2026-04-29 09:09:50 -04:00
Mike Reeves 3e02001544 Open postgres port for import role in DOCKER-USER firewall
When so-postgres was wired in (868cd1187), the import role's firewall
defaults were missed while every other manager-class role (manager,
managerhype, managersearch, standalone, eval) had postgres added to
their DOCKER-USER manager-hostgroup portgroups. As a result, on a
fresh import install the so-postgres container starts but tcp/5432 is
dropped at DOCKER-USER, so soc/kratos/telegraf can't reach it.

Add postgres alongside the existing influxdb entry so import nodes
match the other roles.
2026-04-29 08:48:45 -04:00
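A quick way to confirm the fix on an import node (illustrative check; exact rule text varies by install):
    # tcp/5432 should now be accepted in DOCKER-USER alongside the existing influxdb entry
    sudo iptables -L DOCKER-USER -n -v | grep 5432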
Mike Reeves 82f70bb53a Merge pull request #15839 from Security-Onion-Solutions/fix/drop-postgres-soc-module-injection
drop postgres module from soc defaults injection
2026-04-28 15:48:49 -04:00
Mike Reeves 2dcded6cca drop postgres module from soc defaults injection
The soc binary on 3/dev does not register a postgres module, so injecting
postgres into soc.config.server.modules makes soc abort at launch with
'Module does not exist: postgres'. The soc-side module is staged on
feature/postgres but is not landing this release. Drop the injection
until the module ships; salt/postgres state and pillars are unchanged.
2026-04-28 15:46:56 -04:00
Mike Reeves 8ca59e6f0c Merge pull request #15838 from Security-Onion-Solutions/fix/docker-refresh-multiarch-pull
Fix/docker refresh multiarch pull
2026-04-28 15:14:27 -04:00
Mike Reeves 82dac82d15 drop platform/digest pull resolution
The digest-pull logic was added to make `docker push` work for multi-arch
upstream tags. Now that the push step is `docker buildx imagetools create`
pinned to the gpg-verified RepoDigest, the registry-to-registry copy
handles single- and multi-arch sources without help. Reverts the pull
back to the original line and removes the unused PLATFORM_OS/_ARCH
detection.
2026-04-28 14:54:25 -04:00
Mike Reeves 288a823edf push images via buildx imagetools create
Replaces `docker push` with a registry-to-registry copy. On Docker 29.x
with the containerd image store, `docker push` of a freshly-pulled image
hits a path that wraps single-platform manifests in a synthetic index
and then can't push the layers it claims to reference, producing
`NotFound: content digest ...` even when the image is fully present.

Keep the local `docker tag` so so-image-pull's `docker images | grep :5000`
existence check continues to work.
2026-04-28 14:49:02 -04:00
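The resulting push path, pulled from the so-image-common hunk further down (variables come from that script):
    # resolve the gpg-verified RepoDigest from the earlier docker inspect output
    VERIFIED_REF=$(echo "$DOCKERINSPECT" | jq -r ".[0].RepoDigests[] | select(. | contains(\"$CONTAINER_REGISTRY\"))" | head -n 1)
    # keep the local tag so so-image-pull's ':5000' existence check still passes
    docker tag "$CONTAINER_REGISTRY/$IMAGEREPO/$image" "$HOSTNAME:5000/$IMAGEREPO/$image"
    # registry-to-registry copy pinned to the verified digest, replacing `docker push`
    docker buildx imagetools create --tag "$HOSTNAME:5000/$IMAGEREPO/$image" "$VERIFIED_REF"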
Jorge Reyes f9e3d30a71 Merge pull request #15837 from Security-Onion-Solutions/reyesj2/elastic-fleet-cert-check
check current fleet policy cert against cert on disk
2026-04-28 13:47:55 -05:00
reyesj2 9cec79b299 check current fleet policy cert against cert on disk
Co-authored-by: Copilot <copilot@github.com>
2026-04-28 13:34:39 -05:00
Mike Reeves c86399327b fix so-docker-refresh push for multi-arch source images
docker pull of a multi-arch tag on Docker 29.x leaves the local tag
pointing at the image index rather than the platform-specific manifest.
The subsequent docker push then tries to push every sub-manifest the
index references and fails on layers we never fetched.

Resolve the local-platform manifest digest from the upstream index via
docker buildx imagetools inspect, pull by that digest, and re-tag locally
to the canonical tag. The signing flow and the existing tag/push to the
embedded registry are unchanged.
2026-04-28 14:27:59 -04:00
Mike Reeves fa8162de02 Merge pull request #15749 from Security-Onion-Solutions/feature/postgres
Add so-postgres Salt states and infrastructure
2026-04-28 10:15:47 -04:00
Josh Patterson 33abc429d1 Merge pull request #15835 from Security-Onion-Solutions/fix/reactor/sominon_setup
fix sominion_setup reactor
2026-04-28 08:55:58 -04:00
Jorge Reyes b22585ca90 Merge pull request #15833 from Security-Onion-Solutions/reyesj2-es933
exclude more transform job errors
2026-04-27 15:05:11 -05:00
reyesj2 9f2ca7012f exclude more transform job errors 2026-04-27 15:02:13 -05:00
Josh Patterson 21aeb68188 fix sominion_setup reactor 2026-04-27 14:30:41 -04:00
Josh Patterson 81e60ec5bf Merge pull request #15829 from Security-Onion-Solutions/fix/reinstall2
fix reinstall
2026-04-24 16:20:53 -04:00
Josh Patterson 199c2746f1 stop salt-minion and salt-master regardless of install type. display reinstall on console and save to logfile 2026-04-24 15:24:11 -04:00
Josh Patterson 8eca465ef6 uninstall elastic-agent before stopping dockers on reinstall 2026-04-24 14:35:11 -04:00
Jorge Reyes a45e59239f Merge pull request #15826 from Security-Onion-Solutions/reyesj2-es933
heavynode should run es cluster state
2026-04-24 13:07:48 -05:00
Josh Patterson 2ad0bcab7c Merge pull request #15828 from Security-Onion-Solutions/fix/annotations
readonly soc and kratos enabled
2026-04-24 14:00:02 -04:00
Josh Patterson 070d150420 readonly soc and kratos enabled 2026-04-24 13:56:35 -04:00
reyesj2 90ecbe90d8 allow heavynodes to run elasticsearch/cluster state 2026-04-24 12:56:27 -05:00
Josh Patterson 813fa03dc3 Merge pull request #15824 from Security-Onion-Solutions/fix/reinstall2
fix reinstall issue with salt
2026-04-24 12:22:54 -04:00
Josh Patterson 02381fbbe9 stop salt-cloud, belt-and-suspenders against a broken/incomplete salt RPM 2026-04-24 11:33:21 -04:00
Josh Patterson 0722b681b1 redo service stop on reinstall 2026-04-24 11:04:46 -04:00
Josh Patterson 564815e836 redo how services are stopped during reinstall 2026-04-24 10:46:29 -04:00
Jorge Reyes 88b30adf7f Merge pull request #15823 from Security-Onion-Solutions/reyesj2-es933
typo
2026-04-24 09:27:08 -05:00
reyesj2 b6acf3b522 typo 2026-04-24 09:24:58 -05:00
Jason Ertel ba55468da8 Merge pull request #15822 from Security-Onion-Solutions/jertel/wip
numeric test description
2026-04-24 08:26:55 -04:00
Jason Ertel cdd217283d numeric test description 2026-04-24 08:13:36 -04:00
Jorge Reyes 810a582717 Merge pull request #15813 from Security-Onion-Solutions/reyesj2-es933
split up Elastic Fleet state
2026-04-23 14:51:32 -05:00
Mike Reeves a6948e8dcb Remove helpLink for influxdb in soc_global.yaml
Removed helpLink for influxdb from endgamehost configuration.
2026-04-23 13:56:41 -04:00
Mike Reeves 5f35554fdc Merge pull request #15712 from Security-Onion-Solutions/soupfix
Fix soup
2026-04-23 12:39:50 -04:00
Mike Reeves 0ecc7ae594 soup: drop --local from postgres.telegraf_users reconcile
The manager's /etc/salt/minion (written by so-functions:configure_minion)
has no file_roots, so salt-call --local falls back to Salt's default
/srv/salt and fails with "No matching sls found for 'postgres.telegraf_users'
in env 'base'". || true was silently swallowing the error, which meant the
DB roles for the pillar entries just populated by the so-telegraf-cred
backfill loop never actually got created.

Route through salt-master instead; its file_roots already points at the
default/local salt trees.
2026-04-23 11:25:44 -04:00
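In soup terms the change is just dropping the flag:
    # before: --local falls back to /srv/salt, can't find postgres.telegraf_users, and || true hides the failure
    salt-call --local state.apply postgres.telegraf_users || true
    # after: render through salt-master, whose file_roots cover the default and local salt trees
    salt-call state.apply postgres.telegraf_users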
reyesj2 fdfca469cc prevent non-manager nodes from running elasticsearch.cluster state manually 2026-04-23 09:53:07 -05:00
reyesj2 5f2ec76ba8 prevent fleetnode from being able to run elasticfleet.manager state manually 2026-04-23 09:50:45 -05:00
reyesj2 b015c8ff14 remove docker import 2026-04-23 09:31:30 -05:00
reyesj2 7e70870a9e remove globals import 2026-04-23 09:25:36 -05:00
Mike Reeves eadad6c163 soup: bootstrap postgres pillar stubs and secret on 3.0.0 upgrade
pillar/top.sls now references postgres.soc_postgres / postgres.adv_postgres
unconditionally, but make_some_dirs only runs at install time so managers
upgrading from 3.0.0 have no local/pillar/postgres/ and salt-master fails
pillar render on the first post-upgrade restart. Similarly, secrets_pillar
is a no-op on upgrade (secrets.sls already exists), so secrets:postgres_pass
never gets seeded and the postgres container's POSTGRES_PASSWORD_FILE and
SOC's PG_ADMIN_PASS would land empty after highstate.

Add ensure_postgres_local_pillar and ensure_postgres_secret to up_to_3.1.0
so the stubs and secret exist before masterlock/salt-master restart. Both
are idempotent and safe to re-run.
2026-04-23 10:01:38 -04:00
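A rough sketch of what ensure_postgres_local_pillar has to guarantee (stub filenames follow pillar/top.sls; the stub contents shown are an assumption, not the exact soup code):
    # create empty postgres pillar stubs so pillar/top.sls renders on the first post-upgrade restart
    mkdir -p /opt/so/saltstack/local/pillar/postgres
    for f in soc_postgres.sls adv_postgres.sls; do
      [ -f "/opt/so/saltstack/local/pillar/postgres/$f" ] || echo 'postgres: {}' > "/opt/so/saltstack/local/pillar/postgres/$f"
    done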
reyesj2 22b32a16dd include elasticfleet.config 2026-04-23 08:30:47 -05:00
reyesj2 22f869734e add check for files before attempting to use file pattern to load templates 2026-04-22 23:11:31 -05:00
reyesj2 398bc9e4ed update kibana discardCorruptObjects version 2026-04-22 20:38:13 -05:00
reyesj2 72dbb69a1c fix searchnodes running elasticsearch/cluster state 2026-04-22 20:37:48 -05:00
reyesj2 339959d1c0 split up elasticfleet/enabled state 2026-04-22 20:30:40 -05:00
Mike Reeves d5c0ec4404 so-yaml_test: cover loadYaml error paths
Exercises the FileNotFoundError and generic-exception branches added to
loadYaml in the previous commit, restoring 100% coverage required by
the build.
2026-04-22 14:30:51 -04:00
Mike Reeves e616b4c120 so-telegraf-cred: make executable and harden error handling
so-telegraf-cred was committed with mode 644, causing
`so-telegraf-cred add "$MINION_ID"` in so-minion's add_telegraf_to_minion
to fail with "Permission denied" and log "Failed to provision postgres
telegraf cred for <minion>". Mark it executable.

Also bail early in seed_creds_file if mkdir/printf/chmod fail, and in
so-yaml.py loadYaml surface a clear stderr message with the filename
instead of an unhandled FileNotFoundError traceback.
2026-04-22 14:25:19 -04:00
Mike Reeves f240a99e22 so-telegraf-cred: thin bash wrapper around so-yaml.py
Swap the ~150-line Python implementation for a 48-line bash script that
delegates YAML mutation to so-yaml.py — the same helper so-minion and
soup already use. Same semantics: seed the creds pillar on first use,
idempotent add, silent remove.

SO minion ids are dot-free by construction (setup/so-functions:1884
strips everything after the first '.'), so using the raw id as the
so-yaml.py key path is safe.
2026-04-22 11:09:53 -04:00
Mike Reeves 614f32c5e0 Split postgres auth from per-minion telegraf creds
The old flow had two writers for each per-minion Telegraf password
(so-minion wrote the minion pillar; postgres.auth regenerated any
missing aggregate entries). They drifted on first-boot and there was
no trigger to create DB roles when a new minion joined.

Split responsibilities:

- pillar/postgres/auth.sls (manager-scoped) keeps only the so_postgres
  admin cred.
- pillar/telegraf/creds.sls (grid-wide) holds a {minion_id: {user,
  pass}} map, shadowed per-install by the local-pillar copy.
- salt/manager/tools/sbin/so-telegraf-cred is the single writer:
  flock, atomic YAML write, PyYAML safe_dump so passwords never
  round-trip through so-yaml.py's type coercion. Idempotent add, quiet
  remove.
- so-minion's add/remove hooks now shell out to so-telegraf-cred
  instead of editing pillar files directly.
- postgres.telegraf_users iterates the new pillar key and CREATE/ALTERs
  roles from it; telegraf.conf reads its own entry via grains.id.
- orch.deploy_newnode runs postgres.telegraf_users on the manager and
  refreshes the new minion's pillar before the new node highstates,
  so the DB role is in place the first time telegraf tries to connect.
- soup's post_to_3.1.0 backfills the creds pillar from accepted salt
  keys (idempotent) and runs postgres.telegraf_users once to reconcile
  the DB.
2026-04-22 10:55:15 -04:00
Josh Patterson cd6707a566 Merge pull request #15800 from Security-Onion-Solutions/feature/vm-raid-status
monitor raid for vms
2026-04-22 09:42:44 -04:00
Josh Patterson edd207a9d5 soup update socloud.conf 2026-04-22 09:20:53 -04:00
Mike Reeves 724d76965f soup: update postgres backfill comment to reflect reactor removal
The reactor path is gone; so-minion now owns add/delete for new
minions. The backfill itself is unchanged — postgres.auth's up_minions
fallback fills the aggregate, postgres.telegraf_users creates the
roles, and the bash loop fans to per-minion pillar files — so the
pre-feature upgrade story still works end-to-end. Just refresh the
comment so it isn't misleading.
2026-04-21 15:45:05 -04:00
Mike Reeves dbf4fb66a4 Clean up postgres telegraf cred on so-minion delete
Paired with the add path in add_telegraf_to_minion: when a minion is
removed, drop its entry from the aggregate postgres pillar and drop the
matching so_telegraf_<safe> role from the database. Without this, stale
entries and DB roles accumulate over time.

Makes rotate-password and compromise-recovery both a clean delete+add:

  so-minion -o=delete -m=<id>
  so-minion -o=add    -m=<id>

The first call drops the role and clears the aggregate pillar; the
second generates a brand-new password.

The cleanup is best-effort — if so-postgres isn't running or the DROP
ROLE fails (e.g., the role owns unexpected objects), we log a warning
and continue so the minion delete itself never gets blocked by postgres
state. Admins can mop up stray roles manually if that happens.
2026-04-21 15:43:01 -04:00
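The best-effort DROP looks roughly like this (<safe> is the sanitized minion id; the exact wrapper used may differ):
    docker exec so-postgres psql -U postgres -d so_telegraf \
      -c "DROP ROLE IF EXISTS so_telegraf_<safe>;" \
      || echo "warning: could not drop so_telegraf_<safe>; continuing minion delete"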
Mike Reeves 5f28e9b191 Move per-minion telegraf cred provisioning into so-minion
Simpler, race-free replacement for the reactor + orch + fan-out chain.

- salt/manager/tools/sbin/so-minion: expand add_telegraf_to_minion to
  generate a random 72-char password, reuse any existing password from
  the aggregate pillar, write postgres.telegraf.{user,pass} into the
  minion's own pillar file, and update the aggregate pillar so
  postgres.telegraf_users can CREATE ROLE on the next manager apply.
  Every create<ROLE> function already calls this hook, so add / addVM /
  setup dispatches are all covered identically and synchronously.
- salt/postgres/auth.sls: strip the fanout_targets loop and the
  postgres_telegraf_minion_pillar_<safe> cmd.run block — it's now
  redundant. The state still manages the so_postgres admin user and
  writes the aggregate pillar for postgres.telegraf_users to consume.
- salt/reactor/telegraf_user_sync.sls: deleted.
- salt/orch/telegraf_postgres_sync.sls: deleted.
- salt/salt/master.sls: drop the reactor_config_telegraf block that
  registered the reactor on /etc/salt/master.d/reactor_telegraf.conf.
- salt/orch/deploy_newnode.sls: drop the manager_fanout_postgres_telegraf
  step and the require: it added to the newnode highstate. Back to its
  original 3/dev shape.

No more ephemeral postgres_fanout_minion pillar, no more async salt/key
reactor, no more so-minion setupMinionFiles race: the pillar write
happens inline inside setupMinionFiles itself.
2026-04-21 15:34:15 -04:00
Jorge Reyes 01bd3b6e06 Merge pull request #15807 from Security-Onion-Solutions/reyesj2-es933
urlencode elasticsearch version
2026-04-21 14:11:04 -05:00
Mike Reeves 1abfd77351 Hide telegraf password from console and close so-minion race
Two fixes on the postgres telegraf fan-out path:

1. postgres.auth cmd.run leaked the password to the console because
   Salt always prints the Name: field and `show_changes: False` does
   not apply to cmd.run. Move the user and password into the `env:`
   attribute so the shell body still sees them via $PG_USER / $PG_PASS
   but Salt's state reporter never renders them.

2. so-minion's addMinion -> setupMinionFiles sequence removes the
   minion pillar file and rewrites it from scratch, which wipes the
   postgres.telegraf.* entries the reactor may have already written on
   salt-key accept. Add a postgres.auth fan-out step to
   orch.deploy_newnode (the orch so-minion kicks off after
   setupMinionFiles) and require it from the new minion's highstate.
   Idempotent via the existing unless: guard in postgres.auth.
2026-04-21 15:10:57 -04:00
reyesj2 06a555fafb urlencode elasticsearch version 2026-04-21 14:01:31 -05:00
Mike Reeves 81c0f2b464 so-yaml.py: tolerate missing ancestors in removeKey
replace calls removeKey before addKey, so running `so-yaml.py replace`
on a new dotted key whose parent doesn't exist — e.g., postgres.auth
fanning postgres.telegraf.user into a minion pillar file that has
never carried any postgres.* keys — crashed with
    KeyError: 'postgres'
from removeKey recursing into a missing parent dict.

Make removeKey a no-op when an intermediate key is absent so that:
  - `remove` has the natural "remove if exists" semantics, and
  - `replace` works for brand-new nested keys.
2026-04-21 14:43:10 -04:00
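The failing shape was roughly this (so-yaml.py argument order is assumed here and the value is a placeholder); after the fix, the absent postgres parent makes the remove step a no-op instead of a KeyError:
    # replace = removeKey then addKey on a dotted path whose ancestors may not exist yet
    so-yaml.py replace /opt/so/saltstack/local/pillar/minions/<id>.sls postgres.telegraf.user so_telegraf_<minion>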
Mike Reeves d5dc28e526 Fan postgres telegraf cred for manager on every auth run
The empty-pillar case produced a telegraf.conf with `user= password=`
which libpq misparses ("password=" gets consumed as the user value),
yielding `password authentication failed for user "password="` on
every manager without a prior fan-out (fresh install, not the salt-key
path the reactor handles).

Two fixes:

- salt/postgres/auth.sls: always fan for grains.id in addition to any
  postgres_fanout_minion from the reactor, so the manager's own pillar
  is populated on every postgres.auth run. The existing `unless` guard
  keeps re-runs idempotent.
- salt/telegraf/etc/telegraf.conf: gate the [[outputs.postgresql]]
  block on PG_USER and PG_PASS being non-empty. If a minion hasn't
  received its pillar yet the output block simply isn't rendered — the
  next highstate picks up the creds once the fan-out completes, and in
  the meantime telegraf keeps running the other outputs instead of
  erroring with a malformed connection string.
2026-04-21 14:40:19 -04:00
Jason Ertel 7411031e11 Merge pull request #15803 from Security-Onion-Solutions/jertel/wip
more error handling during image updates
2026-04-21 10:21:56 -04:00
Jason Ertel 247091766c more error handling during image updates 2026-04-21 10:18:05 -04:00
Josh Patterson 7f93110d68 Merge remote-tracking branch 'origin/3/dev' into feature/vm-raid-status 2026-04-21 10:10:38 -04:00
Mike Reeves 05f6503d61 Gate postgres telegraf fan-out on reactor-provided minion id
postgres.auth was running an `unless` shell check per up-minion on every
manager highstate, even when nothing had changed — N fork+python starts
of so-yaml.py add up on large grids. The work is only needed when a
specific minion's key is accepted.

- salt/postgres/auth.sls: fan out only when postgres_fanout_minion
  pillar is set (targets that single minion). Manager highstates with
  no pillar take a zero-N code path.
- salt/reactor/telegraf_user_sync.sls: re-pass the accepted minion id
  as postgres_fanout_minion to the orch.
- salt/orch/telegraf_postgres_sync.sls: forward the pillar to the
  salt.state invocation so the state render sees it.
- salt/manager/tools/sbin/soup: for the one-time 3.1.0 backfill, drop
  the per-minion state.apply and do an in-shell loop over the minion
  pillar files using so-yaml.py directly. Skips minions that already
  have postgres.telegraf.user set.
2026-04-21 10:05:08 -04:00
Mike Reeves a149ea7e8f Skip per-minion pillar fan-out when cred is already in place
Every postgres.auth run was rewriting every minion pillar file via
two so-yaml.py replace calls, even when nothing had changed. Passwords
are only generated on first encounter (see the `if key not in
telegraf_users` guard) and never rotate, so re-writing the same values
on every apply is wasted work and noisy state output.

Add an `unless:` check that compares the already-written
postgres.telegraf.user to the one we'd set. If they match, skip the
fan-out entirely. On first apply for a new minion the key isn't there,
so the replace runs; on subsequent applies it's a no-op.
2026-04-21 09:59:46 -04:00
Mike Reeves bb71e44614 Write per-minion telegraf creds to each minion's own pillar file
pillar/top.sls only distributes postgres.auth to manager-class roles,
so sensors / heavynodes / searchnodes / receivers / fleet / idh /
hypervisor / desktop minions never received the postgres telegraf
password they need to write metrics. Broadcasting the aggregate
postgres.auth pillar to every role would leak the so_postgres admin
password and every other minion's cred.

Fan out per-minion credentials into each minion's own pillar file at
/opt/so/saltstack/local/pillar/minions/<id>.sls. That file is already
distributed by pillar/top.sls exclusively to the matching minion via
`- minions.{{ grains.id }}`, so each minion sees only its own
postgres.telegraf.{user,pass} and nothing else.

- salt/postgres/auth.sls: after writing the manager-scoped aggregate
  pillar, fan the per-minion creds out via so-yaml.py replace for every
  up-minion. Creates the minion pillar file if missing. Requires
  postgres_auth_pillar so the manager pillar lands first.
- salt/telegraf/etc/telegraf.conf: consume postgres:telegraf:user and
  postgres:telegraf:pass directly from the minion's own pillar instead
  of walking postgres:auth:users which isn't visible off the manager.
2026-04-21 09:57:35 -04:00
Mike Reeves 84197fb33b Move postgres backup script and cron to the postgres states
The so-postgres-backup script and its cron were living under
salt/backup/config_backup.sls, which meant the backup script and cron
were deployed independently of whether postgres was enabled/disabled.

- Relocate salt/backup/tools/sbin/so-postgres-backup to
  salt/postgres/tools/sbin/so-postgres-backup so the existing
  postgres_sbin file.recurse in postgres/config.sls picks it up with
  everything else — no separate file.managed needed.
- Remove postgres_backup_script and so_postgres_backup from
  salt/backup/config_backup.sls.
- Add cron.present for so_postgres_backup to salt/postgres/enabled.sls
  and the matching cron.absent to salt/postgres/disabled.sls so the
  cron follows the container's lifecycle.
2026-04-21 09:42:41 -04:00
Mike Reeves 89a6e7c0dd Tidy config.sls makedirs and postgres helpLinks
- config.sls: postgresconfdir creates /opt/so/conf/postgres, so the
  two subdirectories under it (postgressecretsdir, postgresinitdir)
  don't need their own makedirs — require the parent instead.
- soc_postgres.yaml: helpLink for every annotated key now points to
  'postgres' instead of the carried-over 'influxdb' slug.
2026-04-21 09:39:58 -04:00
Mike Reeves a902f667ba Target manager by role grain in telegraf_postgres_sync orch
The previous MANAGER resolution used pillar.get('setup:manager') with a
fallback to grains.get('master'). Neither works from the reactor:
setup:manager is only populated by the setup workflow (not by reactor
runs), and grains.master returns the minion's master-hostname setting,
not a targetable minion id.

Match the pattern used by orch/delete_hypervisor.sls: compound-target
whichever minion is the manager via role grain.
2026-04-21 09:37:35 -04:00
Mike Reeves f72c30abd0 Have postgres.telegraf_users include postgres.enabled
postgres_wait_ready requires docker_container: so-postgres, which is
declared in postgres.enabled. Running postgres.telegraf_users on its own
— as the reactor orch and the soup post-upgrade step both do — errored
because Salt couldn't resolve the require.

Include postgres.enabled from postgres.telegraf_users so the container
state is always in the render. postgres.enabled already includes
telegraf_users; Salt de-duplicates the circular include and the included
states are all idempotent, so repeated application is a no-op.
2026-04-21 09:35:59 -04:00
Mike Reeves 37e9257698 Change so-postgres final_octet to 47 2026-04-21 09:33:47 -04:00
Mike Reeves 72105f1f2f Drop telegraf push from new-minion orch; highstate covers it
New minions run highstate as part of onboarding, which already applies
the telegraf state with the fresh pillar entry we just wrote. Pushing
telegraf a second time from the reactor is redundant.

- Remove the MINION-scoped salt.state block from the orch; keep only
  the manager-side postgres.auth + postgres.telegraf_users provisioning.
- Stop passing minion_id as pillar in the reactor; the orch doesn't
  reference it anymore.
2026-04-21 09:31:45 -04:00
Mike Reeves ee89b78751 Fire telegraf user sync on salt/key accept, not salt/auth
salt/auth fires on every minion authentication — including every minion
restart and every master restart — so the reactor was re-running the
postgres.auth + postgres.telegraf_users + telegraf orchestration for
every already-accepted minion on every reconnect. The underlying states
are idempotent, so this was wasted work and log noise, not a correctness
issue.

Switch the subscription to salt/key, which fires only when the master
actually changes a key's state (accept / reject / delete). Match the
pattern used by salt/reactor/check_hypervisor.sls (registered in
salt/salt/cloud/reactor_config_hypervisor.sls) and add the result==True
guard so half-failed key operations don't trigger the orchestration.
2026-04-20 19:54:06 -04:00
Jason Ertel 33ef138866 Merge pull request #15797 from Security-Onion-Solutions/jertel/wip
fix template annotation
2026-04-20 17:14:53 -04:00
Jason Ertel 71da27dc8e fix template annotation 2026-04-20 17:02:25 -04:00
Mike Reeves 80bf07ffd8 Flesh out soc_postgres.yaml annotations
Add Configuration-UI annotations for every postgres pillar key defined
in defaults.yaml, not just telegraf.retention_days:

- postgres.enabled          — readonly; admin-visible but toggled via state
- postgres.telegraf.retention_days — drop advanced so user-tunable knobs
  surface in the default view
- postgres.config.max_connections, shared_buffers, log_min_messages —
  user-tunable performance/verbosity knobs, not advanced
- postgres.config.listen_addresses, port, ssl, ssl_cert_file, ssl_key_file,
  ssl_ca_file, hba_file, log_destination, logging_collector,
  shared_preload_libraries, cron.database_name — infra/Salt-managed,
  marked advanced so they're visible but out of the way

No defaults.yaml change; value-side stays the same.
2026-04-20 16:36:37 -04:00
Mike Reeves b69e50542a Use TELEGRAFMERGED for telegraf.output and de-jinja pg_hba.conf
- firewall/map.jinja and postgres/telegraf_users.sls now pull the
  telegraf output selector through TELEGRAFMERGED so the defaults.yaml
  value (BOTH) is the source of truth and pillar overrides merge in
  cleanly. pillar.get with a hardcoded fallback was brittle and would
  disagree with defaults.yaml if the two ever diverged.
- Rename salt/postgres/files/pg_hba.conf.jinja to pg_hba.conf and drop
  template: jinja from config.sls — the file has no jinja besides the
  comment header.
2026-04-20 16:06:01 -04:00
Mike Reeves 3ecd19d085 Move telegraf_output from global pillar to telegraf pillar
The Telegraf backend selector lived at global.telegraf_output but it is
a Telegraf-scoped setting, not a cross-cutting grid global. Move both
the value and the UI annotation under the telegraf pillar so it shows
up alongside the other Telegraf tuning knobs in the Configuration UI.

- salt/telegraf/defaults.yaml:    add telegraf.output: BOTH
- salt/telegraf/soc_telegraf.yaml: add telegraf.output annotation
- salt/global/defaults.yaml:      remove global.telegraf_output
- salt/global/soc_global.yaml:    remove global.telegraf_output annotation
- salt/vars/globals.map.jinja:    drop telegraf_output from GLOBALS
- salt/firewall/map.jinja:        read via pillar.get('telegraf:output')
- salt/postgres/telegraf_users.sls: read via pillar.get('telegraf:output')
- salt/telegraf/etc/telegraf.conf: read via TELEGRAFMERGED.output
- salt/postgres/tools/sbin/so-stats-show: update user-facing docs

No behavioral change — default stays BOTH.
2026-04-20 16:03:02 -04:00
Mike Reeves b6a3d1889c Fix soup state.apply args for postgres provisioning
state.apply takes a single mods argument; space-separated names are not
a list, so `state.apply postgres.auth postgres.telegraf_users` was only
applying postgres.auth and silently dropping the telegraf_users state.

Use comma-separated mods and add queue=True to match the rest of soup.
2026-04-20 14:40:32 -04:00
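The fix itself is one line (shown here with salt-call; soup's exact invocation may differ):
    # before: only postgres.auth is applied; the second name is silently dropped
    salt-call state.apply postgres.auth postgres.telegraf_users
    # after: comma-separated mods apply both, queued like the rest of soup
    salt-call state.apply postgres.auth,postgres.telegraf_users queue=True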
Mike Reeves 1cb34b089c Restore 3/dev soup and add postgres users to post_to_3.1.0
feature/postgres had rewritten the 3.1.0 upgrade block, dropping the
elastic upgrade work 3/dev landed for 9.0.8→9.3.3: elasticsearch_backup_index_templates,
the component template state cleanup, and the /usr/sbin/so-kibana-space-defaults
post-upgrade call. It also carried an older ES upgrade mapping
(8.18.8→9.0.8) that was superseded on 3/dev (9.0.8→9.3.3 for
3.0.0-20260331), and a handful of latent shell-quoting regressions in
verify_es_version_compatibility and the intermediate-upgrade helpers.

Adopt the 3/dev soup verbatim and only add the new Telegraf Postgres
provisioning to post_to_3.1.0 on top of so-kibana-space-defaults.
2026-04-20 14:38:55 -04:00
Mike Reeves 1537ba5031 Merge remote-tracking branch 'origin/3/dev' into feature/postgres 2026-04-20 14:32:05 -04:00
Mike Reeves 8225d41661 Harden postgres secrets, TLS enforcement, and admin tooling
- Deliver postgres super and app passwords via mounted 0600 secret files
  (POSTGRES_PASSWORD_FILE, SO_POSTGRES_PASS_FILE) instead of plaintext env
  vars visible in docker inspect output
- Mount a managed pg_hba.conf that only allows local trust and hostssl
  scram-sha-256 so TCP clients cannot negotiate cleartext sessions
- Restrict postgres.key to 0400 and ensure owner/group 939
- Set umask 0077 on so-postgres-backup output
- Validate host values in so-stats-show against [A-Za-z0-9._-] before SQL
  interpolation so a compromised minion cannot inject SQL via a tag value
- Coerce postgres:telegraf:retention_days to int before rendering into SQL
- Escape single quotes when rendering pillar values into postgresql.conf
- Own postgres tooling in /usr/sbin as root:root so a container escape
  cannot rewrite admin scripts
- Gate ES migration TLS verification on esVerifyCert (default false,
  matching the elastic module's existing pattern)
2026-04-20 12:36:17 -04:00
Josh Patterson ee437265fc monitor raid for vms 2026-04-20 12:00:02 -04:00
Mike Reeves 3f46caaf02 Revoke PUBLIC CONNECT on securityonion database
Per-minion telegraf roles inherit CONNECT via PUBLIC by default and
could open sessions to the SOC database (though they have no readable
grants inside). Close the soft edge by revoking PUBLIC's CONNECT and
re-granting it to so_postgres only.
2026-04-17 19:10:07 -04:00
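In psql terms the change is (sketch, run as the postgres superuser):
    docker exec so-postgres psql -U postgres -c "REVOKE CONNECT ON DATABASE securityonion FROM PUBLIC;"
    docker exec so-postgres psql -U postgres -c "GRANT CONNECT ON DATABASE securityonion TO so_postgres;"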
Mike Reeves f3181b204a Remove so-telegraf-trim and update retention description
pg_partman drops old partitions hourly; row-DELETE retention is
obsolete and a confusing emergency fallback on partitioned tables.
2026-04-17 19:06:16 -04:00
Mike Reeves dd39db4584 Drop so_telegraf_trim cron.absent tombstone
feature/postgres never shipped the original cron.present, so this
cleanup state is a no-op on every fresh install. The script itself
stays on disk for emergency use.
2026-04-17 18:59:39 -04:00
Mike Reeves 759880a800 Wait for TCP-ready postgres, not the init-phase Unix socket
docker-entrypoint.sh runs the init-scripts phase with listen_addresses=''
(Unix socket only). The old pg_isready check passed there and then raced
the docker_temp_server_stop shutdown before the final postgres started.
pg_isready -h 127.0.0.1 only returns success once the real CMD binds
TCP, so downstream psql execs never land during the shutdown window.
2026-04-17 16:43:41 -04:00
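The gate is effectively this loop (sketch):
    # the Unix-socket check passes during the init phase and races docker_temp_server_stop;
    # -h 127.0.0.1 only succeeds once the final postgres CMD binds TCP
    until docker exec so-postgres pg_isready -h 127.0.0.1 -p 5432 -U postgres >/dev/null 2>&1; do
      sleep 2
    done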
Mike Reeves 31383bd9d0 Make Telegraf Postgres templates idempotent
Use CREATE TABLE IF NOT EXISTS and a WHERE-guarded create_parent() so
a Telegraf restart can re-run the templates safely after manual DB
surgery. Add an explicit tag_table_create_templates mirroring the
plugin default with IF NOT EXISTS for the same reason.
2026-04-17 15:43:50 -04:00
Mike Reeves 21076af01e Grant so_telegraf CREATE on partman schema
pg_partman 5.x's create_partition() creates a per-parent template
table inside the partman schema at runtime, which requires CREATE on
that schema. Also extend ALTER DEFAULT PRIVILEGES so the runtime-
created template tables are accessible to so_telegraf.
2026-04-17 15:34:19 -04:00
Mike Reeves f11e9da83a Mark time column NOT NULL before partman.create_parent
pg_partman 5.x requires the control column to be NOT NULL; Telegraf's
generated columns are nullable by default.
2026-04-17 15:27:06 -04:00
Mike Reeves 0fddcd8fe7 Pass unquoted schema.name to partman.create_parent
pg_partman 5.x splits p_parent_table on '.' and looks up the parts as
raw identifiers, so the literal must be 'schema.name' rather than the
double-quoted form quoteLiteral emits for .table.
2026-04-17 15:22:57 -04:00
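For reference, a registration call of the required shape ('telegraf.cpu' is an example metric table; interval and premake values per the pg_partman adoption commit below):
    # note the unquoted schema.name literal, not the double-quoted form quoteLiteral emits
    docker exec so-postgres psql -U postgres -d so_telegraf -c \
      "SELECT partman.create_parent(p_parent_table := 'telegraf.cpu', p_control := 'time', p_type := 'range', p_interval := '1 day', p_premake := 3);"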
Mike Reeves 927eba566c Grant so_telegraf access to partman schema
Telegraf calls partman.create_parent() on first write of each metric,
which needs USAGE on the partman schema, EXECUTE on its functions and
procedures, and DML on partman.part_config.
2026-04-17 15:13:08 -04:00
Mike Reeves af9330a9dd Escape Go-template placeholders from Jinja in telegraf.conf 2026-04-17 15:04:37 -04:00
Mike Reeves b3fbd5c7a4 Use Go-template placeholders and shell-guarded CREATE DATABASE
- Telegraf's outputs.postgresql plugin uses Go text/template syntax,
  not uppercase tokens. The {TABLE}/{COLUMNS}/{TABLELITERAL} strings
  were passed through to Postgres literally, producing syntax errors
  on every metric's first write. Switch to {{ .table }}, {{ .columns }},
  and {{ .table|quoteLiteral }} so partitioned parents and the partman
  create_parent() call succeed.
- Replace the \gexec "CREATE DATABASE ... WHERE NOT EXISTS" idiom in
  both init-users.sh and telegraf_users.sls with an explicit shell
  conditional. The prior idiom occasionally fired CREATE DATABASE even
  when so_telegraf already existed, producing duplicate-key failures.
2026-04-17 14:55:13 -04:00
Mike Reeves 5228668be0 Fix Telegraf→Postgres table creation and state.apply race
- Telegraf's partman template passed p_type:='native', which pg_partman
  5.x (the version shipped by postgresql-17-partman on Debian) rejects.
  Switched to 'range' so partman.create_parent() actually creates
  partitions and Telegraf's INSERTs succeed.
- Added a postgres_wait_ready gate in telegraf_users.sls so psql execs
  don't race the init-time restart that docker-entrypoint.sh performs.
- so-verify now ignores the literal "-v ON_ERROR_STOP=1" token in the
  setup log. Dropped the matching entry from so-log-check, which scans
  container stdout where that token never appears.
2026-04-17 13:00:12 -04:00
Mike Reeves 7d07f3c8fe Create so_telegraf DB from Salt and pin pg_partman schema
init-users.sh only runs on a fresh data dir, so upgrades onto an
existing /nsm/postgres volume never got so_telegraf. Pinning partman's
schema also makes partman.part_config reliably resolvable.
2026-04-17 10:51:08 -04:00
Mike Reeves d9a9029ce5 Adopt pg_partman + pg_cron for Telegraf metric tables
Every telegraf.* metric table is now a daily time-range partitioned
parent managed by pg_partman. Retention drops old partitions instead
of the row-by-row DELETE that so-telegraf-trim used to run nightly,
and dashboards will benefit from partition pruning at query time.

- Load pg_cron at server start via shared_preload_libraries and point
  cron.database_name at so_telegraf so job metadata lives alongside
  the metrics
- Telegraf create_templates override makes every new metric table a
  PARTITION BY RANGE (time) parent registered with partman.create_parent
  in one transaction (1 day interval, 3 premade)
- postgres_telegraf_group_role now also creates pg_partman and pg_cron
  extensions and schedules hourly partman.run_maintenance_proc
- New retention reconcile state updates partman.part_config.retention
  from postgres.telegraf.retention_days on every apply
- so_telegraf_trim cron is now unconditionally absent; script stays on
  disk as a manual fallback
2026-04-16 17:27:15 -04:00
Mike Reeves 9fe53d9ccc Use JSONB for Telegraf fields/tags to avoid 1600-column limit
High-cardinality inputs (docker, procstat, kafka) trigger ALTER TABLE
ADD COLUMN on every new field name, and with all minions writing into
a shared 'telegraf' schema the metric tables hit Postgres's 1600-column
per-table ceiling quickly. Setting fields_as_jsonb and tags_as_jsonb on
the postgresql output keeps metric tables fixed at (time, tag_id,
fields jsonb) and tag tables at (tag_id, tags jsonb).

- so-stats-show rewritten to use JSONB accessors
  ((fields->>'x')::numeric, tags->>'host', etc.) and cast memory/disk
  sizes to bigint so pg_size_pretty works
- Drop regex/regexFailureMessage from telegraf_output SOC UI entry to
  match the convention upstream used when removing them from
  mdengine/pcapengine/pipeline; options: list drives validation
2026-04-16 17:02:21 -04:00
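With JSONB columns, queries use accessors along these lines ('cpu' and 'usage_idle' are example metric/field names):
    docker exec so-postgres psql -U postgres -d so_telegraf -c \
      "SELECT t.tags->>'host' AS host, (m.fields->>'usage_idle')::numeric AS idle
         FROM telegraf.cpu m JOIN telegraf.cpu_tag t USING (tag_id)
        ORDER BY m.time DESC LIMIT 5;"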
Mike Reeves f7b80f5931 Merge branch '3/dev' into feature/postgres 2026-04-16 16:37:02 -04:00
Mike Reeves f11d315fea Fix soup 2026-04-16 16:35:24 -04:00
Mike Reeves 2013bf9e30 Fix soup 2026-04-16 16:20:25 -04:00
Mike Reeves a2ffb92b8d Fix soup 2026-04-16 16:19:53 -04:00
Mike Reeves 470b3bd4da Comingle Telegraf metrics into shared schema
Per-minion schemas cause table count to explode (N minions * M metrics)
and the per-minion revocation story isn't worth it when retention is
short. Move all minions to a shared 'telegraf' schema while keeping
per-minion login credentials for audit.

- New so_telegraf NOLOGIN group role owns the telegraf schema; each
  per-minion role is a member and inherits insert/select via role
  inheritance
- Telegraf connection string uses options='-c role=so_telegraf' so
  tables auto-created on first write belong to the group role
- so-telegraf-trim walks the flat telegraf.* table set instead of
  per-minion schemas
- so-stats-show filters by host tag; CLI arg is now the hostname as
  tagged by Telegraf rather than a sanitized schema suffix
- Also renames so-show-stats -> so-stats-show
2026-04-16 15:40:54 -04:00
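Each per-minion login still lands in the shared schema because the connection sets the group role up front; an equivalent connection, sketched with placeholders:
    # tables auto-created on first write are owned by so_telegraf, not the per-minion role;
    # current_user should report so_telegraf once the role GUC is applied
    psql "host=<manager-ip> port=5432 dbname=so_telegraf user=so_telegraf_<minion> sslmode=require options='-c role=so_telegraf'" \
      -c "SELECT current_user;"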
Mike Reeves c124186989 so-log-check: exclude psql ON_ERROR_STOP flag
The psql invocation flag '-v ON_ERROR_STOP=1' used by the so-postgres
init script gets flagged by so-log-check because the token 'ERROR'
matches its error regex. Add to the exclusion list.
2026-04-15 19:45:42 -04:00
Mike Reeves d24808ff98 Fix so-show-stats tag column resolution
Telegraf's postgresql output stores tag values either as individual
columns on <metric>_tag or as a single JSONB 'tags' column, depending
on plugin version. Introspect information_schema.columns and build the
right accessor per tag instead of assuming one layout.
2026-04-15 19:28:10 -04:00
Mike Reeves cefbe01333 Add telegraf_output selector for InfluxDB/Postgres dual-write
Introduces global.telegraf_output (INFLUXDB|POSTGRES|BOTH, default BOTH)
so Telegraf can write metrics to Postgres alongside or instead of
InfluxDB. Each minion authenticates with its own so_telegraf_<minion>
role and writes to a matching schema inside a shared so_telegraf
database, keeping blast radius per-credential to that minion's data.

- Per-minion credentials auto-generated and persisted in postgres/auth.sls
- postgres/telegraf_users.sls reconciles roles/schemas on every apply
- Firewall opens 5432 only to minion hostgroups when Postgres output is active
- Reactor on salt/auth + orch/telegraf_postgres_sync.sls provision new
  minions automatically on key accept
- soup post_to_3.1.0 backfills users for existing minions on upgrade
- so-show-stats prints latest CPU/mem/disk/load per minion for sanity checks
- so-telegraf-trim + nightly cron prune rows older than
  postgres.telegraf.retention_days (default 14)
2026-04-15 14:32:10 -04:00
Mike Reeves 9ccd0acb4f Add ES credentials to postgres module config for migration
Postgres module now queries Elasticsearch directly via HTTP
for the chat migration (bypasses RBAC that needs user context).
Pass esHostUrl, esUsername, esPassword alongside postgres creds.
2026-04-10 11:41:33 -04:00
Mike Reeves 1ffdcab3be Add postgres adminPassword to SOC module config
Injects the postgres superuser password from secrets pillar so
SOC can run schema migrations as admin before switching to the
app user for normal operations.
2026-04-09 22:21:35 -04:00
Mike Reeves da1045e052 Fix init-users.sh password escaping for special characters
Use format() with %L for SQL literal escaping instead of raw
string interpolation. Also ALTER ROLE if user already exists
to keep password in sync with pillar.
2026-04-09 21:52:20 -04:00
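The escaping pattern amounts to something like this (a sketch of the %L idiom, not the exact init-users.sh code; the password variable name is a placeholder):
    # pass the password as a psql variable so the shell never interpolates it into SQL;
    # format(%L) emits a safely quoted literal and \gexec runs the generated ALTER
    echo "SELECT format('ALTER ROLE so_postgres WITH PASSWORD %L', :'pw') \gexec" |
      psql -U postgres -v ON_ERROR_STOP=1 -v pw="$POSTGRES_APP_PASS"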
Mike Reeves 55be1f1119 Only add postgres module config on manager nodes
Removed postgres from soc/defaults.yaml (shared by all nodes)
and moved it entirely into defaults.map.jinja, which only injects
the config when postgres auth pillar exists (manager-type nodes).
Sensors and other non-manager nodes will not have a postgres module
section in their sensoroni.json, so sensoroni won't try to connect.
2026-04-09 21:09:43 -04:00
Mike Reeves c1b1452bd9 Use manager IP for postgres hostUrl instead of container hostname
SOC connects to postgres via the host network, not the Docker
bridge network, so it needs the manager's IP address rather than
the container hostname.
2026-04-09 19:34:14 -04:00
Mike Reeves 2dfa83dd7d Wire postgres credentials into SOC module config
- Create vars/postgres.map.jinja for postgres auth globals
- Add POSTGRES_GLOBALS to all manager-type role vars
  (manager, eval, standalone, managersearch, import)
- Add postgres module config to soc/defaults.yaml
- Inject so_postgres credentials from auth pillar into
  soc/defaults.map.jinja (conditional on auth pillar existing)
2026-04-09 14:09:32 -04:00
Mike Reeves b87af8ea3d Add postgres.auth to allowed_states
Matches the elasticsearch.auth pattern where auth states use
the full sls path check and are explicitly listed.
2026-04-09 12:39:46 -04:00
Mike Reeves 46e38d39bb Enable postgres by default
Safe because postgres states are only applied to manager-type
nodes via top.sls and allowed_states.map.jinja.
2026-04-09 12:23:47 -04:00
Mike Reeves 61bdfb1a4b Add daily PostgreSQL database backup
- pg_dumpall piped through gzip, stored in /nsm/backup/
- Runs daily at 00:05 (4 minutes after config backup)
- 7-day retention matching existing config backup policy
- Skips gracefully if container isn't running
2026-04-09 10:29:10 -04:00
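The job amounts to roughly this (sketch; the output filename pattern is illustrative):
    # skip gracefully if the container isn't running
    docker ps --format '{{.Names}}' | grep -qx so-postgres || exit 0
    docker exec so-postgres pg_dumpall -U postgres | gzip > "/nsm/backup/so-postgres-$(date +%Y%m%d).sql.gz"
    # 7-day retention, matching the config backup policy
    find /nsm/backup -name 'so-postgres-*.sql.gz' -mtime +7 -delete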
Mike Reeves 358a2e6d3f Add so-postgres to container image pull list
Add to both the import and default manager container lists so
the image gets downloaded during installation.
2026-04-09 10:02:41 -04:00
Mike Reeves 762e73faf5 Add so-postgres host management scripts
- so-postgres-manage: wraps docker exec for psql operations
  (sql, sqlfile, shell, dblist, userlist)
- so-postgres-start/stop/restart: standard container lifecycle
- Scripts installed to /usr/sbin via file.recurse in config.sls
2026-04-09 09:55:42 -04:00
Mike Reeves 868cd11874 Add so-postgres Salt states and integration wiring
Phase 1 of the PostgreSQL central data platform:
- Salt states: init, enabled, disabled, config, ssl, auth, sostatus
- TLS via SO CA-signed certs with postgresql.conf template
- Two-tier auth: postgres superuser + so_postgres application user
- Firewall restricts port 5432 to manager-only (HA-ready)
- Wired into top.sls, pillar/top.sls, allowed_states, firewall
  containers map, docker defaults, CA signing policies, and setup
  scripts for all manager-type roles
2026-04-08 10:58:52 -04:00
Mike Reeves 664f3fd18a Fix soup 2026-04-01 14:47:05 -04:00
67 changed files with 2030 additions and 220 deletions
+12
@@ -0,0 +1,12 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
# Per-minion Telegraf Postgres credentials. so-telegraf-cred on the manager is
# the single writer; it mutates /opt/so/saltstack/local/pillar/telegraf/creds.sls
# under flock. Pillar_roots order (local before default) means the populated
# copy shadows this default on any real grid; this file exists so the pillar
# key is always defined on fresh installs and when no minions have creds yet.
telegraf:
postgres_creds: {}
+21
@@ -17,6 +17,7 @@ base:
- sensoroni.adv_sensoroni
- telegraf.soc_telegraf
- telegraf.adv_telegraf
- telegraf.creds
- versionlock.soc_versionlock
- versionlock.adv_versionlock
- soc.license
@@ -38,6 +39,9 @@ base:
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/elasticsearch/auth.sls') %}
- elasticsearch.auth
{% endif %}
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/postgres/auth.sls') %}
- postgres.auth
{% endif %}
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/kibana/secrets.sls') %}
- kibana.secrets
{% endif %}
@@ -60,6 +64,8 @@ base:
- redis.adv_redis
- influxdb.soc_influxdb
- influxdb.adv_influxdb
- postgres.soc_postgres
- postgres.adv_postgres
- elasticsearch.nodes
- elasticsearch.soc_elasticsearch
- elasticsearch.adv_elasticsearch
@@ -100,6 +106,9 @@ base:
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/elasticsearch/auth.sls') %}
- elasticsearch.auth
{% endif %}
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/postgres/auth.sls') %}
- postgres.auth
{% endif %}
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/kibana/secrets.sls') %}
- kibana.secrets
{% endif %}
@@ -125,6 +134,8 @@ base:
- redis.adv_redis
- influxdb.soc_influxdb
- influxdb.adv_influxdb
- postgres.soc_postgres
- postgres.adv_postgres
- backup.soc_backup
- backup.adv_backup
- zeek.soc_zeek
@@ -144,6 +155,9 @@ base:
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/elasticsearch/auth.sls') %}
- elasticsearch.auth
{% endif %}
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/postgres/auth.sls') %}
- postgres.auth
{% endif %}
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/kibana/secrets.sls') %}
- kibana.secrets
{% endif %}
@@ -158,6 +172,8 @@ base:
- redis.adv_redis
- influxdb.soc_influxdb
- influxdb.adv_influxdb
- postgres.soc_postgres
- postgres.adv_postgres
- elasticsearch.nodes
- elasticsearch.soc_elasticsearch
- elasticsearch.adv_elasticsearch
@@ -257,6 +273,9 @@ base:
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/elasticsearch/auth.sls') %}
- elasticsearch.auth
{% endif %}
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/postgres/auth.sls') %}
- postgres.auth
{% endif %}
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/kibana/secrets.sls') %}
- kibana.secrets
{% endif %}
@@ -282,6 +301,8 @@ base:
- redis.adv_redis
- influxdb.soc_influxdb
- influxdb.adv_influxdb
- postgres.soc_postgres
- postgres.adv_postgres
- zeek.soc_zeek
- zeek.adv_zeek
- bpf.soc_bpf
+5 -1
@@ -29,10 +29,14 @@
'manager',
'nginx',
'influxdb',
'postgres',
'postgres.auth',
'soc',
'kratos',
'hydra',
'elasticfleet',
'elasticfleet.manager',
'elasticsearch.cluster',
'elastic-fleet-package-registry',
'utility'
] %}
@@ -77,7 +81,7 @@
),
'so-heavynode': (
sensor_states +
['elasticagent', 'elasticsearch', 'logstash', 'redis', 'nginx']
['elasticagent', 'elasticsearch', 'elasticsearch.cluster', 'logstash', 'redis', 'nginx']
),
'so-idh': (
['idh']
+1
@@ -32,3 +32,4 @@ so_config_backup:
- daymonth: '*'
- month: '*'
- dayweek: '*'
+14
@@ -54,6 +54,20 @@ x509_signing_policies:
- extendedKeyUsage: serverAuth
- days_valid: 820
- copypath: /etc/pki/issued_certs/
postgres:
- minions: '*'
- signing_private_key: /etc/pki/ca.key
- signing_cert: /etc/pki/ca.crt
- C: US
- ST: Utah
- L: Salt Lake City
- basicConstraints: "critical CA:false"
- keyUsage: "critical keyEncipherment"
- subjectKeyIdentifier: hash
- authorityKeyIdentifier: keyid,issuer:always
- extendedKeyUsage: serverAuth
- days_valid: 820
- copypath: /etc/pki/issued_certs/
elasticfleet:
- minions: '*'
- signing_private_key: /etc/pki/ca.key
+25 -4
@@ -31,6 +31,7 @@ container_list() {
"so-hydra"
"so-nginx"
"so-pcaptools"
"so-postgres"
"so-soc"
"so-suricata"
"so-telegraf"
@@ -55,6 +56,7 @@ container_list() {
"so-logstash"
"so-nginx"
"so-pcaptools"
"so-postgres"
"so-redis"
"so-soc"
"so-strelka-backend"
@@ -162,8 +164,8 @@ update_docker_containers() {
# Pull down the trusted docker image
run_check_net_err \
"docker pull $CONTAINER_REGISTRY/$IMAGEREPO/$image" \
"Could not pull $image, please ensure connectivity to $CONTAINER_REGISTRY" >> "$LOG_FILE" 2>&1
"Could not pull $image, please ensure connectivity to $CONTAINER_REGISTRY" >> "$LOG_FILE" 2>&1
# Get signature
run_check_net_err \
"curl --retry 5 --retry-delay 60 -A '$CURLTYPE/$CURRENTVERSION/$OS/$(uname -r)' $sig_url --output $SIGNPATH/$image.sig" \
@@ -186,8 +188,27 @@ update_docker_containers() {
if [ -z "$HOSTNAME" ]; then
HOSTNAME=$(hostname)
fi
docker tag $CONTAINER_REGISTRY/$IMAGEREPO/$image $HOSTNAME:5000/$IMAGEREPO/$image >> "$LOG_FILE" 2>&1
docker push $HOSTNAME:5000/$IMAGEREPO/$image >> "$LOG_FILE" 2>&1
docker tag $CONTAINER_REGISTRY/$IMAGEREPO/$image $HOSTNAME:5000/$IMAGEREPO/$image >> "$LOG_FILE" 2>&1 || {
echo "Unable to tag $image" >> "$LOG_FILE" 2>&1
exit 1
}
# Push to the embedded registry via a registry-to-registry copy. Avoids
# `docker push`, which on Docker 29.x with the containerd image store
# represents freshly-pulled images as an index whose layer content
# isn't reachable through the push path. The local `docker tag` above
# is preserved so so-image-pull's `:5000` existence check still works.
# Pin to the digest already gpg-verified above so we copy exactly the
# bytes we approved.
local VERIFIED_REF
VERIFIED_REF=$(echo "$DOCKERINSPECT" | jq -r ".[0].RepoDigests[] | select(. | contains(\"$CONTAINER_REGISTRY\"))" | head -n 1)
if [ -z "$VERIFIED_REF" ] || [ "$VERIFIED_REF" = "null" ]; then
echo "Unable to determine verified digest for $image" >> "$LOG_FILE" 2>&1
exit 1
fi
docker buildx imagetools create --tag $HOSTNAME:5000/$IMAGEREPO/$image "$VERIFIED_REF" >> "$LOG_FILE" 2>&1 || {
echo "Unable to copy $image to embedded registry" >> "$LOG_FILE" 2>&1
exit 1
}
fi
else
echo "There is a problem downloading the $image image. Details: " >> "$LOG_FILE" 2>&1
+1 -1
@@ -227,7 +227,7 @@ if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|from NIC checksum offloading" # zeek reporter.log
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|marked for removal" # docker container getting recycled
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|tcp 127.0.0.1:6791: bind: address already in use" # so-elastic-fleet agent restarting. Seen starting w/ 8.18.8 https://github.com/elastic/kibana/issues/201459
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|TransformTask\] \[logs-(tychon|aws_billing|microsoft_defender_endpoint).*user so_kibana lacks the required permissions \[logs-\1" # Known issue with 3 integrations using kibana_system role vs creating unique api creds with proper permissions.
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|TransformTask\] \[logs-(tychon|aws_billing|microsoft_defender_endpoint|armis|o365_metrics|microsoft_sentinel|snyk|cyera|island_browser).*user so_kibana lacks the required permissions \[(logs|metrics)-\1" # Known issue with integrations starting transform jobs that are explicitly not allowed to start as a system user. This error should not be seen on fresh ES 9.3.3 installs or after SO 3.1.0 with soups addition of check_transform_health_and_reauthorize()
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|manifest unknown" # appears in so-dockerregistry log for so-tcpreplay following docker upgrade to 29.2.1-1
fi
+9 -2
@@ -9,7 +9,7 @@
. /usr/sbin/so-common
software_raid=("SOSMN" "SOSMN-DE02" "SOSSNNV" "SOSSNNV-DE02" "SOS10k-DE02" "SOS10KNV" "SOS10KNV-DE02" "SOS10KNV-DE02" "SOS2000-DE02" "SOS-GOFAST-LT-DE02" "SOS-GOFAST-MD-DE02" "SOS-GOFAST-HV-DE02")
software_raid=("SOSMN" "SOSMN-DE02" "SOSSNNV" "SOSSNNV-DE02" "SOS10k-DE02" "SOS10KNV" "SOS10KNV-DE02" "SOS10KNV-DE02" "SOS2000-DE02" "SOS-GOFAST-LT-DE02" "SOS-GOFAST-MD-DE02" "SOS-GOFAST-HV-DE02" "HVGUEST")
hardware_raid=("SOS1000" "SOS1000F" "SOSSN7200" "SOS5000" "SOS4000")
{%- if salt['grains.get']('sosmodel', '') %}
@@ -87,6 +87,11 @@ check_boss_raid() {
}
check_software_raid() {
if [[ ! -f /proc/mdstat ]]; then
SWRAID=0
return
fi
SWRC=$(grep "_" /proc/mdstat)
if [[ -n $SWRC ]]; then
# RAID is failed in some way
@@ -107,7 +112,9 @@ if [[ "$is_hwraid" == "true" ]]; then
fi
if [[ "$is_softwareraid" == "true" ]]; then
check_software_raid
check_boss_raid
if [ "$model" != "HVGUEST" ]; then
check_boss_raid
fi
fi
sum=$(($SWRAID + $BOSSRAID + $HWRAID))
+8
@@ -237,3 +237,11 @@ docker:
extra_hosts: []
extra_env: []
ulimits: []
'so-postgres':
final_octet: 47
port_bindings:
- 0.0.0.0:5432:5432
custom_bind_mounts: []
extra_hosts: []
extra_env: []
ulimits: []
+4 -103
@@ -17,65 +17,17 @@ include:
- logstash.ssl
- elasticfleet.config
- elasticfleet.sostatus
{%- if GLOBALS.role != "so-fleet" %}
- elasticfleet.manager
{%- endif %}
{% if grains.role not in ['so-fleet'] %}
{% if GLOBALS.role != "so-fleet" %}
# Wait for Elasticsearch to be ready - no reason to try running Elastic Fleet server if ES is not ready
wait_for_elasticsearch_elasticfleet:
cmd.run:
- name: so-elasticsearch-wait
{% endif %}
# If enabled, automatically update Fleet Logstash Outputs
{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-import', 'so-eval', 'so-fleet'] %}
so-elastic-fleet-auto-configure-logstash-outputs:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-outputs-update
- retry:
attempts: 4
interval: 30
{# Separate from above in order to catch elasticfleet-logstash.crt changes and force update to fleet output policy #}
so-elastic-fleet-auto-configure-logstash-outputs-force:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-outputs-update --certs
- retry:
attempts: 4
interval: 30
- onchanges:
- x509: etc_elasticfleet_logstash_crt
- x509: elasticfleet_kafka_crt
{% endif %}
# If enabled, automatically update Fleet Server URLs & ES Connection
{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-fleet'] %}
so-elastic-fleet-auto-configure-server-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-urls-update
- retry:
attempts: 4
interval: 30
{% endif %}
# Automatically update Fleet Server Elasticsearch URLs & Agent Artifact URLs
{% if grains.role not in ['so-fleet'] %}
so-elastic-fleet-auto-configure-elasticsearch-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-es-url-update
- retry:
attempts: 4
interval: 30
so-elastic-fleet-auto-configure-artifact-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-artifacts-url-update
- retry:
attempts: 4
interval: 30
{% endif %}
# Sync Elastic Agent artifacts to Fleet Node
{% if grains.role in ['so-fleet'] %}
elasticagent_syncartifacts:
file.recurse:
- name: /nsm/elastic-fleet/artifacts/beats
@@ -149,57 +101,6 @@ so-elastic-fleet:
- x509: etc_elasticfleet_crt
{% endif %}
{% if GLOBALS.role != "so-fleet" %}
so-elastic-fleet-package-statefile:
file.managed:
- name: /opt/so/state/elastic_fleet_packages.txt
- contents: {{ELASTICFLEETMERGED.packages}}
so-elastic-fleet-package-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-package-upgrade
- retry:
attempts: 3
interval: 10
- onchanges:
- file: /opt/so/state/elastic_fleet_packages.txt
so-elastic-fleet-integrations:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-integration-policy-load
- retry:
attempts: 3
interval: 10
so-elastic-agent-grid-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-agent-grid-upgrade
- retry:
attempts: 12
interval: 5
so-elastic-fleet-integration-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-integration-upgrade
- retry:
attempts: 3
interval: 10
{# Optional integrations script doesn't need the retries like so-elastic-fleet-integration-upgrade which loads the default integrations #}
so-elastic-fleet-addon-integrations:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-optional-integrations-load
{% if ELASTICFLEETMERGED.config.defend_filters.enable_auto_configuration %}
so-elastic-defend-manage-filters-file-watch:
cmd.run:
- name: python3 /sbin/so-elastic-defend-manage-filters.py -c /opt/so/conf/elasticsearch/curl.config -d /opt/so/conf/elastic-fleet/defend-exclusions/disabled-filters.yaml -i /nsm/securityonion-resources/event_filters/ -i /opt/so/conf/elastic-fleet/defend-exclusions/rulesets/custom-filters/ &>> /opt/so/log/elasticfleet/elastic-defend-manage-filters.log
- onchanges:
- file: elasticdefendcustom
- file: elasticdefenddisabled
{% endif %}
{% endif %}
delete_so-elastic-fleet_so-status.disabled:
file.uncomment:
- name: /opt/so/conf/so-status/so-status.conf
+101
@@ -0,0 +1,101 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls in allowed_states %}
{% from 'elasticfleet/map.jinja' import ELASTICFLEETMERGED %}
include:
- elasticfleet.config
# If enabled, automatically update Fleet Logstash Outputs
{% if ELASTICFLEETMERGED.config.server.enable_auto_configuration and grains.role not in ['so-import', 'so-eval'] %}
so-elastic-fleet-auto-configure-logstash-outputs:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-outputs-update
- retry:
attempts: 4
interval: 30
{% endif %}
# If enabled, automatically update Fleet Server URLs & ES Connection
so-elastic-fleet-auto-configure-server-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-urls-update
- retry:
attempts: 4
interval: 30
# Automatically update Fleet Server Elasticsearch URLs & Agent Artifact URLs
so-elastic-fleet-auto-configure-elasticsearch-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-es-url-update
- retry:
attempts: 4
interval: 30
so-elastic-fleet-auto-configure-artifact-urls:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-artifacts-url-update
- retry:
attempts: 4
interval: 30
so-elastic-fleet-package-statefile:
file.managed:
- name: /opt/so/state/elastic_fleet_packages.txt
- contents: {{ELASTICFLEETMERGED.packages}}
so-elastic-fleet-package-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-package-upgrade
- retry:
attempts: 3
interval: 10
- onchanges:
- file: /opt/so/state/elastic_fleet_packages.txt
so-elastic-fleet-integrations:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-integration-policy-load
- retry:
attempts: 3
interval: 10
so-elastic-agent-grid-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-agent-grid-upgrade
- retry:
attempts: 12
interval: 5
so-elastic-fleet-integration-upgrade:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-integration-upgrade
- retry:
attempts: 3
interval: 10
{# Optional integrations script doesn't need the retries like so-elastic-fleet-integration-upgrade which loads the default integrations #}
so-elastic-fleet-addon-integrations:
cmd.run:
- name: /usr/sbin/so-elastic-fleet-optional-integrations-load
{% if ELASTICFLEETMERGED.config.defend_filters.enable_auto_configuration %}
so-elastic-defend-manage-filters-file-watch:
cmd.run:
- name: python3 /sbin/so-elastic-defend-manage-filters.py -c /opt/so/conf/elasticsearch/curl.config -d /opt/so/conf/elastic-fleet/defend-exclusions/disabled-filters.yaml -i /nsm/securityonion-resources/event_filters/ -i /opt/so/conf/elastic-fleet/defend-exclusions/rulesets/custom-filters/ &>> /opt/so/log/elasticfleet/elastic-defend-manage-filters.log
- onchanges:
- file: elasticdefendcustom
- file: elasticdefenddisabled
{% endif %}
{% else %}
{{sls}}_state_not_allowed:
test.fail_without_changes:
- name: {{sls}}_state_not_allowed
{% endif %}
@@ -240,7 +240,7 @@ elastic_fleet_policy_create() {
--arg DESC "$DESC" \
--arg TIMEOUT $TIMEOUT \
--arg FLEETSERVER "$FLEETSERVER" \
'{"name": $NAME,"id":$NAME,"description":$DESC,"namespace":"default","monitoring_enabled":["logs"],"inactivity_timeout":$TIMEOUT,"has_fleet_server":$FLEETSERVER}'
'{"name": $NAME,"id":$NAME,"description":$DESC,"namespace":"default","monitoring_enabled":["logs"],"inactivity_timeout":$TIMEOUT,"has_fleet_server":$FLEETSERVER,"advanced_settings":{"agent_logging_level": "warning"}}'
)
# Create Fleet Policy
if ! fleet_api "agent_policies" -XPOST -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -d "$JSON_STRING"; then
@@ -5,11 +5,12 @@
# this file except in compliance with the Elastic License 2.0.
. /usr/sbin/so-common
. /usr/sbin/so-elastic-fleet-common
{%- import_yaml 'elasticsearch/defaults.yaml' as ELASTICSEARCHDEFAULTS %}
{%- import_yaml 'elasticfleet/defaults.yaml' as ELASTICFLEETDEFAULTS %}
{# Optionally override Elasticsearch version for Elastic Agent patch releases #}
{%- if ELASTICFLEETDEFAULTS.elasticfleet.patch_version is defined %}
{%- do ELASTICSEARCHDEFAULTS.update({'elasticsearch': {'version': ELASTICFLEETDEFAULTS.elasticfleet.patch_version}}) %}
{%- do ELASTICSEARCHDEFAULTS.elasticsearch.update({'version': ELASTICFLEETDEFAULTS.elasticfleet.patch_version}) %}
{%- endif %}
# Only run on Managers
@@ -19,13 +20,10 @@ if ! is_manager_node; then
fi
# Get current list of Grid Node Agents that need to be upgraded
RAW_JSON=$(curl -K /opt/so/conf/elasticsearch/curl.config -L "http://localhost:5601/api/fleet/agents?perPage=20&page=1&kuery=NOT%20agent.version%3A%20{{ELASTICSEARCHDEFAULTS.elasticsearch.version}}%20AND%20policy_id%3A%20so-grid-nodes_%2A&showInactive=false&getStatusSummary=true" --retry 3 --retry-delay 30 --fail 2>/dev/null)
if ! RAW_JSON=$(fleet_api "agents?perPage=20&page=1&kuery=NOT%20agent.version%3A%20{{ELASTICSEARCHDEFAULTS.elasticsearch.version | urlencode }}%20AND%20policy_id%3A%20so-grid-nodes_%2A&showInactive=false&getStatusSummary=true" -H 'kbn-xsrf: true' -H 'Content-Type: application/json'); then
# Check to make sure that the server responded with good data - else, bail from script
CHECKSUM=$(jq -r '.page' <<< "$RAW_JSON")
if [ "$CHECKSUM" -ne 1 ]; then
printf "Failed to query for current Grid Agents...\n"
exit 1
printf "Failed to query for current Grid Agents...\n"
exit 1
fi
# Generate list of Node Agents that need updates
@@ -36,10 +34,12 @@ if [ "$OUTDATED_LIST" != '[]' ]; then
printf "Initiating upgrades for $AGENTNUMBERS Agents to Elastic {{ELASTICSEARCHDEFAULTS.elasticsearch.version}}...\n\n"
# Generate updated JSON payload
JSON_STRING=$(jq -n --arg ELASTICVERSION {{ELASTICSEARCHDEFAULTS.elasticsearch.version}} --arg UPDATELIST $OUTDATED_LIST '{"version": $ELASTICVERSION,"agents": $UPDATELIST }')
JSON_STRING=$(jq -n --arg ELASTICVERSION "{{ELASTICSEARCHDEFAULTS.elasticsearch.version}}" --argjson UPDATELIST "$OUTDATED_LIST" '{"version": $ELASTICVERSION,"agents": $UPDATELIST }')
# Update Node Agents
curl -K /opt/so/conf/elasticsearch/curl.config -L -X POST "http://localhost:5601/api/fleet/agents/bulk_upgrade" -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -d "$JSON_STRING"
if ! fleet_api "agents/bulk_upgrade" -XPOST -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -d "$JSON_STRING"; then
printf "Failed to initiate Agent upgrades...\n"
fi
else
printf "No Agents need updates... Exiting\n\n"
exit 0
@@ -235,6 +235,16 @@ function update_kafka_outputs() {
{% endif %}
# Compare the current Elastic Fleet certificate against what is on disk
POLICY_CERT_SHA=$(jq -r '.item.ssl.certificate' <<< $RAW_JSON | openssl x509 -noout -sha256 -fingerprint)
DISK_CERT_SHA=$(openssl x509 -in /etc/pki/elasticfleet-logstash.crt -noout -sha256 -fingerprint)
if [[ "$POLICY_CERT_SHA" != "$DISK_CERT_SHA" ]]; then
printf "Certificate on disk doesn't match certificate in policy - forcing update\n"
UPDATE_CERTS=true
FORCE_UPDATE=true
fi
# Sort & hash the new list of Logstash Outputs
NEW_LIST_JSON=$(jq --compact-output --null-input '$ARGS.positional' --args -- "${NEW_LIST[@]}")
NEW_HASH=$(sha256sum <<< "$NEW_LIST_JSON" | awk '{print $1}')
+1 -1
@@ -4,7 +4,7 @@
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls.split('.')[0] in allowed_states %}
{% if sls in allowed_states %}
{% from 'vars/globals.map.jinja' import GLOBALS %}
{% from 'elasticsearch/config.map.jinja' import ELASTICSEARCHMERGED %}
{% from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS, SO_MANAGED_INDICES %}
+6 -7
@@ -17,7 +17,7 @@ include:
- elasticsearch.ssl
- elasticsearch.config
- elasticsearch.sostatus
{%- if GLOBALS.role != 'so-searchode' %}
{%- if GLOBALS.role != "so-searchnode" %}
- elasticsearch.cluster
{%- endif%}
@@ -102,11 +102,6 @@ so-elasticsearch:
- cmd: auth_users_roles_inode
- cmd: auth_users_inode
delete_so-elasticsearch_so-status.disabled:
file.uncomment:
- name: /opt/so/conf/so-status/so-status.conf
- regex: ^so-elasticsearch$
wait_for_so-elasticsearch:
http.wait_for_successful_query:
- name: "https://localhost:9200/"
@@ -117,10 +112,14 @@ wait_for_so-elasticsearch:
- status: 200
- wait_for: 300
- request_interval: 15
- backend: requests
- require:
- docker_container: so-elasticsearch
delete_so-elasticsearch_so-status.disabled:
file.uncomment:
- name: /opt/so/conf/so-status/so-status.conf
- regex: ^so-elasticsearch$
{% else %}
{{sls}}_state_not_allowed:
@@ -103,11 +103,13 @@ load_component_templates() {
local pattern="${ELASTICSEARCH_TEMPLATES_DIR}/component/$2"
local append_mappings="${3:-"false"}"
# current state of nullglob shell option
shopt -q nullglob && nullglob_set=1 || nullglob_set=0
shopt -s nullglob
echo -e "\nLoading $printed_name component templates...\n"
if ! compgen -G "${pattern}/*.json" > /dev/null; then
echo "No $printed_name component templates found in ${pattern}, skipping."
return
fi
for component in "$pattern"/*.json; do
tmpl_name=$(basename "${component%.json}")
@@ -121,11 +123,6 @@ load_component_templates() {
SO_LOAD_FAILURES_NAMES+=("$component")
fi
done
# restore nullglob shell option if needed
if [[ $nullglob_set -eq 1 ]]; then
shopt -u nullglob
fi
}
check_elasticsearch_responsive() {
@@ -136,7 +133,32 @@ check_elasticsearch_responsive() {
fail "Elasticsearch is not responding. Please review Elasticsearch logs /opt/so/log/elasticsearch/securityonion.log for more details. Additionally, consider running so-elasticsearch-troubleshoot."
}
if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]]; then
index_templates_exist() {
local templates_dir="$1"
if [[ ! -d "$templates_dir" ]]; then
return 1
fi
compgen -G "${templates_dir}/*.json" > /dev/null
}
should_load_addon_templates() {
if [[ "$IS_HEAVYNODE" == "true" ]]; then
return 1
fi
# Skip statefile checks when forcing template load
if [[ "$FORCE" != "true" ]]; then
if [[ ! -f "$SO_STATEFILE_SUCCESS" || -f "$ADDON_STATEFILE_SUCCESS" ]]; then
return 1
fi
fi
index_templates_exist "$ADDON_TEMPLATES_DIR"
}
if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]] && index_templates_exist "$SO_TEMPLATES_DIR"; then
check_elasticsearch_responsive
if [[ "$IS_HEAVYNODE" == "false" ]]; then
@@ -201,13 +223,14 @@ if [[ "$FORCE" == "true" || ! -f "$SO_STATEFILE_SUCCESS" ]]; then
fail "Failed to load all Security Onion core templates successfully."
fi
fi
else
elif ! index_templates_exist "$SO_TEMPLATES_DIR"; then
echo "No Security Onion core index templates found in ${SO_TEMPLATES_DIR}, skipping."
elif [[ -f "$SO_STATEFILE_SUCCESS" ]]; then
echo "Security Onion core templates already loaded"
fi
# Start loading addon templates
if [[ (-d "$ADDON_TEMPLATES_DIR" && -f "$SO_STATEFILE_SUCCESS" && "$IS_HEAVYNODE" == "false" && ! -f "$ADDON_STATEFILE_SUCCESS") || (-d "$ADDON_TEMPLATES_DIR" && "$IS_HEAVYNODE" == "false" && "$FORCE" == "true") ]]; then
if should_load_addon_templates; then
check_elasticsearch_responsive
+3
@@ -11,6 +11,7 @@
'so-kratos',
'so-hydra',
'so-nginx',
'so-postgres',
'so-redis',
'so-soc',
'so-strelka-coordinator',
@@ -34,6 +35,7 @@
'so-hydra',
'so-logstash',
'so-nginx',
'so-postgres',
'so-redis',
'so-soc',
'so-strelka-coordinator',
@@ -77,6 +79,7 @@
'so-kratos',
'so-hydra',
'so-nginx',
'so-postgres',
'so-soc'
] %}
+42
@@ -98,6 +98,10 @@ firewall:
tcp:
- 8086
udp: []
postgres:
tcp:
- 5432
udp: []
kafka_controller:
tcp:
- 9093
@@ -193,6 +197,7 @@ firewall:
- kibana
- redis
- influxdb
- postgres
- elasticsearch_rest
- elasticsearch_node
- localrules
@@ -379,6 +384,7 @@ firewall:
- kibana
- redis
- influxdb
- postgres
- elasticsearch_rest
- elasticsearch_node
- docker_registry
@@ -392,6 +398,7 @@ firewall:
- elasticsearch_rest
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- beats_5044
@@ -404,6 +411,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- beats_5044
@@ -421,6 +429,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- sensoroni
searchnode:
portgroups:
@@ -431,6 +440,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -444,6 +454,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -453,6 +464,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -486,6 +498,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- elastic_agent_control
@@ -496,6 +509,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -590,6 +604,7 @@ firewall:
- kibana
- redis
- influxdb
- postgres
- elasticsearch_rest
- elasticsearch_node
- docker_registry
@@ -603,6 +618,7 @@ firewall:
- elasticsearch_rest
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- beats_5044
@@ -615,6 +631,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- beats_5044
@@ -632,6 +649,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- sensoroni
searchnode:
portgroups:
@@ -642,6 +660,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -655,6 +674,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -664,6 +684,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -695,6 +716,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- elastic_agent_control
@@ -705,6 +727,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -799,6 +822,7 @@ firewall:
- kibana
- redis
- influxdb
- postgres
- elasticsearch_rest
- elasticsearch_node
- docker_registry
@@ -812,6 +836,7 @@ firewall:
- elasticsearch_rest
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- beats_5044
@@ -824,6 +849,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- beats_5044
@@ -841,6 +867,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- sensoroni
searchnode:
portgroups:
@@ -850,6 +877,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -862,6 +890,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -871,6 +900,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -904,6 +934,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- elastic_agent_control
@@ -914,6 +945,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -1011,6 +1043,7 @@ firewall:
- kibana
- redis
- influxdb
- postgres
- elasticsearch_rest
- elasticsearch_node
- docker_registry
@@ -1031,6 +1064,7 @@ firewall:
- elasticsearch_rest
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- beats_5044
@@ -1043,6 +1077,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- beats_5044
@@ -1054,6 +1089,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- beats_5044
@@ -1065,6 +1101,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- redis
@@ -1074,6 +1111,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- redis
@@ -1084,6 +1122,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -1120,6 +1159,7 @@ firewall:
portgroups:
- docker_registry
- influxdb
- postgres
- sensoroni
- yum
- elastic_agent_control
@@ -1130,6 +1170,7 @@ firewall:
- yum
- docker_registry
- influxdb
- postgres
- elastic_agent_control
- elastic_agent_data
- elastic_agent_update
@@ -1473,6 +1514,7 @@ firewall:
- kibana
- redis
- influxdb
- postgres
- elasticsearch_rest
- elasticsearch_node
- elastic_agent_control
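A minimal sketch for spot-checking the result on a manager after highstate, using plain iptables rather than any SO-specific tooling (if nothing matches, the generated rules may reference the port indirectly):
sudo iptables -L DOCKER-USER -n -v | grep 5432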
+1 -1
@@ -22,7 +22,7 @@ kibana:
- default
- file
migrations:
discardCorruptObjects: "8.18.8"
discardCorruptObjects: "9.3.3"
telemetry:
enabled: False
xpack:
+1 -1
@@ -3,8 +3,8 @@ kratos:
description: Enables or disables the Kratos authentication system. WARNING - Disabling this process will cause the grid to malfunction. Re-enabling this setting will require manual effort via SSH.
forcedType: bool
advanced: True
readonly: True
helpLink: kratos
oidc:
enabled:
description: Set to True to enable OIDC / Single Sign-On (SSO) to SOC. Requires a valid Security Onion license key.
+46 -1
@@ -273,7 +273,7 @@ function deleteMinionFiles () {
log "ERROR" "Failed to delete $PILLARFILE"
return 1
fi
rm -f $ADVPILLARFILE
if [ $? -ne 0 ]; then
log "ERROR" "Failed to delete $ADVPILLARFILE"
@@ -281,6 +281,39 @@ function deleteMinionFiles () {
fi
}
# Remove this minion's postgres Telegraf credential from the shared creds
# pillar and drop the matching role in Postgres. Always returns 0 so a dead
# or unreachable so-postgres doesn't block minion deletion — in that case we
# log a warning and leave the role behind for manual cleanup.
function remove_postgres_telegraf_from_minion() {
local MINION_SAFE
MINION_SAFE=$(echo "$MINION_ID" | tr '.-' '__' | tr '[:upper:]' '[:lower:]')
local PG_USER="so_telegraf_${MINION_SAFE}"
log "INFO" "Removing postgres telegraf cred for $MINION_ID"
so-telegraf-cred remove "$MINION_ID" >/dev/null 2>&1 || true
if docker ps --format '{{.Names}}' 2>/dev/null | grep -q '^so-postgres$'; then
if ! docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf >/dev/null 2>&1 <<EOSQL
DO \$\$
BEGIN
IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '$PG_USER') THEN
EXECUTE format('REASSIGN OWNED BY %I TO so_telegraf', '$PG_USER');
EXECUTE format('DROP OWNED BY %I', '$PG_USER');
EXECUTE format('DROP ROLE %I', '$PG_USER');
END IF;
END
\$\$;
EOSQL
then
log "WARN" "Failed to drop postgres role $PG_USER; pillar entry was removed — drop manually if the role persists"
fi
else
log "WARN" "so-postgres container is not running; skipping DB role cleanup for $PG_USER"
fi
}
# Create the minion file
function ensure_socore_ownership() {
log "INFO" "Setting socore ownership on minion files"
@@ -542,6 +575,17 @@ function add_telegraf_to_minion() {
log "ERROR" "Failed to add telegraf configuration to $PILLARFILE"
return 1
fi
# Provision the per-minion postgres Telegraf credential in the shared
# telegraf/creds.sls pillar. so-telegraf-cred is the only writer; it
# generates a password on first add and is a no-op on re-add so the cred
# is stable across repeated so-minion runs. postgres.telegraf_users on the
# manager creates/updates the DB role from the same pillar.
so-telegraf-cred add "$MINION_ID"
if [ $? -ne 0 ]; then
log "ERROR" "Failed to provision postgres telegraf cred for $MINION_ID"
return 1
fi
}
function add_influxdb_to_minion() {
@@ -1069,6 +1113,7 @@ case "$OPERATION" in
"delete")
log "INFO" "Removing minion $MINION_ID"
remove_postgres_telegraf_from_minion
deleteMinionFiles || {
log "ERROR" "Failed to delete minion files for $MINION_ID"
exit 1
+54
@@ -0,0 +1,54 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
# Single writer for the Telegraf Postgres credentials pillar. Thin wrapper
# around so-yaml.py that generates a password on first add and no-ops on
# re-add so the cred is stable across repeated so-minion runs.
#
# Note: so-yaml.py splits keys on '.' with no escape. SO minion ids are
# dot-free by construction (setup/so-functions:1884 takes the short_name
# before the first '.'), so using the raw minion id as the key is safe.
CREDS=/opt/so/saltstack/local/pillar/telegraf/creds.sls
usage() {
echo "Usage: $0 <add|remove> <minion_id>" >&2
exit 2
}
seed_creds_file() {
mkdir -p "$(dirname "$CREDS")" || return 1
if [[ ! -f "$CREDS" ]]; then
(umask 027 && printf 'telegraf:\n postgres_creds: {}\n' > "$CREDS") || return 1
chown socore:socore "$CREDS" 2>/dev/null || true
chmod 640 "$CREDS" || return 1
fi
}
OP=$1
MID=$2
[[ -z "$OP" || -z "$MID" ]] && usage
case "$OP" in
add)
SAFE=$(echo "$MID" | tr '.-' '__' | tr '[:upper:]' '[:lower:]')
seed_creds_file || exit 1
if so-yaml.py get -r "$CREDS" "telegraf.postgres_creds.${MID}.user" >/dev/null 2>&1; then
exit 0
fi
PASS=$(tr -dc 'A-Za-z0-9~!@#^&*()_=+[]|;:,.<>?-' < /dev/urandom | head -c 72)
so-yaml.py replace "$CREDS" "telegraf.postgres_creds.${MID}.user" "so_telegraf_${SAFE}" >/dev/null
so-yaml.py replace "$CREDS" "telegraf.postgres_creds.${MID}.pass" "$PASS" >/dev/null
;;
remove)
[[ -f "$CREDS" ]] || exit 0
so-yaml.py remove "$CREDS" "telegraf.postgres_creds.${MID}" >/dev/null 2>&1 || true
;;
*)
usage
;;
esac
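Usage sketch (minion id and generated password are illustrative; the exact YAML layout depends on how so-yaml.py materializes nested keys):
sudo so-telegraf-cred add sensor01
cat /opt/so/saltstack/local/pillar/telegraf/creds.sls
telegraf:
  postgres_creds:
    sensor01:
      user: so_telegraf_sensor01
      pass: "<72-character random string>"
Re-running the add for the same minion exits 0 without touching the existing entry, which is what makes the soup-time reconciliation loop safe to repeat.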
+12 -4
@@ -39,9 +39,16 @@ def showUsage(args):
def loadYaml(filename):
file = open(filename, "r")
content = file.read()
return yaml.safe_load(content)
try:
with open(filename, "r") as file:
content = file.read()
return yaml.safe_load(content)
except FileNotFoundError:
print(f"File not found: {filename}", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(f"Error reading file {filename}: {e}", file=sys.stderr)
sys.exit(1)
def writeYaml(filename, content):
@@ -285,7 +292,8 @@ def add(args):
def removeKey(content, key):
pieces = key.split(".", 1)
if len(pieces) > 1:
removeKey(content[pieces[0]], pieces[1])
if pieces[0] in content:
removeKey(content[pieces[0]], pieces[1])
else:
content.pop(key, None)
+18
@@ -973,3 +973,21 @@ class TestReplaceListObject(unittest.TestCase):
expected = "key1:\n- id: '1'\n status: updated\n- id: '2'\n status: inactive\n"
self.assertEqual(actual, expected)
class TestLoadYaml(unittest.TestCase):
def test_load_yaml_missing_file(self):
with patch('sys.exit', new=MagicMock()) as sysmock:
with patch('sys.stderr', new=StringIO()) as mock_stderr:
soyaml.loadYaml("/tmp/so-yaml_test-does-not-exist.yaml")
sysmock.assert_called_with(1)
self.assertIn("File not found:", mock_stderr.getvalue())
def test_load_yaml_read_error(self):
with patch('sys.exit', new=MagicMock()) as sysmock:
with patch('sys.stderr', new=StringIO()) as mock_stderr:
with patch('builtins.open', side_effect=PermissionError("denied")):
soyaml.loadYaml("/tmp/so-yaml_test-unreadable.yaml")
sysmock.assert_called_with(1)
self.assertIn("Error reading file", mock_stderr.getvalue())
+193 -9
@@ -24,6 +24,14 @@ BACKUPTOPFILE=/opt/so/saltstack/default/salt/top.sls.backup
SALTUPGRADED=false
SALT_CLOUD_INSTALLED=false
SALT_CLOUD_CONFIGURED=false
# Check if salt-cloud is installed
if rpm -q salt-cloud &>/dev/null; then
SALT_CLOUD_INSTALLED=true
fi
# Check if salt-cloud is configured
if [[ -f /etc/salt/cloud.profiles.d/socloud.conf ]]; then
SALT_CLOUD_CONFIGURED=true
fi
# used to display messages to the user at the end of soup
declare -a FINAL_MESSAGE_QUEUE=()
@@ -477,7 +485,168 @@ elasticsearch_backup_index_templates() {
tar -czf /nsm/backup/3.0.0_elasticsearch_index_templates.tar.gz -C /opt/so/conf/elasticsearch/templates/index/ .
}
elasticfleet_set_agent_logging_level_warn() {
. /usr/sbin/so-elastic-fleet-common
local current_agent_policies
if ! current_agent_policies=$(fleet_api "agent_policies?perPage=1000"); then
echo "Warning: unable to retrieve Fleet agent policies"
return 0
fi
# Only update policies that ship as Security Onion defaults and do not already have any user-configured advanced_settings.
local policies_to_update
policies_to_update=$(jq -c '
.items[]
| select(has("advanced_settings") | not)
| select(
.id == "so-grid-nodes_general"
or .id == "so-grid-nodes_heavy"
or .id == "endpoints-initial"
or (.id | startswith("FleetServer_"))
)
' <<< "$current_agent_policies")
if [[ -z "$policies_to_update" ]]; then
return 0
fi
while IFS= read -r policy; do
[[ -z "$policy" ]] && continue
local policy_id policy_name policy_namespace
policy_id=$(jq -r '.id' <<< "$policy")
policy_name=$(jq -r '.name' <<< "$policy")
policy_namespace=$(jq -r '.namespace' <<< "$policy")
local update_logging
update_logging=$(jq -n \
--arg name "$policy_name" \
--arg namespace "$policy_namespace" \
'{name: $name, namespace: $namespace, advanced_settings: {agent_logging_level: "warning"}}'
)
echo "Setting elastic agent_logging_level to warning on policy '$policy_name' ($policy_id)."
if ! fleet_api "agent_policies/$policy_id" -XPUT -H 'kbn-xsrf: true' -H 'Content-Type: application/json' -d "$update_logging" >/dev/null; then
echo " warning: failed to update agent policy '$policy_name' ($policy_id)" >&2
fi
done <<< "$policies_to_update"
}
check_transform_health_and_reauthorize() {
. /usr/sbin/so-elastic-fleet-common
echo "Checking integration transform jobs for unhealthy / unauthorized status..."
local transforms_doc stats_doc installed_doc
if ! transforms_doc=$(so-elasticsearch-query "_transform/_all?size=1000" --fail --retry 3 --retry-delay 5 2>/dev/null); then
echo "Unable to query for transform jobs, skipping reauthorization."
return 0
fi
if ! stats_doc=$(so-elasticsearch-query "_transform/_all/_stats?size=1000" --fail --retry 3 --retry-delay 5 2>/dev/null); then
echo "Unable to query for transform job stats, skipping reauthorization."
return 0
fi
if ! installed_doc=$(fleet_api "epm/packages/installed?perPage=500"); then
echo "Unable to list installed Fleet packages, skipping reauthorization."
return 0
fi
# Get all transforms that meet the following
# - unhealthy (any non-green health status)
# - metadata has run_as_kibana_system: false (this fix is specific to transforms started prior to Kibana 9.3.3)
# - are not orphaned (integration is not somehow missing/corrupt/uninstalled)
local unhealthy_transforms
unhealthy_transforms=$(jq -c -n \
--argjson t "$transforms_doc" \
--argjson s "$stats_doc" \
--argjson i "$installed_doc" '
($i.items | map({key: .name, value: .version}) | from_entries) as $pkg_ver
| ($s.transforms | map({key: .id, value: .health.status}) | from_entries) as $health
| [ $t.transforms[]
| select(._meta.run_as_kibana_system == false)
| select(($health[.id] // "unknown") != "green")
| {id, pkg: ._meta.package.name, ver: ($pkg_ver[._meta.package.name])}
]
| if length == 0 then empty else . end
| (map(select(.ver == null)) | map({orphan: .id})[]),
(map(select(.ver != null))
| group_by(.pkg)
| map({pkg: .[0].pkg, ver: .[0].ver, transformIds: map(.id)})[])
')
if [[ -z "$unhealthy_transforms" ]]; then
return 0
fi
local unhealthy_count
unhealthy_count=$(jq -s '[.[].transformIds? // empty | .[]] | length' <<< "$unhealthy_transforms")
echo "Found $unhealthy_count transform(s) needing reauthorization."
local total_failures=0
while IFS= read -r transform; do
[[ -z "$transform" ]] && continue
if jq -e 'has("orphan")' <<< "$transform" >/dev/null 2>&1; then
echo "Skipping transform not owned by any installed Fleet package: $(jq -r '.orphan' <<< "$transform")"
continue
fi
local pkg ver body resp
pkg=$(jq -r '.pkg' <<< "$transform")
ver=$(jq -r '.ver' <<< "$transform")
body=$(jq -c '{transforms: (.transformIds | map({transformId: .}))}' <<< "$transform")
echo "Reauthorizing transform(s) for ${pkg}-${ver}..."
resp=$(fleet_api "epm/packages/${pkg}/${ver}/transforms/authorize" \
-XPOST -H 'kbn-xsrf: true' -H 'Content-Type: application/json' \
-d "$body") || { echo "Could not reauthorize transform(s) for ${pkg}-${ver}"; continue; }
(( total_failures += $(jq 'map(select(.success != true)) | length' <<< "$resp" 2>/dev/null) ))
done <<< "$unhealthy_transforms"
if [[ "$total_failures" -gt 0 ]]; then
echo "Some transform(s) failed to reauthorize."
fi
}
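For readability: the jq above emits one compact JSON object per line, either an orphan marker or a per-package batch, and the authorize call posts the transformIds back per package (all values below are illustrative):
{"orphan":"logs-some_integration.latest-default-0.1.0"}
{"pkg":"network_traffic","ver":"1.33.0","transformIds":["logs-network_traffic.flows-default-1.33.0"]}
body sent to epm/packages/<pkg>/<ver>/transforms/authorize: {"transforms":[{"transformId":"logs-network_traffic.flows-default-1.33.0"}]}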
ensure_postgres_local_pillar() {
# Postgres was added as a service after 3.0.0, so the new pillar/top.sls
# references postgres.soc_postgres / postgres.adv_postgres unconditionally.
# Managers upgrading from 3.0.0 have no /opt/so/saltstack/local/pillar/postgres/
# (make_some_dirs only runs at install time), so the stubs must be created
# here before salt-master restarts against the new top.sls.
echo "Ensuring postgres local pillar stubs exist."
local dir=/opt/so/saltstack/local/pillar/postgres
mkdir -p "$dir"
[[ -f "$dir/soc_postgres.sls" ]] || touch "$dir/soc_postgres.sls"
[[ -f "$dir/adv_postgres.sls" ]] || touch "$dir/adv_postgres.sls"
chown -R socore:socore "$dir"
}
ensure_postgres_secret() {
# On a fresh install, generate_passwords + secrets_pillar seed
# secrets:postgres_pass in /opt/so/saltstack/local/pillar/secrets.sls. That
# code path is skipped on upgrade (secrets.sls already exists from 3.0.0
# with import_pass/influx_pass but no postgres_pass), so the postgres
# container's POSTGRES_PASSWORD_FILE and SOC's PG_ADMIN_PASS would be empty
# after highstate. Generate one now if missing.
local secrets_file=/opt/so/saltstack/local/pillar/secrets.sls
if [[ ! -f "$secrets_file" ]]; then
echo "WARNING: $secrets_file missing; skipping postgres_pass backfill."
return 0
fi
if so-yaml.py get -r "$secrets_file" secrets.postgres_pass >/dev/null 2>&1; then
echo "secrets.postgres_pass already set; leaving as-is."
return 0
fi
echo "Seeding secrets.postgres_pass in $secrets_file."
so-yaml.py add "$secrets_file" secrets.postgres_pass "$(get_random_value)"
chown socore:socore "$secrets_file"
}
up_to_3.1.0() {
ensure_postgres_local_pillar
ensure_postgres_secret
determine_elastic_agent_upgrade
elasticsearch_backup_index_templates
# Clear existing component template state file.
@@ -489,6 +658,30 @@ up_to_3.1.0() {
post_to_3.1.0() {
/usr/sbin/so-kibana-space-defaults
# ensure manager has new version of socloud.conf
if [[ $SALT_CLOUD_CONFIGURED == true ]]; then
salt-call state.apply salt.cloud.config concurrent=True
fi
# Backfill the Telegraf creds pillar for every accepted minion. so-telegraf-cred
# add is idempotent — it no-ops when an entry already exists — so this is safe
# to run on every soup. The subsequent state.apply creates/updates the matching
# Postgres roles from the reconciled pillar.
echo "Reconciling Telegraf Postgres creds for accepted minions."
for mid in $(salt-key --out=json --list=accepted 2>/dev/null | jq -r '.minions[]?' 2>/dev/null); do
[[ -n "$mid" ]] || continue
/usr/sbin/so-telegraf-cred add "$mid" || echo " warning: so-telegraf-cred add $mid failed" >&2
done
# Run through the master (not --local) so state compilation uses the
# master's configured file_roots; the manager's /etc/salt/minion has no
# file_roots of its own and --local would fail with "No matching sls found".
salt-call state.apply postgres.telegraf_users queue=True || true
# Update default agent policies to use logging level warn.
elasticfleet_set_agent_logging_level_warn || true
# Check for unhealthy / unauthorized integration transform jobs and attempt reauthorizations
check_transform_health_and_reauthorize || true
POSTVERSION=3.1.0
}
@@ -663,15 +856,6 @@ upgrade_check_salt() {
upgrade_salt() {
echo "Performing upgrade of Salt from $INSTALLEDSALTVERSION to $NEWSALTVERSION."
echo ""
# Check if salt-cloud is installed
if rpm -q salt-cloud &>/dev/null; then
SALT_CLOUD_INSTALLED=true
fi
# Check if salt-cloud is configured
if [[ -f /etc/salt/cloud.profiles.d/socloud.conf ]]; then
SALT_CLOUD_CONFIGURED=true
fi
echo "Removing yum versionlock for Salt."
echo ""
yum versionlock delete "salt"
+25
@@ -25,8 +25,33 @@ manager_run_es_soc:
- salt: {{NEWNODE}}_update_mine
{% endif %}
# so-minion has already added the new minion's entry to telegraf/creds.sls
# via so-telegraf-cred before this orch fires. Reconcile the Postgres role
# on the manager so the new minion can authenticate on its first highstate,
# then refresh the minion's pillar so its telegraf.conf renders with the
# freshly-written cred.
manager_create_postgres_telegraf_role:
salt.state:
- tgt: {{ MANAGER }}
- sls:
- postgres.telegraf_users
- queue: True
- require:
- salt: {{NEWNODE}}_update_mine
{{NEWNODE}}_refresh_pillar:
salt.function:
- name: saltutil.refresh_pillar
- tgt: {{ NEWNODE }}
- kwarg:
wait: True
- require:
- salt: manager_create_postgres_telegraf_role
{{NEWNODE}}_run_highstate:
salt.state:
- tgt: {{ NEWNODE }}
- highstate: True
- queue: True
- require:
- salt: {{NEWNODE}}_refresh_pillar
+37
@@ -0,0 +1,37 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls in allowed_states %}
{% set DIGITS = "1234567890" %}
{% set LOWERCASE = "qwertyuiopasdfghjklzxcvbnm" %}
{% set UPPERCASE = "QWERTYUIOPASDFGHJKLZXCVBNM" %}
{% set SYMBOLS = "~!@#^&*()-_=+[]|;:,.<>?" %}
{% set CHARS = DIGITS~LOWERCASE~UPPERCASE~SYMBOLS %}
{% set so_postgres_user_pass = salt['pillar.get']('postgres:auth:users:so_postgres_user:pass', salt['random.get_str'](72, chars=CHARS)) %}
# Admin cred only. Per-minion Telegraf creds live in telegraf/creds.sls,
# managed by /usr/sbin/so-telegraf-cred (called from so-minion).
postgres_auth_pillar:
file.managed:
- name: /opt/so/saltstack/local/pillar/postgres/auth.sls
- mode: 640
- reload_pillar: True
- contents: |
postgres:
auth:
users:
so_postgres_user:
user: so_postgres
pass: "{{ so_postgres_user_pass }}"
- show_changes: False
{% else %}
{{sls}}_state_not_allowed:
test.fail_without_changes:
- name: {{sls}}_state_not_allowed
{% endif %}
+111
@@ -0,0 +1,111 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls.split('.')[0] in allowed_states %}
{% from 'postgres/map.jinja' import PGMERGED %}
# Postgres Setup
postgresconfdir:
file.directory:
- name: /opt/so/conf/postgres
- user: 939
- group: 939
- makedirs: True
postgressecretsdir:
file.directory:
- name: /opt/so/conf/postgres/secrets
- user: 939
- group: 939
- mode: 700
- require:
- file: postgresconfdir
postgresdatadir:
file.directory:
- name: /nsm/postgres
- user: 939
- group: 939
- makedirs: True
postgreslogdir:
file.directory:
- name: /opt/so/log/postgres
- user: 939
- group: 939
- makedirs: True
postgresinitdir:
file.directory:
- name: /opt/so/conf/postgres/init
- user: 939
- group: 939
- require:
- file: postgresconfdir
postgresinitusers:
file.managed:
- name: /opt/so/conf/postgres/init/init-users.sh
- source: salt://postgres/files/init-users.sh
- user: 939
- group: 939
- mode: 755
postgresconf:
file.managed:
- name: /opt/so/conf/postgres/postgresql.conf
- source: salt://postgres/files/postgresql.conf.jinja
- user: 939
- group: 939
- template: jinja
- defaults:
PGMERGED: {{ PGMERGED }}
postgreshba:
file.managed:
- name: /opt/so/conf/postgres/pg_hba.conf
- source: salt://postgres/files/pg_hba.conf
- user: 939
- group: 939
- mode: 640
postgres_super_secret:
file.managed:
- name: /opt/so/conf/postgres/secrets/postgres_password
- user: 939
- group: 939
- mode: 600
- contents_pillar: 'secrets:postgres_pass'
- show_changes: False
- require:
- file: postgressecretsdir
postgres_app_secret:
file.managed:
- name: /opt/so/conf/postgres/secrets/so_postgres_pass
- user: 939
- group: 939
- mode: 600
- contents_pillar: 'postgres:auth:users:so_postgres_user:pass'
- show_changes: False
- require:
- file: postgressecretsdir
postgres_sbin:
file.recurse:
- name: /usr/sbin
- source: salt://postgres/tools/sbin
- user: root
- group: root
- file_mode: 755
{% else %}
{{sls}}_state_not_allowed:
test.fail_without_changes:
- name: {{sls}}_state_not_allowed
{% endif %}
+19
@@ -0,0 +1,19 @@
postgres:
enabled: True
telegraf:
retention_days: 14
config:
listen_addresses: '*'
port: 5432
max_connections: 100
shared_buffers: 256MB
ssl: 'on'
ssl_cert_file: '/conf/postgres.crt'
ssl_key_file: '/conf/postgres.key'
ssl_ca_file: '/conf/ca.crt'
hba_file: '/conf/pg_hba.conf'
log_destination: 'stderr'
logging_collector: 'off'
log_min_messages: 'warning'
shared_preload_libraries: pg_cron
cron.database_name: so_telegraf
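These defaults merge under the postgres pillar key, so an operator override can be as small as the following sketch (for example in the local adv_postgres.sls stub that soup creates):
postgres:
  telegraf:
    retention_days: 30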
+33
@@ -0,0 +1,33 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls.split('.')[0] in allowed_states %}
include:
- postgres.sostatus
so-postgres:
docker_container.absent:
- force: True
so-postgres_so-status.disabled:
file.comment:
- name: /opt/so/conf/so-status/so-status.conf
- regex: ^so-postgres$
so_postgres_backup:
cron.absent:
- name: /usr/sbin/so-postgres-backup > /dev/null 2>&1
- identifier: so_postgres_backup
- user: root
{% else %}
{{sls}}_state_not_allowed:
test.fail_without_changes:
- name: {{sls}}_state_not_allowed
{% endif %}
+109
@@ -0,0 +1,109 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls.split('.')[0] in allowed_states %}
{% from 'vars/globals.map.jinja' import GLOBALS %}
{% from 'docker/docker.map.jinja' import DOCKERMERGED %}
{% set SO_POSTGRES_USER = salt['pillar.get']('postgres:auth:users:so_postgres_user:user', 'so_postgres') %}
include:
- postgres.auth
- postgres.ssl
- postgres.config
- postgres.sostatus
- postgres.telegraf_users
so-postgres:
docker_container.running:
- image: {{ GLOBALS.registry_host }}:5000/{{ GLOBALS.image_repo }}/so-postgres:{{ GLOBALS.so_version }}
- hostname: so-postgres
- networks:
- sobridge:
- ipv4_address: {{ DOCKERMERGED.containers['so-postgres'].ip }}
- port_bindings:
{% for BINDING in DOCKERMERGED.containers['so-postgres'].port_bindings %}
- {{ BINDING }}
{% endfor %}
- environment:
- POSTGRES_DB=securityonion
# Passwords are delivered via mounted 0600 secret files, not plaintext env vars.
# The upstream postgres image resolves POSTGRES_PASSWORD_FILE; entrypoint.sh and
# init-users.sh resolve SO_POSTGRES_PASS_FILE the same way.
- POSTGRES_PASSWORD_FILE=/run/secrets/postgres_password
- SO_POSTGRES_USER={{ SO_POSTGRES_USER }}
- SO_POSTGRES_PASS_FILE=/run/secrets/so_postgres_pass
{% if DOCKERMERGED.containers['so-postgres'].extra_env %}
{% for XTRAENV in DOCKERMERGED.containers['so-postgres'].extra_env %}
- {{ XTRAENV }}
{% endfor %}
{% endif %}
- binds:
- /opt/so/log/postgres/:/log:rw
- /nsm/postgres:/var/lib/postgresql/data:rw
- /opt/so/conf/postgres/postgresql.conf:/conf/postgresql.conf:ro
- /opt/so/conf/postgres/pg_hba.conf:/conf/pg_hba.conf:ro
- /opt/so/conf/postgres/secrets:/run/secrets:ro
- /opt/so/conf/postgres/init/init-users.sh:/docker-entrypoint-initdb.d/init-users.sh:ro
- /etc/pki/postgres.crt:/conf/postgres.crt:ro
- /etc/pki/postgres.key:/conf/postgres.key:ro
- /etc/pki/tls/certs/intca.crt:/conf/ca.crt:ro
{% if DOCKERMERGED.containers['so-postgres'].custom_bind_mounts %}
{% for BIND in DOCKERMERGED.containers['so-postgres'].custom_bind_mounts %}
- {{ BIND }}
{% endfor %}
{% endif %}
{% if DOCKERMERGED.containers['so-postgres'].extra_hosts %}
- extra_hosts:
{% for XTRAHOST in DOCKERMERGED.containers['so-postgres'].extra_hosts %}
- {{ XTRAHOST }}
{% endfor %}
{% endif %}
{% if DOCKERMERGED.containers['so-postgres'].ulimits %}
- ulimits:
{% for ULIMIT in DOCKERMERGED.containers['so-postgres'].ulimits %}
- {{ ULIMIT.name }}={{ ULIMIT.soft }}:{{ ULIMIT.hard }}
{% endfor %}
{% endif %}
- watch:
- file: postgresconf
- file: postgreshba
- file: postgresinitusers
- file: postgres_super_secret
- file: postgres_app_secret
- x509: postgres_crt
- x509: postgres_key
- require:
- file: postgresconf
- file: postgreshba
- file: postgresinitusers
- file: postgres_super_secret
- file: postgres_app_secret
- x509: postgres_crt
- x509: postgres_key
delete_so-postgres_so-status.disabled:
file.uncomment:
- name: /opt/so/conf/so-status/so-status.conf
- regex: ^so-postgres$
so_postgres_backup:
cron.present:
- name: /usr/sbin/so-postgres-backup > /dev/null 2>&1
- identifier: so_postgres_backup
- user: root
- minute: '5'
- hour: '0'
- daymonth: '*'
- month: '*'
- dayweek: '*'
{% else %}
{{sls}}_state_not_allowed:
test.fail_without_changes:
- name: {{sls}}_state_not_allowed
{% endif %}
+34
@@ -0,0 +1,34 @@
#!/bin/bash
set -e
# Create or update application user for SOC platform access
# This script runs on first database initialization via docker-entrypoint-initdb.d
# The password is properly escaped to handle special characters
if [ -z "${SO_POSTGRES_PASS:-}" ] && [ -n "${SO_POSTGRES_PASS_FILE:-}" ] && [ -r "$SO_POSTGRES_PASS_FILE" ]; then
SO_POSTGRES_PASS="$(< "$SO_POSTGRES_PASS_FILE")"
fi
psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL
DO \$\$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '${SO_POSTGRES_USER}') THEN
EXECUTE format('CREATE ROLE %I WITH LOGIN PASSWORD %L', '${SO_POSTGRES_USER}', '${SO_POSTGRES_PASS}');
ELSE
EXECUTE format('ALTER ROLE %I WITH PASSWORD %L', '${SO_POSTGRES_USER}', '${SO_POSTGRES_PASS}');
END IF;
END
\$\$;
GRANT ALL PRIVILEGES ON DATABASE "$POSTGRES_DB" TO "$SO_POSTGRES_USER";
-- Lock the SOC database down at the connect layer; PUBLIC gets CONNECT
-- by default, which would let per-minion telegraf roles open sessions
-- here. They hold no schema or table grants, so reads would fail anyway, but
-- revoking CONNECT closes that gap entirely.
REVOKE CONNECT ON DATABASE "$POSTGRES_DB" FROM PUBLIC;
GRANT CONNECT ON DATABASE "$POSTGRES_DB" TO "$SO_POSTGRES_USER";
EOSQL
# Bootstrap the Telegraf metrics database. Per-minion roles + schemas are
# reconciled on every state.apply by postgres/telegraf_users.sls; this block
# only ensures the shared database exists on first initialization.
if ! psql -U "$POSTGRES_USER" -tAc "SELECT 1 FROM pg_database WHERE datname='so_telegraf'" | grep -q 1; then
psql -v ON_ERROR_STOP=1 -U "$POSTGRES_USER" -c "CREATE DATABASE so_telegraf"
fi
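The effect of the REVOKE is easy to confirm: a per-minion telegraf role (name illustrative) authenticates over the local socket but is refused at the connect-privilege check:
docker exec -it so-postgres psql -U so_telegraf_sensor01 -d securityonion -c 'SELECT 1'
psql: error: FATAL:  permission denied for database "securityonion"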
+16
@@ -0,0 +1,16 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
#
# Managed by Salt — do not edit by hand.
# Client authentication config: only local (Unix socket) connections and TLS-wrapped TCP
# connections are accepted. Plain-text `host ...` lines are intentionally omitted so a
# misconfigured client with sslmode=disable cannot negotiate a cleartext session.
# Local connections (Unix socket, container-internal) use peer/trust.
local all all trust
# TCP connections MUST use TLS (hostssl) and authenticate with SCRAM.
hostssl all all 0.0.0.0/0 scram-sha-256
hostssl all all ::/0 scram-sha-256
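A minimal sketch of the policy from a client's point of view (hostname and role are illustrative; run from any host with a psql client that can reach the manager):
psql "host=<manager> dbname=so_telegraf user=so_telegraf_sensor01 sslmode=require"   # accepted, prompts for the SCRAM password
psql "host=<manager> dbname=so_telegraf user=so_telegraf_sensor01 sslmode=disable"   # refused: no pg_hba.conf entry ... SSL off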
@@ -0,0 +1,8 @@
{# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
https://securityonion.net/license; you may not use this file except in compliance with the
Elastic License 2.0. #}
{% for key, value in PGMERGED.config.items() %}
{{ key }} = '{{ value | string | replace("'", "''") }}'
{% endfor %}
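With the defaults shipped above, the rendered /opt/so/conf/postgres/postgresql.conf is simply one quoted key = 'value' line per setting, for example (excerpt):
listen_addresses = '*'
port = '5432'
max_connections = '100'
shared_buffers = '256MB'
ssl = 'on'
shared_preload_libraries = 'pg_cron'
cron.database_name = 'so_telegraf'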
+13
@@ -0,0 +1,13 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'postgres/map.jinja' import PGMERGED %}
include:
{% if PGMERGED.enabled %}
- postgres.enabled
{% else %}
- postgres.disabled
{% endif %}
+7
@@ -0,0 +1,7 @@
{# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
https://securityonion.net/license; you may not use this file except in compliance with the
Elastic License 2.0. #}
{% import_yaml 'postgres/defaults.yaml' as PGDEFAULTS %}
{% set PGMERGED = salt['pillar.get']('postgres', PGDEFAULTS.postgres, merge=True) %}
+89
@@ -0,0 +1,89 @@
postgres:
enabled:
description: Whether the PostgreSQL database container is enabled on this grid. Backs the assistant store and the Telegraf metrics database.
forcedType: bool
readonly: True
helpLink: influxdb
telegraf:
retention_days:
description: Number of days of Telegraf metrics to keep in the so_telegraf database. Older partitions are dropped hourly by pg_partman.
forcedType: int
helpLink: postgres
config:
max_connections:
description: Maximum number of concurrent PostgreSQL connections.
forcedType: int
global: True
helpLink: postgres
shared_buffers:
description: Amount of memory PostgreSQL uses for shared buffers (e.g. 256MB, 1GB). Raising this improves read cache hit rate at the cost of system RAM.
global: True
helpLink: postgres
log_min_messages:
description: Minimum severity of server messages written to the PostgreSQL log.
options:
- debug1
- info
- notice
- warning
- error
- log
- fatal
global: True
helpLink: postgres
listen_addresses:
description: Interfaces PostgreSQL listens on. Must remain '*' so clients on the docker bridge network can connect.
global: True
advanced: True
helpLink: postgres
port:
description: TCP port PostgreSQL listens on inside the container. Firewall rules and container port mapping assume 5432.
forcedType: int
global: True
advanced: True
helpLink: postgres
ssl:
description: Whether PostgreSQL accepts TLS connections. Must remain 'on' — pg_hba.conf requires hostssl for TCP.
global: True
advanced: True
helpLink: postgres
ssl_cert_file:
description: Path (inside the container) to the TLS server certificate. Salt-managed.
global: True
advanced: True
helpLink: postgres
ssl_key_file:
description: Path (inside the container) to the TLS server private key. Salt-managed.
global: True
advanced: True
helpLink: postgres
ssl_ca_file:
description: Path (inside the container) to the CA bundle PostgreSQL uses to verify client certificates. Salt-managed.
global: True
advanced: True
helpLink: postgres
hba_file:
description: Path (inside the container) to the pg_hba.conf authentication file. Salt-managed — edit salt/postgres/files/pg_hba.conf.
global: True
advanced: True
helpLink: postgres
log_destination:
description: Where PostgreSQL writes its server log. 'stderr' routes to the container log stream.
global: True
advanced: True
helpLink: postgres
logging_collector:
description: Whether to run a separate logging collector process. Disabled because the docker log stream already captures stderr.
global: True
advanced: True
helpLink: postgres
shared_preload_libraries:
description: Comma-separated list of extensions loaded at server start. Required for pg_cron which drives pg_partman maintenance — do not remove.
global: True
advanced: True
helpLink: postgres
cron.database_name:
description: Database pg_cron schedules jobs in. Must be so_telegraf so partman maintenance runs in the right database context.
global: True
advanced: True
helpLink: postgres
+21
@@ -0,0 +1,21 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls.split('.')[0] in allowed_states %}
append_so-postgres_so-status.conf:
file.append:
- name: /opt/so/conf/so-status/so-status.conf
- text: so-postgres
- unless: grep -q so-postgres /opt/so/conf/so-status/so-status.conf
{% else %}
{{sls}}_state_not_allowed:
test.fail_without_changes:
- name: {{sls}}_state_not_allowed
{% endif %}
+55
@@ -0,0 +1,55 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls.split('.')[0] in allowed_states %}
{% from 'vars/globals.map.jinja' import GLOBALS %}
{% from 'ca/map.jinja' import CA %}
postgres_key:
x509.private_key_managed:
- name: /etc/pki/postgres.key
- keysize: 4096
- backup: True
- new: True
{% if salt['file.file_exists']('/etc/pki/postgres.key') -%}
- prereq:
- x509: /etc/pki/postgres.crt
{%- endif %}
- retry:
attempts: 5
interval: 30
postgres_crt:
x509.certificate_managed:
- name: /etc/pki/postgres.crt
- ca_server: {{ CA.server }}
- subjectAltName: DNS:{{ GLOBALS.hostname }}, IP:{{ GLOBALS.node_ip }}
- signing_policy: postgres
- private_key: /etc/pki/postgres.key
- CN: {{ GLOBALS.hostname }}
- days_remaining: 7
- days_valid: 820
- backup: True
- timeout: 30
- retry:
attempts: 5
interval: 30
postgresKeyperms:
file.managed:
- replace: False
- name: /etc/pki/postgres.key
- mode: 400
- user: 939
- group: 939
{% else %}
{{sls}}_state_not_allowed:
test.fail_without_changes:
- name: {{sls}}_state_not_allowed
{% endif %}
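A quick way to inspect the issued server certificate and its SAN entries (the -ext flag assumes OpenSSL 1.1.1 or newer):
openssl x509 -in /etc/pki/postgres.crt -noout -subject -ext subjectAltName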
+157
@@ -0,0 +1,157 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls.split('.')[0] in allowed_states %}
{% from 'vars/globals.map.jinja' import GLOBALS %}
{% from 'telegraf/map.jinja' import TELEGRAFMERGED %}
{# postgres_wait_ready below requires `docker_container: so-postgres`, which is
declared in postgres.enabled. Include it here so state.apply postgres.telegraf_users
on its own (e.g. from orch.deploy_newnode) still has that ID in scope. Salt
de-duplicates the circular include. #}
include:
- postgres.enabled
{% set TG_OUT = TELEGRAFMERGED.output | upper %}
{% if TG_OUT in ['POSTGRES', 'BOTH'] %}
# docker_container.running returns as soon as the container starts, but on
# first-init docker-entrypoint.sh starts a temporary postgres with
# `listen_addresses=''` to run /docker-entrypoint-initdb.d scripts, then
# shuts it down before exec'ing the real CMD. A default pg_isready check
# (Unix socket) passes during that ephemeral phase and races the shutdown
# with "the database system is shutting down". Checking TCP readiness on
# 127.0.0.1 only succeeds after the final postgres binds the port.
postgres_wait_ready:
cmd.run:
- name: |
for i in $(seq 1 60); do
if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then
exit 0
fi
sleep 2
done
echo "so-postgres did not accept TCP connections within 120s" >&2
exit 1
- require:
- docker_container: so-postgres
# Ensure the shared Telegraf database exists. init-users.sh only runs on a
# fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume
# would otherwise never get so_telegraf.
postgres_create_telegraf_db:
cmd.run:
- name: |
if ! docker exec so-postgres psql -U postgres -tAc "SELECT 1 FROM pg_database WHERE datname='so_telegraf'" | grep -q 1; then
docker exec so-postgres psql -v ON_ERROR_STOP=1 -U postgres -c "CREATE DATABASE so_telegraf"
fi
- require:
- cmd: postgres_wait_ready
# Provision the shared group role and schema once. Every per-minion role is a
# member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf
# (via options='-c role=so_telegraf' in the connection string) so tables created
# on first write are owned by the group role and every member can INSERT/SELECT.
postgres_telegraf_group_role:
cmd.run:
- name: |
docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'so_telegraf') THEN
CREATE ROLE so_telegraf NOLOGIN;
END IF;
END
$$;
GRANT CONNECT ON DATABASE so_telegraf TO so_telegraf;
CREATE SCHEMA IF NOT EXISTS telegraf AUTHORIZATION so_telegraf;
GRANT USAGE, CREATE ON SCHEMA telegraf TO so_telegraf;
CREATE SCHEMA IF NOT EXISTS partman;
CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman;
CREATE EXTENSION IF NOT EXISTS pg_cron;
-- Telegraf (running as so_telegraf) calls partman.create_parent()
-- on first write of each metric, which needs USAGE on the partman
-- schema, EXECUTE on its functions/procedures, and write access to
-- partman.part_config so it can register new partitioned parents.
GRANT USAGE, CREATE ON SCHEMA partman TO so_telegraf;
GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA partman TO so_telegraf;
GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA partman TO so_telegraf;
GRANT EXECUTE ON ALL PROCEDURES IN SCHEMA partman TO so_telegraf;
-- partman creates per-parent template tables (partman.template_*) at
-- runtime; default privileges extend DML/sequence access to them.
ALTER DEFAULT PRIVILEGES IN SCHEMA partman
GRANT SELECT, INSERT, UPDATE, DELETE ON TABLES TO so_telegraf;
ALTER DEFAULT PRIVILEGES IN SCHEMA partman
GRANT USAGE, SELECT, UPDATE ON SEQUENCES TO so_telegraf;
-- Hourly partman maintenance. cron.schedule is idempotent by jobname.
SELECT cron.schedule(
'telegraf-partman-maintenance',
'17 * * * *',
'CALL partman.run_maintenance_proc()'
);
EOSQL
- require:
- cmd: postgres_create_telegraf_db
{% set creds = salt['pillar.get']('telegraf:postgres_creds', {}) %}
{% for mid, entry in creds.items() %}
{% if entry.get('user') and entry.get('pass') %}
{% set u = entry.user %}
{% set p = entry.pass | replace("'", "''") %}
postgres_telegraf_role_{{ u }}:
cmd.run:
- name: |
docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{{ u }}') THEN
EXECUTE format('CREATE ROLE %I WITH LOGIN PASSWORD %L', '{{ u }}', '{{ p }}');
ELSE
EXECUTE format('ALTER ROLE %I WITH PASSWORD %L', '{{ u }}', '{{ p }}');
END IF;
END
$$;
GRANT CONNECT ON DATABASE so_telegraf TO "{{ u }}";
GRANT so_telegraf TO "{{ u }}";
EOSQL
- require:
- cmd: postgres_telegraf_group_role
{% endif %}
{% endfor %}
# Reconcile partman retention from pillar. Runs after role/schema setup so
# any partitioned parents Telegraf has already created get their retention
# refreshed whenever postgres.telegraf.retention_days changes.
{% set retention = salt['pillar.get']('postgres:telegraf:retention_days', 14) | int %}
postgres_telegraf_retention_reconcile:
  cmd.run:
    - name: |
        docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
        DO $$
        BEGIN
          IF EXISTS (SELECT 1 FROM pg_catalog.pg_extension WHERE extname = 'pg_partman') THEN
            UPDATE partman.part_config
              SET retention = '{{ retention }} days',
                  retention_keep_table = false
              WHERE parent_table LIKE 'telegraf.%';
          END IF;
        END
        $$;
        EOSQL
    - require:
      - cmd: postgres_telegraf_group_role
{% endif %}
{% else %}
{{sls}}_state_not_allowed:
  test.fail_without_changes:
    - name: {{sls}}_state_not_allowed
{% endif %}
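A quick way to sanity-check the provisioning above from the manager is to query the catalogs directly. An illustrative set of checks, assuming so-postgres is running and the states have applied at least once:

# Group role exists (NOLOGIN) and per-minion roles are members of it.
docker exec so-postgres psql -U postgres -d so_telegraf -c "\du so_telegraf"
docker exec so-postgres psql -U postgres -d so_telegraf -c "SELECT member::regrole FROM pg_auth_members WHERE roleid = 'so_telegraf'::regrole;"
# Partitioned parents registered by Telegraf, with the retention the reconcile state set.
docker exec so-postgres psql -U postgres -d so_telegraf -c "SELECT parent_table, retention FROM partman.part_config WHERE parent_table LIKE 'telegraf.%';"
# Hourly maintenance job scheduled by cron.schedule above.
docker exec so-postgres psql -U postgres -d so_telegraf -c "SELECT jobname, schedule FROM cron.job WHERE jobname = 'telegraf-partman-maintenance';"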
@@ -0,0 +1,39 @@
#!/bin/bash
#
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
. /usr/sbin/so-common
# Backups contain role password hashes and full chat data; keep them 0600.
umask 0077
TODAY=$(date '+%Y_%m_%d')
BACKUPDIR=/nsm/backup
BACKUPFILE="$BACKUPDIR/so-postgres-backup-$TODAY.sql.gz"
MAXBACKUPS=7
mkdir -p $BACKUPDIR
# Skip if already backed up today
if [ -f "$BACKUPFILE" ]; then
exit 0
fi
# Skip if container isn't running
if ! docker ps --format '{{.Names}}' | grep -q '^so-postgres$'; then
exit 0
fi
# Dump all databases and roles, compress
docker exec so-postgres pg_dumpall -U postgres | gzip > "$BACKUPFILE"
# Retention cleanup
NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l)
while [ "$NUMBACKUPS" -gt "$MAXBACKUPS" ]; do
OLDEST=$(find $BACKUPDIR -type f -name "so-postgres-backup*" -printf '%T+ %p\n' | sort | head -n 1 | awk -F" " '{print $2}')
rm -f "$OLDEST"
NUMBACKUPS=$(find $BACKUPDIR -type f -name "so-postgres-backup*" | wc -l)
done
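Because the dump is a plain pg_dumpall SQL stream, restoring it is the reverse pipeline. A minimal sketch, assuming so-postgres is running and recreating the dumped roles and databases on it is acceptable:

# Illustrative restore only; pick the backup file you actually want first.
BACKUPFILE=/nsm/backup/so-postgres-backup-2026_05_05.sql.gz   # example name
gunzip -c "$BACKUPFILE" | docker exec -i so-postgres psql -U postgres -d postgres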
@@ -0,0 +1,80 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
. /usr/sbin/so-common
usage() {
echo "Usage: $0 <operation> [args]"
echo ""
echo "Supported Operations:"
echo " sql Execute a SQL command, requires: <sql>"
echo " sqlfile Execute a SQL file, requires: <path>"
echo " shell Open an interactive psql shell"
echo " dblist List databases"
echo " userlist List database roles"
echo ""
exit 1
}
if [ $# -lt 1 ]; then
usage
fi
# Check for prerequisites
if [ "$(id -u)" -ne 0 ]; then
echo "This script must be run using sudo!"
exit 1
fi
COMMAND=$(basename $0)
OP=$1
shift
set -eo pipefail
log() {
echo -e "$(date) | $COMMAND | $@" >&2
}
so_psql() {
docker exec so-postgres psql -U postgres -d securityonion "$@"
}
case "$OP" in
sql)
[ $# -lt 1 ] && usage
so_psql -c "$1"
;;
sqlfile)
[ $# -ne 1 ] && usage
if [ ! -f "$1" ]; then
log "File not found: $1"
exit 1
fi
docker cp "$1" so-postgres:/tmp/sqlfile.sql
docker exec so-postgres psql -U postgres -d securityonion -f /tmp/sqlfile.sql
docker exec so-postgres rm -f /tmp/sqlfile.sql
;;
shell)
docker exec -it so-postgres psql -U postgres -d securityonion
;;
dblist)
so_psql -c "\l"
;;
userlist)
so_psql -c "\du"
;;
*)
usage
;;
esac
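Typical invocations of this wrapper look like the following; the installed command name is an assumption here, since this view does not show the file path:

# Hypothetical examples, assuming the script ships as /usr/sbin/so-postgres-query.
sudo so-postgres-query dblist
sudo so-postgres-query userlist
sudo so-postgres-query sql "SELECT count(*) FROM pg_stat_activity;"
sudo so-postgres-query sqlfile /tmp/patch.sql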
@@ -0,0 +1,10 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
. /usr/sbin/so-common
/usr/sbin/so-restart postgres $1
@@ -0,0 +1,10 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
. /usr/sbin/so-common
/usr/sbin/so-start postgres $1
@@ -0,0 +1,10 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
. /usr/sbin/so-common
/usr/sbin/so-stop postgres $1
@@ -0,0 +1,157 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
# Point-in-time host metrics from the Telegraf Postgres backend.
# Sanity-check tool for verifying metrics are landing before the grid
# dashboards consume them.
#
# Assumes Telegraf's postgresql output is configured with
# tags_as_foreign_keys = true, tags_as_jsonb = true, fields_as_jsonb = true,
# so metric tables are (time, tag_id, fields jsonb) and tag tables are
# (tag_id, tags jsonb).
. /usr/sbin/so-common
usage() {
cat <<EOF
Usage: $0 [host]
Shows the most recent CPU, memory, disk, and load metrics for each host
from the so_telegraf Postgres database. Without an argument, reports on
every host that has data. With a host, limits output to that one.
Requires: sudo, so-postgres running, telegraf.output set to
POSTGRES or BOTH.
EOF
exit 1
}
if [ "$(id -u)" -ne 0 ]; then
echo "This script must be run using sudo!"
exit 1
fi
case "${1:-}" in
-h|--help) usage ;;
esac
FILTER_HOST="${1:-}"
SCHEMA="telegraf"
# Host values are interpolated into SQL below. Hostnames are limited to
# [A-Za-z0-9._-]; any other character in a tag value or CLI argument is
# rejected, preventing SQL injection through a stored tag (or CLI input)
# written by a compromised Telegraf writer.
HOST_RE='^[A-Za-z0-9._-]+$'
if [ -n "$FILTER_HOST" ] && ! [[ "$FILTER_HOST" =~ $HOST_RE ]]; then
echo "Invalid host filter: $FILTER_HOST" >&2
exit 1
fi
so_psql() {
docker exec so-postgres psql -U postgres -d so_telegraf -At -F $'\t' "$@"
}
if ! docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d\| -f1 | grep -qw so_telegraf; then
echo "Database so_telegraf not found. Is telegraf.output set to POSTGRES or BOTH?"
exit 2
fi
table_exists() {
local table="$1"
[ -n "$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${SCHEMA}' AND table_name='${table}' LIMIT 1;")" ]
}
# Discover hosts from cpu_tag (every minion reports cpu).
if ! table_exists "cpu_tag"; then
echo "${SCHEMA}.cpu_tag not found. Has Telegraf written any rows yet?"
exit 0
fi
HOSTS=$(so_psql -c "
SELECT DISTINCT tags->>'host'
FROM \"${SCHEMA}\".cpu_tag
WHERE tags ? 'host'
ORDER BY 1;")
if [ -z "$HOSTS" ]; then
echo "No hosts found in ${SCHEMA}. Is Telegraf configured to write to Postgres?"
exit 0
fi
print_metric() {
so_psql -c "$1"
}
for host in $HOSTS; do
if ! [[ "$host" =~ $HOST_RE ]]; then
echo "Skipping host with invalid characters in tag value: $host" >&2
continue
fi
if [ -n "$FILTER_HOST" ] && [ "$host" != "$FILTER_HOST" ]; then
continue
fi
echo "===================================================================="
echo " Host: $host"
echo "===================================================================="
if table_exists "cpu"; then
print_metric "
SELECT 'cpu ' AS metric,
to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
round((100 - (c.fields->>'usage_idle')::numeric), 1) || '% used'
FROM \"${SCHEMA}\".cpu c
JOIN \"${SCHEMA}\".cpu_tag t USING (tag_id)
WHERE t.tags->>'host' = '${host}' AND t.tags->>'cpu' = 'cpu-total'
ORDER BY c.time DESC LIMIT 1;"
fi
if table_exists "mem"; then
print_metric "
SELECT 'memory ' AS metric,
to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
round((m.fields->>'used_percent')::numeric, 1) || '% used (' ||
pg_size_pretty((m.fields->>'used')::bigint) || ' of ' ||
pg_size_pretty((m.fields->>'total')::bigint) || ')'
FROM \"${SCHEMA}\".mem m
JOIN \"${SCHEMA}\".mem_tag t USING (tag_id)
WHERE t.tags->>'host' = '${host}'
ORDER BY m.time DESC LIMIT 1;"
fi
if table_exists "disk"; then
print_metric "
SELECT 'disk ' || rpad(t.tags->>'path', 12) AS metric,
to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
round((d.fields->>'used_percent')::numeric, 1) || '% used (' ||
pg_size_pretty((d.fields->>'used')::bigint) || ' of ' ||
pg_size_pretty((d.fields->>'total')::bigint) || ')'
FROM \"${SCHEMA}\".disk d
JOIN \"${SCHEMA}\".disk_tag t USING (tag_id)
WHERE t.tags->>'host' = '${host}'
AND d.time = (SELECT max(d2.time)
FROM \"${SCHEMA}\".disk d2
JOIN \"${SCHEMA}\".disk_tag t2 USING (tag_id)
WHERE t2.tags->>'host' = '${host}')
ORDER BY t.tags->>'path';"
fi
if table_exists "system"; then
print_metric "
SELECT 'load ' AS metric,
to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
(s.fields->>'load1') || ' / ' ||
(s.fields->>'load5') || ' / ' ||
(s.fields->>'load15') || ' (1/5/15m)'
FROM \"${SCHEMA}\".system s
JOIN \"${SCHEMA}\".system_tag t USING (tag_id)
WHERE t.tags->>'host' = '${host}'
ORDER BY s.time DESC LIMIT 1;"
fi
echo ""
done
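For anything this report does not print, the same tag-table join works for ad-hoc queries. An illustrative one-off, with the hostname as a placeholder:

# Last five memory samples for one host, newest first.
docker exec so-postgres psql -U postgres -d so_telegraf -c "
SELECT m.time, m.fields->>'used_percent' AS used_pct
FROM telegraf.mem m
JOIN telegraf.mem_tag t USING (tag_id)
WHERE t.tags->>'host' = 'example-manager'
ORDER BY m.time DESC LIMIT 5;"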
@@ -6,39 +6,74 @@
# Elastic License 2.0.
import logging
from subprocess import call
import yaml
import os
import re
import shlex
import subprocess
log = logging.getLogger(__name__)
SO_MINION = '/usr/sbin/so-minion'
_NODETYPE_RE = re.compile(r'^[A-Z][A-Z0-9_]{0,31}$')
_MINIONID_RE = re.compile(r'^[A-Za-z0-9._-]{1,253}$')
_HOSTPART_RE = re.compile(r'^[A-Za-z0-9._-]{1,253}$')
_IPV4_RE = re.compile(
r'^(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}'
r'(?:25[0-5]|2[0-4]\d|[01]?\d?\d)$'
)
_HEAP_RE = re.compile(r'^\d{1,6}[kKmMgG]?$')
def _check(name, value, pattern):
s = str(value)
if not pattern.match(s):
raise ValueError("sominion_setup_reactor: refusing unsafe %s=%r" % (name, value))
return s
def run():
log.info('sominion_setup_reactor: Running')
minionid = data['id']
DATA = data['data']
hv_name = DATA['HYPERVISOR_HOST']
log.info('sominion_setup_reactor: DATA: %s' % DATA)
# Build the base command
cmd = "NODETYPE=" + DATA['NODETYPE'] + " /usr/sbin/so-minion -o=addVM -m=" + minionid + " -n=" + DATA['MNIC'] + " -i=" + DATA['MAINIP'] + " -c=" + str(DATA['CPUCORES']) + " -d='" + DATA['NODE_DESCRIPTION'] + "'"
# Add optional arguments only if they exist in DATA
nodetype = _check('NODETYPE', DATA['NODETYPE'], _NODETYPE_RE)
argv = [
SO_MINION,
'-o=addVM',
'-m=' + _check('minionid', minionid, _MINIONID_RE),
'-n=' + _check('MNIC', DATA['MNIC'], _HOSTPART_RE),
'-i=' + _check('MAINIP', DATA['MAINIP'], _IPV4_RE),
'-c=' + str(int(DATA['CPUCORES'])),
'-d=' + str(DATA['NODE_DESCRIPTION']),
]
if 'CORECOUNT' in DATA:
cmd += " -C=" + str(DATA['CORECOUNT'])
argv.append('-C=' + str(int(DATA['CORECOUNT'])))
if 'INTERFACE' in DATA:
cmd += " -a=" + DATA['INTERFACE']
argv.append('-a=' + _check('INTERFACE', DATA['INTERFACE'], _HOSTPART_RE))
if 'ES_HEAP_SIZE' in DATA:
cmd += " -e=" + DATA['ES_HEAP_SIZE']
argv.append('-e=' + _check('ES_HEAP_SIZE', DATA['ES_HEAP_SIZE'], _HEAP_RE))
if 'LS_HEAP_SIZE' in DATA:
cmd += " -l=" + DATA['LS_HEAP_SIZE']
argv.append('-l=' + _check('LS_HEAP_SIZE', DATA['LS_HEAP_SIZE'], _HEAP_RE))
if 'LSHOSTNAME' in DATA:
cmd += " -L=" + DATA['LSHOSTNAME']
log.info('sominion_setup_reactor: Command: %s' % cmd)
rc = call(cmd, shell=True)
argv.append('-L=' + _check('LSHOSTNAME', DATA['LSHOSTNAME'], _HOSTPART_RE))
env = os.environ.copy()
env['NODETYPE'] = nodetype
log.info(
'sominion_setup_reactor: argv: %s (NODETYPE=%s)',
' '.join(shlex.quote(a) for a in argv),
shlex.quote(nodetype),
)
rc = subprocess.call(argv, shell=False, env=env)
log.info('sominion_setup_reactor: rc: %s' % rc)
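For reference, the argv the reactor assembles corresponds to a command of roughly this shape; every value below is a placeholder, and only the flags visible in the reactor are assumed:

# Hypothetical equivalent of the reactor's subprocess.call(argv, env=...).
NODETYPE=SEARCHNODE /usr/sbin/so-minion -o=addVM -m=examplevm_searchnode \
  -n=eth0 -i=10.10.10.50 -c=8 -d='Example VM' -C=4 -e=8g -l=4g -L=examplevm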
@@ -27,6 +27,7 @@ sool9_{{host}}:
log_file: /opt/so/log/salt/minion
grains:
hypervisor_host: {{host ~ "_" ~ role}}
sosmodel: HVGUEST
preflight_cmds:
- |
{%- set hostnames = [MANAGERHOSTNAME] %}
@@ -3,6 +3,7 @@ soc:
description: Enables or disables SOC. WARNING - Disabling this setting is unsupported and will cause the grid to malfunction. Re-enabling this setting is a manual effort via SSH.
forcedType: bool
advanced: True
readonly: True
telemetryEnabled:
title: SOC Telemetry
description: When this setting is enabled and the grid is not in airgap mode, SOC will provide feature usage data to the Security Onion development team via Google Analytics. This data helps Security Onion developers determine which product features are being used and can also provide insight into improving the user interface. When changing this setting, wait for the grid to fully synchronize and then perform a hard browser refresh on SOC, to force the browser cache to update and reflect the new setting.
@@ -890,12 +891,16 @@ soc:
suricata:
description: The template used when creating a new Suricata detection. [publicId] will be replaced with an unused Public Id.
multiline: True
forcedType: string
strelka:
description: The template used when creating a new Strelka detection.
multiline: True
forcedType: string
elastalert:
description: The template used when creating a new ElastAlert detection. [publicId] will be replaced with an unused Public Id.
multiline: True
forcedType: string
grid:
maxUploadSize:
description: The maximum number of bytes for an uploaded PCAP import file.
@@ -261,7 +261,7 @@ strelka:
priority: 5
options:
limit: 1000
'ScanLnk':
'ScanLNK':
- positive:
flavors:
- 'lnk_file'
@@ -99,7 +99,7 @@ strelka:
'ScanJpeg': *scannerOptions
'ScanJson': *scannerOptions
'ScanLibarchive': *scannerOptions
'ScanLnk': *scannerOptions
'ScanLNK': *scannerOptions
'ScanLsb': *scannerOptions
'ScanLzma': *scannerOptions
'ScanMacho': *scannerOptions
@@ -1,5 +1,6 @@
telegraf:
enabled: False
output: BOTH
config:
interval: '30s'
metric_batch_size: 1000
@@ -8,6 +8,14 @@
{%- set ZEEK_ENABLED = salt['pillar.get']('zeek:enabled', True) %}
{%- set MDENGINE = GLOBALS.md_engine %}
{%- set LOGSTASH_ENABLED = LOGSTASH_MERGED.enabled %}
{%- set TG_OUT = TELEGRAFMERGED.output | upper %}
{%- set PG_HOST = GLOBALS.manager_ip %}
{#- Per-minion telegraf creds live in the grid-wide telegraf/creds.sls pillar,
written by /usr/sbin/so-telegraf-cred on the manager. Each minion looks up
its own entry by grains.id. #}
{%- set PG_ENTRY = salt['pillar.get']('telegraf:postgres_creds:' ~ grains.id, {}) %}
{%- set PG_USER = PG_ENTRY.get('user', '') %}
{%- set PG_PASS = PG_ENTRY.get('pass', '') %}
# Global tags can be specified here in key="value" format.
[global_tags]
role = "{{ GLOBALS.role.split('-') | last }}"
@@ -72,6 +80,7 @@
# OUTPUT PLUGINS #
###############################################################################
{%- if TG_OUT in ['INFLUXDB', 'BOTH'] %}
# Configuration for sending metrics to InfluxDB
[[outputs.influxdb_v2]]
urls = ["https://{{ INFLUXDBHOST }}:8086"]
@@ -85,6 +94,41 @@
tls_key = "/etc/telegraf/telegraf.key"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
{%- endif %}
{%- if TG_OUT in ['POSTGRES', 'BOTH'] and PG_USER and PG_PASS %}
# Configuration for sending metrics to PostgreSQL.
# options='-c role=so_telegraf' makes every connection SET ROLE to the shared
# group role so tables created on first write are owned by so_telegraf, and
# all per-minion members can INSERT/SELECT them via role inheritance.
# fields_as_jsonb/tags_as_jsonb keep metric tables at a fixed column count so
# high-cardinality inputs (docker, procstat, kafka) don't blow past the
# Postgres 1600-column-per-table limit.
[[outputs.postgresql]]
connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt options='-c role=so_telegraf'"
schema = "telegraf"
tags_as_foreign_keys = true
tags_as_jsonb = true
fields_as_jsonb = true
# Every metric table is a daily time-range partitioned parent managed by
# pg_partman. Retention drops old partitions instead of row-by-row DELETEs.
{% raw %}
# pg_partman 5.x requires the control column (time) to be NOT NULL, so
# ALTER it before create_parent(). And create_parent() splits
# p_parent_table on '.' to look up raw identifiers, so the literal must
# be 'schema.name' (not '"schema"."name"' as .table|quoteLiteral emits).
# IF NOT EXISTS keeps the three templates idempotent so a Telegraf
# restart after any DB-side surgery re-runs them safely.
create_templates = [
'''CREATE TABLE IF NOT EXISTS {{ .table }} ({{ .columns }}) PARTITION BY RANGE ("time")''',
'''ALTER TABLE {{ .table }} ALTER COLUMN "time" SET NOT NULL''',
'''SELECT partman.create_parent(p_parent_table := {{ printf "%s.%s" .table.Schema .table.Name | quoteLiteral }}, p_control := 'time', p_type := 'range', p_interval := '1 day', p_premake := 3) WHERE NOT EXISTS (SELECT 1 FROM partman.part_config WHERE parent_table = {{ printf "%s.%s" .table.Schema .table.Name | quoteLiteral }})'''
]
tag_table_create_templates = [
'''CREATE TABLE IF NOT EXISTS {{ .table }} ({{ .columns }}, PRIMARY KEY (tag_id))'''
]
{% endraw %}
{%- endif %}
###############################################################################
# PROCESSOR PLUGINS #
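Once Telegraf has written its first batch through this output, the partitioned layout can be confirmed directly; an illustrative check using the cpu metric (any metric table works the same way):

# Parent table is range-partitioned on "time"; partman tracks it in part_config.
docker exec so-postgres psql -U postgres -d so_telegraf -c '\d+ telegraf.cpu'
docker exec so-postgres psql -U postgres -d so_telegraf -c "SELECT parent_table, partition_interval, premake FROM partman.part_config WHERE parent_table = 'telegraf.cpu';"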
@@ -4,6 +4,15 @@ telegraf:
forcedType: bool
advanced: True
helpLink: influxdb
output:
description: Selects the backend(s) Telegraf writes metrics to. INFLUXDB keeps the current behavior; POSTGRES writes to the grid's Postgres instance; BOTH dual-writes for migration validation.
options:
- INFLUXDB
- POSTGRES
- BOTH
global: True
advanced: True
helpLink: influxdb
config:
interval:
description: Data collection interval.
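To confirm which backend a given minion will render into its telegraf.conf, the merged pillar value can be checked on that minion; a quick sketch:

# Prints INFLUXDB (the default), POSTGRES, or BOTH after pillar merge.
sudo salt-call pillar.get telegraf:output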
@@ -68,6 +68,7 @@ base:
- backup.config_backup
- nginx
- influxdb
- postgres
- soc
- kratos
- hydra
@@ -95,6 +96,7 @@ base:
- backup.config_backup
- nginx
- influxdb
- postgres
- soc
- kratos
- hydra
@@ -123,6 +125,7 @@ base:
- registry
- nginx
- influxdb
- postgres
- strelka.manager
- soc
- kratos
@@ -153,6 +156,7 @@ base:
- registry
- nginx
- influxdb
- postgres
- strelka.manager
- soc
- kratos
@@ -181,6 +185,7 @@ base:
- manager
- nginx
- influxdb
- postgres
- strelka.manager
- soc
- kratos
@@ -1,4 +1,5 @@
{% from 'vars/elasticsearch.map.jinja' import ELASTICSEARCH_GLOBALS %}
{% from 'vars/postgres.map.jinja' import POSTGRES_GLOBALS %}
{% from 'vars/sensor.map.jinja' import SENSOR_GLOBALS %}
{% set ROLE_GLOBALS = {} %}
@@ -6,6 +7,7 @@
{% set EVAL_GLOBALS =
[
ELASTICSEARCH_GLOBALS,
POSTGRES_GLOBALS,
SENSOR_GLOBALS
]
%}
@@ -1,4 +1,5 @@
{% from 'vars/elasticsearch.map.jinja' import ELASTICSEARCH_GLOBALS %}
{% from 'vars/postgres.map.jinja' import POSTGRES_GLOBALS %}
{% from 'vars/sensor.map.jinja' import SENSOR_GLOBALS %}
{% set ROLE_GLOBALS = {} %}
@@ -6,6 +7,7 @@
{% set IMPORT_GLOBALS =
[
ELASTICSEARCH_GLOBALS,
POSTGRES_GLOBALS,
SENSOR_GLOBALS
]
%}
@@ -1,12 +1,14 @@
{% from 'vars/elasticsearch.map.jinja' import ELASTICSEARCH_GLOBALS %}
{% from 'vars/logstash.map.jinja' import LOGSTASH_GLOBALS %}
{% from 'vars/postgres.map.jinja' import POSTGRES_GLOBALS %}
{% set ROLE_GLOBALS = {} %}
{% set MANAGER_GLOBALS =
[
ELASTICSEARCH_GLOBALS,
LOGSTASH_GLOBALS
LOGSTASH_GLOBALS,
POSTGRES_GLOBALS
]
%}
@@ -1,12 +1,14 @@
{% from 'vars/elasticsearch.map.jinja' import ELASTICSEARCH_GLOBALS %}
{% from 'vars/logstash.map.jinja' import LOGSTASH_GLOBALS %}
{% from 'vars/postgres.map.jinja' import POSTGRES_GLOBALS %}
{% set ROLE_GLOBALS = {} %}
{% set MANAGERSEARCH_GLOBALS =
[
ELASTICSEARCH_GLOBALS,
LOGSTASH_GLOBALS
LOGSTASH_GLOBALS,
POSTGRES_GLOBALS
]
%}
@@ -0,0 +1,16 @@
{# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
https://securityonion.net/license; you may not use this file except in compliance with the
Elastic License 2.0. #}
{% import 'vars/init.map.jinja' as INIT %}
{%
set POSTGRES_GLOBALS = {
'postgres': {}
}
%}
{% if salt['file.file_exists']('/opt/so/saltstack/local/pillar/postgres/auth.sls') %}
{% do POSTGRES_GLOBALS.postgres.update({'auth': INIT.PILLAR.postgres.auth}) %}
{% endif %}
@@ -1,5 +1,6 @@
{% from 'vars/elasticsearch.map.jinja' import ELASTICSEARCH_GLOBALS %}
{% from 'vars/logstash.map.jinja' import LOGSTASH_GLOBALS %}
{% from 'vars/postgres.map.jinja' import POSTGRES_GLOBALS %}
{% from 'vars/sensor.map.jinja' import SENSOR_GLOBALS %}
{% set ROLE_GLOBALS = {} %}
@@ -8,6 +9,7 @@
[
ELASTICSEARCH_GLOBALS,
LOGSTASH_GLOBALS,
POSTGRES_GLOBALS,
SENSOR_GLOBALS
]
%}
@@ -202,10 +202,10 @@ check_service_status() {
systemctl status $service_name > /dev/null 2>&1
local status=$?
if [ $status -gt 0 ]; then
info " $service_name is not running"
info "$service_name is not running"
return 1;
else
info " $service_name is running"
info "$service_name is running"
return 0;
fi
@@ -745,6 +745,56 @@ configure_network_sensor() {
return $err
}
configure_management_bond() {
local bond_name="bond1"
local bond_mode=${MBOND_MODE:-active-backup}
info "Setting up $bond_name management interface with mode $bond_mode"
if [[ ${#MBNICS[@]} -eq 0 ]]; then
error "[ERROR] No management bond NICs were selected."
fail_setup
fi
nmcli -t -f NAME con show | grep -Fxq "$bond_name"
local found_int=$?
if [[ $found_int != 0 ]]; then
nmcli con add type bond ifname "$bond_name" con-name "$bond_name" mode "$bond_mode" -- \
ipv6.method ignore \
connection.autoconnect yes >> "$setup_log" 2>&1
else
nmcli con mod "$bond_name" \
bond.options "mode=$bond_mode" \
ipv6.method ignore \
connection.autoconnect yes >> "$setup_log" 2>&1
fi
local err=0
for MBNIC in "${MBNICS[@]}"; do
local slave_name="$bond_name-slave-$MBNIC"
nmcli -t -f NAME con show | grep -Fxq "$slave_name"
found_int=$?
if [[ $found_int != 0 ]]; then
nmcli con add type ethernet ifname "$MBNIC" con-name "$slave_name" master "$bond_name" -- \
connection.autoconnect yes >> "$setup_log" 2>&1
else
nmcli con mod "$slave_name" \
connection.master "$bond_name" \
connection.slave-type bond \
connection.autoconnect yes >> "$setup_log" 2>&1
fi
nmcli con up "$slave_name" >> "$setup_log" 2>&1
local ret=$?
[[ $ret -eq 0 ]] || err=$ret
done
return $err
}
configure_hyper_bridge() {
info "Setting up hypervisor bridge"
info "Checking $MNIC ipv4.method is auto or manual"
@@ -821,6 +871,7 @@ create_manager_pillars() {
soc_pillar
idh_pillar
influxdb_pillar
postgres_pillar
logrotate_pillar
patch_pillar
nginx_pillar
@@ -998,6 +1049,11 @@ filter_unused_nics() {
grep_string="$grep_string\|$BONDNIC"
done
fi
if [[ $MBNICS ]]; then
for BONDNIC in "${MBNICS[@]}"; do
grep_string="$grep_string\|$BONDNIC"
done
fi
# Finally, set filtered_nics to any NICs we aren't using (and ignore interfaces that aren't of use)
filtered_nics=$(ip link | awk -F: '$0 !~ "lo|vir|veth|br|docker|wl|^[^0-9]"{print $2}' | grep -vwe "$grep_string" | sed 's/ //g' | sed -r 's/(.*)(\.[0-9]+)@\1/\1\2/g')
@@ -1053,6 +1109,7 @@ generate_passwords(){
HYDRAKEY=$(get_random_value)
HYDRASALT=$(get_random_value)
REDISPASS=$(get_random_value)
POSTGRESPASS=$(get_random_value)
SOCSRVKEY=$(get_random_value 64)
IMPORTPASS=$(get_random_value)
}
@@ -1355,6 +1412,12 @@ influxdb_pillar() {
" token: $INFLUXTOKEN" > $local_salt_dir/pillar/influxdb/token.sls
}
postgres_pillar() {
title "Create the postgres pillar file"
touch $adv_postgres_pillar_file
touch $postgres_pillar_file
}
make_some_dirs() {
mkdir -p /nsm
mkdir -p "$default_salt_dir"
@@ -1364,7 +1427,7 @@ make_some_dirs() {
mkdir -p $local_salt_dir/salt/firewall/portgroups
mkdir -p $local_salt_dir/salt/firewall/ports
for THEDIR in bpf elasticsearch ntp firewall redis backup influxdb strelka sensoroni soc docker zeek suricata nginx telegraf logstash soc manager kratos hydra idh elastalert stig global kafka versionlock hypervisor vm; do
for THEDIR in bpf elasticsearch ntp firewall redis backup influxdb postgres strelka sensoroni soc docker zeek suricata nginx telegraf logstash soc manager kratos hydra idh elastalert stig global kafka versionlock hypervisor vm; do
mkdir -p $local_salt_dir/pillar/$THEDIR
touch $local_salt_dir/pillar/$THEDIR/adv_$THEDIR.sls
touch $local_salt_dir/pillar/$THEDIR/soc_$THEDIR.sls
@@ -1380,7 +1443,7 @@ network_init() {
title "Initializing Network"
disable_ipv6
set_hostname
if [[ ( $is_iso || $is_desktop_iso ) ]]; then
if [[ $is_iso || $is_desktop_iso ]]; then
set_management_interface
fi
}
@@ -1541,13 +1604,8 @@ clear_previous_setup_results() {
reinstall_init() {
info "Putting system in state to run setup again"
if [[ $install_type =~ ^(MANAGER|EVAL|MANAGERSEARCH|MANAGERHYPE|STANDALONE|FLEET|IMPORT)$ ]]; then
local salt_services=( "salt-master" "salt-minion" )
else
local salt_services=( "salt-minion" )
fi
local service_retry_count=20
# Always include both services. check_service_status skips units that aren't present.
local salt_services=( "salt-master" "salt-minion" )
{
# remove all of root's cronjobs
@@ -1563,31 +1621,51 @@ reinstall_init() {
salt-call state.apply ca.remove -linfo --local --file-root=../salt
# Kill any salt processes (safely)
# Stop salt services and force-kill any lingering salt processes (including orphans
# from an earlier reinstall attempt where the unit file is gone but processes survive)
# so dnf remove salt can run cleanly
for service in "${salt_services[@]}"; do
# Stop the service in the background so we can exit after a certain amount of time
if check_service_status "$service"; then
systemctl stop "$service" &
info "Stopping $service via systemctl"
systemctl stop "$service"
fi
local pid=$!
local count=0
while check_service_status "$service"; do
if [[ $count -gt $service_retry_count ]]; then
echo "Could not stop $service after 1 minute, exiting setup."
# Stop the systemctl process trying to kill the service, show user a message, then exit setup
kill -9 $pid
fail_setup
fi
sleep 5
((count++))
done
done
# Unconditionally force-kill any remaining salt binaries — these may be orphaned
# from a prior aborted reinstall (no unit file, so systemctl can't see them).
for salt_bin in salt-master salt-minion salt-call salt-cloud; do
if pgrep -f "/usr/bin/${salt_bin}" > /dev/null 2>&1; then
info "Force-killing lingering $salt_bin processes"
pkill -9 -ef "/usr/bin/${salt_bin}" 2>/dev/null
fi
done
# Catch stray `salt` CLI children from saltutil.kill_all_jobs / state.apply invocations
pkill -9 -ef "/usr/bin/python3 /bin/salt" 2>/dev/null
# Give the kernel a moment to reap the killed processes before dnf removes the binaries
local kill_wait=0
while pgrep -f "/usr/bin/salt-" > /dev/null 2>&1; do
if [[ $kill_wait -gt 10 ]]; then
info "Salt processes still present after SIGKILL + 10s wait; proceeding anyway"
pgrep -af "/usr/bin/salt-" | while read -r line; do info " lingering: $line"; done
break
fi
sleep 1
((kill_wait++))
done
# Clear the 'failed' state SIGKILL left on the units before removing the package
systemctl reset-failed salt-master.service salt-minion.service 2>/dev/null || true
# Remove all salt configs
rm -rf /etc/salt/engines/* /etc/salt/grains /etc/salt/master /etc/salt/master.d/* /etc/salt/minion /etc/salt/minion.d/* /etc/salt/pki/* /etc/salt/proxy /etc/salt/proxy.d/* /var/cache/salt/
dnf -y remove salt
rm -rf /etc/salt/ /var/cache/salt/
# Drop systemd's in-memory references to the now-removed units
systemctl daemon-reload
# Uninstall local Elastic Agent, if installed
elastic-agent uninstall -f
if command -v docker &> /dev/null; then
# Stop and remove all so-* containers so files can be changed with more safety
@@ -1611,10 +1689,7 @@ reinstall_init() {
backup_dir /nsm/hydra "$date_string"
backup_dir /nsm/influxdb "$date_string"
# Uninstall local Elastic Agent, if installed
elastic-agent uninstall -f
} >> "$setup_log" 2>&1
} 2>&1 | tee -a "$setup_log"
info "System reinstall init has been completed."
}
@@ -1832,7 +1907,8 @@ secrets_pillar(){
printf '%s\n'\
"secrets:"\
" import_pass: $IMPORTPASS"\
" influx_pass: $INFLUXPASS" > $local_salt_dir/pillar/secrets.sls
" influx_pass: $INFLUXPASS"\
" postgres_pass: $POSTGRESPASS" > $local_salt_dir/pillar/secrets.sls
fi
}
@@ -2063,8 +2139,12 @@ set_initial_firewall_access() {
# Set up the management interface on the ISO
set_management_interface() {
title "Setting up the main interface"
if [[ $MNIC == "bond1" ]]; then
configure_management_bond || fail_setup
fi
if [ "$address_type" = 'DHCP' ]; then
logCmd "nmcli con mod $MNIC connection.autoconnect yes"
logCmd "nmcli con mod $MNIC connection.autoconnect yes ipv4.method auto"
logCmd "nmcli con up $MNIC"
logCmd "nmcli -p connection show $MNIC"
else
@@ -219,7 +219,7 @@ if [ -n "$test_profile" ]; then
WEBUSER=onionuser@somewhere.invalid
WEBPASSWD1=0n10nus3r
WEBPASSWD2=0n10nus3r
NODE_DESCRIPTION="${HOSTNAME} - ${install_type} - ${MAINIP}"
NODE_DESCRIPTION="${HOSTNAME} - ${install_type} - ${MSRVIP_OFFSET}"
update_sudoers_for_testing
fi
@@ -202,6 +202,12 @@ export influxdb_pillar_file
adv_influxdb_pillar_file="$local_salt_dir/pillar/influxdb/adv_influxdb.sls"
export adv_influxdb_pillar_file
postgres_pillar_file="$local_salt_dir/pillar/postgres/soc_postgres.sls"
export postgres_pillar_file
adv_postgres_pillar_file="$local_salt_dir/pillar/postgres/adv_postgres.sls"
export adv_postgres_pillar_file
logrotate_pillar_file="$local_salt_dir/pillar/logrotate/soc_logrotate.sls"
export logrotate_pillar_file
@@ -71,7 +71,8 @@ log_has_errors() {
grep -vE "remove_failed_vm.sls" | \
grep -vE "failed to copy: httpReadSeeker" | \
grep -vE "Error response from daemon: failed to resolve reference" | \
grep -vE "log-.*-pipeline_failed_attempts" &> "$error_log"
grep -vE "log-.*-pipeline_failed_attempts" | \
grep -vE " -v ON_ERROR_STOP=1" &> "$error_log"
if [[ $? -eq 0 ]]; then
# This function succeeds (returns 0) if errors are detected
@@ -845,18 +845,99 @@ whiptail_management_nic() {
[ -n "$TESTING" ] && return
filter_unused_nics
local management_nic_options=( "${nic_list_management[@]}" )
if [[ $is_iso || $is_desktop_iso ]]; then
management_nic_options+=( "BOND" "Configure a bonded management interface" )
fi
MNIC=$(whiptail --title "$whiptail_title" --menu "Please select the NIC you would like to use for management.\n\nUse the arrow keys to move around and the Enter key to select." 20 75 12 "${nic_list_management[@]}" 3>&1 1>&2 2>&3 )
MNIC=$(whiptail --title "$whiptail_title" --menu "Please select the NIC you would like to use for management.\n\nUse the arrow keys to move around and the Enter key to select." 20 75 12 "${management_nic_options[@]}" 3>&1 1>&2 2>&3 )
local exitstatus=$?
whiptail_check_exitstatus $exitstatus
while [ -z "$MNIC" ]
do
MNIC=$(whiptail --title "$whiptail_title" --menu "Please select the NIC you would like to use for management.\n\nUse the arrow keys to move around and the Enter key to select." 22 75 12 "${nic_list_management[@]}" 3>&1 1>&2 2>&3 )
MNIC=$(whiptail --title "$whiptail_title" --menu "Please select the NIC you would like to use for management.\n\nUse the arrow keys to move around and the Enter key to select." 22 75 12 "${management_nic_options[@]}" 3>&1 1>&2 2>&3 )
local exitstatus=$?
whiptail_check_exitstatus $exitstatus
done
if [[ $MNIC == "BOND" ]]; then
whiptail_management_bond
fi
}
whiptail_management_bond() {
[ -n "$TESTING" ] && return
MBOND_MODE=$(whiptail --title "$whiptail_title" --menu \
"Choose the bond mode for the management interface.\n\nThe management bond will be created as bond1." 20 75 7 \
"active-backup" "One active NIC with failover (recommended)" \
"balance-rr" "Round-robin transmit policy" \
"balance-xor" "Transmit based on selected hash policy" \
"broadcast" "Transmit everything on all slave interfaces" \
"802.3ad" "Dynamic link aggregation (requires switch support)" \
"balance-tlb" "Adaptive transmit load balancing" \
"balance-alb" "Adaptive load balancing" 3>&1 1>&2 2>&3)
local exitstatus=$?
whiptail_check_exitstatus $exitstatus
while [ -z "$MBOND_MODE" ]
do
MBOND_MODE=$(whiptail --title "$whiptail_title" --menu \
"Choose the bond mode for the management interface.\n\nThe management bond will be created as bond1." 20 75 7 \
"active-backup" "One active NIC with failover (recommended)" \
"balance-rr" "Round-robin transmit policy" \
"balance-xor" "Transmit based on selected hash policy" \
"broadcast" "Transmit everything on all slave interfaces" \
"802.3ad" "Dynamic link aggregation (requires switch support)" \
"balance-tlb" "Adaptive transmit load balancing" \
"balance-alb" "Adaptive load balancing" 3>&1 1>&2 2>&3)
local exitstatus=$?
whiptail_check_exitstatus $exitstatus
done
whiptail_management_bond_nics
MNIC="bond1"
export MBOND_MODE MNIC
}
whiptail_management_bond_nics() {
[ -n "$TESTING" ] && return
MBNICS=()
filter_unused_nics
MBNICS=$(whiptail --title "$whiptail_title" --checklist "Please add NICs to the Management Interface:" 20 75 12 "${nic_list[@]}" 3>&1 1>&2 2>&3)
local exitstatus=$?
whiptail_check_exitstatus $exitstatus
while [ -z "$MBNICS" ]
do
MBNICS=$(whiptail --title "$whiptail_title" --checklist "Please add NICs to the Management Interface:" 20 75 12 "${nic_list[@]}" 3>&1 1>&2 2>&3)
local exitstatus=$?
whiptail_check_exitstatus $exitstatus
done
MBNICS=$(echo "$MBNICS" | tr -d '"')
IFS=' ' read -ra MBNICS <<< "$MBNICS"
for bond_nic in "${MBNICS[@]}"; do
for dev_status in "${nmcli_dev_status_list[@]}"; do
if [[ $dev_status == "${bond_nic}:unmanaged" ]]; then
whiptail \
--title "$whiptail_title" \
--msgbox "$bond_nic is unmanaged by Network Manager. Please remove it from other network management tools then re-run setup." \
8 75
exit
fi
done
done
export MBNICS
}
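The checklist returns all selections as one quoted string, which is why the tr/read cleanup above is needed; a tiny illustration with an assumed two-NIC selection:

# whiptail --checklist output looks like: "eno1" "eno2"
selection='"eno1" "eno2"'                    # example value only
selection=$(echo "$selection" | tr -d '"')
IFS=' ' read -ra MBNICS <<< "$selection"
printf '%s\n' "${MBNICS[@]}"                 # eno1 and eno2 on separate lines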
whiptail_net_method() {