From 614f32c5e087655c389e1e4ce215ed064b8c7bcc Mon Sep 17 00:00:00 2001 From: Mike Reeves Date: Wed, 22 Apr 2026 10:55:15 -0400 Subject: [PATCH] Split postgres auth from per-minion telegraf creds The old flow had two writers for each per-minion Telegraf password (so-minion wrote the minion pillar; postgres.auth regenerated any missing aggregate entries). They drifted on first-boot and there was no trigger to create DB roles when a new minion joined. Split responsibilities: - pillar/postgres/auth.sls (manager-scoped) keeps only the so_postgres admin cred. - pillar/telegraf/creds.sls (grid-wide) holds a {minion_id: {user, pass}} map, shadowed per-install by the local-pillar copy. - salt/manager/tools/sbin/so-telegraf-cred is the single writer: flock, atomic YAML write, PyYAML safe_dump so passwords never round-trip through so-yaml.py's type coercion. Idempotent add, quiet remove. - so-minion's add/remove hooks now shell out to so-telegraf-cred instead of editing pillar files directly. - postgres.telegraf_users iterates the new pillar key and CREATE/ALTERs roles from it; telegraf.conf reads its own entry via grains.id. - orch.deploy_newnode runs postgres.telegraf_users on the manager and refreshes the new minion's pillar before the new node highstates, so the DB role is in place the first time telegraf tries to connect. - soup's post_to_3.1.0 backfills the creds pillar from accepted salt keys (idempotent) and runs postgres.telegraf_users once to reconcile the DB. 
--- pillar/telegraf/creds.sls | 12 ++ pillar/top.sls | 1 + salt/manager/tools/sbin/so-minion | 52 ++------ salt/manager/tools/sbin/so-telegraf-cred | 159 +++++++++++++++++++++++ salt/manager/tools/sbin/soup | 36 ++--- salt/orch/deploy_newnode.sls | 25 ++++ salt/postgres/auth.sls | 25 +--- salt/postgres/telegraf_users.sls | 8 +- salt/telegraf/etc/telegraf.conf | 12 +- 9 files changed, 233 insertions(+), 97 deletions(-) create mode 100644 pillar/telegraf/creds.sls create mode 100644 salt/manager/tools/sbin/so-telegraf-cred diff --git a/pillar/telegraf/creds.sls b/pillar/telegraf/creds.sls new file mode 100644 index 000000000..8521bfbd9 --- /dev/null +++ b/pillar/telegraf/creds.sls @@ -0,0 +1,12 @@ +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +# Per-minion Telegraf Postgres credentials. so-telegraf-cred on the manager is +# the single writer; it mutates /opt/so/saltstack/local/pillar/telegraf/creds.sls +# under flock. Pillar_roots order (local before default) means the populated +# copy shadows this default on any real grid; this file exists so the pillar +# key is always defined on fresh installs and when no minions have creds yet. 
+telegraf: + postgres_creds: {} diff --git a/pillar/top.sls b/pillar/top.sls index 808182c2b..712629dbf 100644 --- a/pillar/top.sls +++ b/pillar/top.sls @@ -17,6 +17,7 @@ base: - sensoroni.adv_sensoroni - telegraf.soc_telegraf - telegraf.adv_telegraf + - telegraf.creds - versionlock.soc_versionlock - versionlock.adv_versionlock - soc.license diff --git a/salt/manager/tools/sbin/so-minion b/salt/manager/tools/sbin/so-minion index 4095637c8..86bab25e6 100755 --- a/salt/manager/tools/sbin/so-minion +++ b/salt/manager/tools/sbin/so-minion @@ -281,22 +281,18 @@ function deleteMinionFiles () { fi } -# Remove this minion's postgres Telegraf credential from both the aggregate -# pillar and the postgres database. Paired with add_telegraf_to_minion: -# add/delete cycle both here and in the DB. Always returns 0 so a dead or -# unreachable so-postgres doesn't block minion deletion — in that case we +# Remove this minion's postgres Telegraf credential from the shared creds +# pillar and drop the matching role in Postgres. Always returns 0 so a dead +# or unreachable so-postgres doesn't block minion deletion — in that case we # log a warning and leave the role behind for manual cleanup. function remove_postgres_telegraf_from_minion() { local MINION_SAFE MINION_SAFE=$(echo "$MINION_ID" | tr '.-' '__' | tr '[:upper:]' '[:lower:]') local PG_USER="so_telegraf_${MINION_SAFE}" - local AGGREGATE=/opt/so/saltstack/local/pillar/postgres/auth.sls log "INFO" "Removing postgres telegraf cred for $MINION_ID" - if [[ -f "$AGGREGATE" ]]; then - so-yaml.py remove "$AGGREGATE" "postgres.auth.users.telegraf_${MINION_SAFE}" >/dev/null 2>&1 || true - fi + so-telegraf-cred remove "$MINION_ID" >/dev/null 2>&1 || true if docker ps --format '{{.Names}}' 2>/dev/null | grep -q '^so-postgres$'; then if ! docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf >/dev/null 2>&1 <.{user,pass} into the aggregate - # pillar so postgres.telegraf_users CREATE ROLE finds it. 
- # - # An existing password is reused if the aggregate already has one (re-add), - # so rerunning so-minion for the same minion keeps the cred stable. - local MINION_SAFE - MINION_SAFE=$(echo "$MINION_ID" | tr '.-' '__' | tr '[:upper:]' '[:lower:]') - local PG_USER="so_telegraf_${MINION_SAFE}" - local AGGREGATE=/opt/so/saltstack/local/pillar/postgres/auth.sls - local PG_PASS="" - if [[ -f "$AGGREGATE" ]]; then - PG_PASS=$(so-yaml.py get -r "$AGGREGATE" "postgres.auth.users.telegraf_${MINION_SAFE}.pass" 2>/dev/null || true) - fi - if [[ -z "$PG_PASS" ]]; then - PG_PASS=$(tr -dc 'A-Za-z0-9~!@#^&*()_=+[]|;:,.<>?-' < /dev/urandom | head -c 72) - fi - - so-yaml.py replace "$PILLARFILE" postgres.telegraf.user "$PG_USER" >/dev/null - so-yaml.py replace "$PILLARFILE" postgres.telegraf.pass "$PG_PASS" >/dev/null - if [[ -f "$AGGREGATE" ]]; then - so-yaml.py replace "$AGGREGATE" "postgres.auth.users.telegraf_${MINION_SAFE}.user" "$PG_USER" >/dev/null - so-yaml.py replace "$AGGREGATE" "postgres.auth.users.telegraf_${MINION_SAFE}.pass" "$PG_PASS" >/dev/null + # Provision the per-minion postgres Telegraf credential in the shared + # telegraf/creds.sls pillar. so-telegraf-cred is the only writer; it + # generates a password on first add and is a no-op on re-add so the cred + # is stable across repeated so-minion runs. postgres.telegraf_users on the + # manager creates/updates the DB role from the same pillar. + so-telegraf-cred add "$MINION_ID" + if [ $? -ne 0 ]; then + log "ERROR" "Failed to provision postgres telegraf cred for $MINION_ID" + return 1 fi } diff --git a/salt/manager/tools/sbin/so-telegraf-cred b/salt/manager/tools/sbin/so-telegraf-cred new file mode 100644 index 000000000..35ff7c438 --- /dev/null +++ b/salt/manager/tools/sbin/so-telegraf-cred @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. 
# Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.

"""
Single writer for the Telegraf Postgres credentials pillar.

Maintains /opt/so/saltstack/local/pillar/telegraf/creds.sls with shape:

    telegraf:
      postgres_creds:
        <minion_id>:
          user: so_telegraf_<safe_minion_id>
          pass: "<72-char random>"
        ...

Called by so-minion on add/delete. PyYAML safe_dump preserves ambiguous
strings as quoted scalars, so passwords never round-trip through type
coercion (unlike so-yaml.py, which would). All mutations are serialized
by an flock on a sibling .creds.lock file.

Exit codes: 0 on success (including no-op add/remove), 1 on an I/O or YAML
failure, 2 on a usage error.
"""

import fcntl
import grp
import os
import pwd
import secrets
import string
import sys
import tempfile

try:
    import yaml
except ImportError:
    # Reported by main() as a one-line error instead of an import traceback.
    yaml = None

CREDS_PATH = "/opt/so/saltstack/local/pillar/telegraf/creds.sls"
LOCK_PATH = "/opt/so/saltstack/local/pillar/telegraf/.creds.lock"
OWNER_USER = "socore"
OWNER_GROUP = "socore"
FILE_MODE = 0o640
PASSWORD_LEN = 72
# Matches salt/postgres/auth.sls's DIGITS+LOWERCASE+UPPERCASE+SYMBOLS.
PASSWORD_CHARS = (
    string.digits
    + string.ascii_lowercase
    + string.ascii_uppercase
    + "~!@#^&*()-_=+[]|;:,.<>?"
)


def safe_minion_id(minion_id):
    """Return *minion_id* lower-cased with '.' and '-' replaced by '_'.

    This must stay in step with the shell transform so-minion uses to build
    the role name it drops (``tr '.-' '__' | tr '[:upper:]' '[:lower:]'``);
    otherwise removal would target a different role than the one created.
    """
    return minion_id.replace(".", "_").replace("-", "_").lower()


def generate_password():
    """Return a PASSWORD_LEN-character password drawn from PASSWORD_CHARS."""
    return "".join(secrets.choice(PASSWORD_CHARS) for _ in range(PASSWORD_LEN))


def load_creds(path=CREDS_PATH):
    """Load the creds pillar from *path*, normalising it to the expected shape.

    A missing file, an empty document, or a document whose ``telegraf`` /
    ``telegraf.postgres_creds`` nodes are not mappings all yield a structure
    in which ``data["telegraf"]["postgres_creds"]`` is a dict. Other keys that
    are already present are preserved.

    Raises:
        yaml.YAMLError: if the file exists but is not valid YAML.
        OSError: if the file exists but cannot be read.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except FileNotFoundError:
        data = None
    if not isinstance(data, dict):
        data = {}
    telegraf = data.get("telegraf")
    if not isinstance(telegraf, dict):
        telegraf = data["telegraf"] = {}
    if not isinstance(telegraf.get("postgres_creds"), dict):
        telegraf["postgres_creds"] = {}
    return data


def _chown_to_owner(path):
    """Give *path* to OWNER_USER:OWNER_GROUP when those accounts exist.

    A missing OWNER_USER leaves ownership untouched. A missing OWNER_GROUP
    falls back to the user's primary group.
    """
    try:
        owner = pwd.getpwnam(OWNER_USER)
    except KeyError:
        return
    try:
        gid = grp.getgrnam(OWNER_GROUP).gr_gid
    except KeyError:
        gid = owner.pw_gid
    os.chown(path, owner.pw_uid, gid)


def atomic_write(data, path=CREDS_PATH):
    """Serialise *data* as YAML and atomically replace *path* with it.

    The document is written to a temp file in the destination directory,
    fsynced, given FILE_MODE and the configured owner, and only then renamed
    over *path*, so readers never observe a partially written pillar.

    Raises:
        OSError: on any filesystem failure; the temp file is removed first.
    """
    directory = os.path.dirname(path) or "."
    os.makedirs(directory, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(prefix=".creds.", suffix=".tmp", dir=directory)
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            yaml.safe_dump(data, f, default_flow_style=False, sort_keys=True)
            f.flush()
            os.fsync(f.fileno())
        os.chmod(tmp_path, FILE_MODE)
        _chown_to_owner(tmp_path)
        os.replace(tmp_path, path)
    except BaseException:
        # BaseException so an interrupt does not strand a temp file holding
        # passwords next to the pillar.
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            pass
        raise


def with_lock(fn, lock_path=LOCK_PATH):
    """Run ``fn()`` while holding an exclusive flock on *lock_path*.

    Returns whatever *fn* returns. The lock is released even if *fn* raises.
    """
    os.makedirs(os.path.dirname(lock_path) or ".", exist_ok=True)
    with open(lock_path, "a+") as lf:
        fcntl.flock(lf.fileno(), fcntl.LOCK_EX)
        try:
            return fn()
        finally:
            fcntl.flock(lf.fileno(), fcntl.LOCK_UN)


def _is_usable_entry(entry):
    """Return True if *entry* is a mapping with non-empty ``user`` and ``pass``."""
    return (
        isinstance(entry, dict)
        and bool(entry.get("user"))
        and bool(entry.get("pass"))
    )


def cmd_add(minion_id, creds_path=CREDS_PATH, lock_path=LOCK_PATH):
    """Ensure *minion_id* has a usable credential entry; return 0.

    An existing entry with both ``user`` and ``pass`` is left untouched, so
    repeated adds keep the password stable. A missing or malformed entry is
    (re)generated, because postgres.telegraf_users skips entries that lack
    either field.
    """

    def go():
        data = load_creds(creds_path)
        creds = data["telegraf"]["postgres_creds"]
        if _is_usable_entry(creds.get(minion_id)):
            return 0
        creds[minion_id] = {
            "user": "so_telegraf_" + safe_minion_id(minion_id),
            "pass": generate_password(),
        }
        atomic_write(data, creds_path)
        return 0

    return with_lock(go, lock_path)


def cmd_remove(minion_id, creds_path=CREDS_PATH, lock_path=LOCK_PATH):
    """Delete *minion_id*'s entry if present; return 0 either way."""

    def go():
        data = load_creds(creds_path)
        creds = data["telegraf"]["postgres_creds"]
        if minion_id in creds:
            del creds[minion_id]
            atomic_write(data, creds_path)
        return 0

    return with_lock(go, lock_path)


def usage():
    """Print the usage line to stderr and return exit code 2."""
    print(
        "Usage: so-telegraf-cred <add|remove> <minion_id>",
        file=sys.stderr,
    )
    return 2


def main(argv):
    """Dispatch ``argv`` (program name, operation, minion id) to a command.

    Returns the process exit code: 0 on success, 1 when the pillar could not
    be read or written, 2 when the arguments are invalid.
    """
    if len(argv) != 3:
        return usage()
    op, minion_id = argv[1], argv[2]
    handler = {"add": cmd_add, "remove": cmd_remove}.get(op)
    if handler is None or not minion_id.strip():
        return usage()
    if yaml is None:
        print("so-telegraf-cred: PyYAML is not installed", file=sys.stderr)
        return 1
    try:
        return handler(minion_id)
    except (OSError, yaml.YAMLError) as err:
        # Callers only look at the exit status; keep stderr to one line.
        print(f"so-telegraf-cred: {op} {minion_id} failed: {err}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv))
- salt-call --local state.apply postgres.auth,postgres.telegraf_users queue=True || true - - AGGREGATE_PILLAR=/opt/so/saltstack/local/pillar/postgres/auth.sls - MINIONS_DIR=/opt/so/saltstack/local/pillar/minions - if [[ -f "$AGGREGATE_PILLAR" && -d "$MINIONS_DIR" ]]; then - for pillar_file in "$MINIONS_DIR"/*.sls; do - [[ -f "$pillar_file" ]] || continue - mid=$(basename "$pillar_file" .sls) - [[ "$mid" == adv_* ]] && continue - safe=$(echo "$mid" | tr '.-' '__' | tr '[:upper:]' '[:lower:]') - existing_user=$(so-yaml.py get -r "$pillar_file" postgres.telegraf.user 2>/dev/null || true) - [[ "$existing_user" == "so_telegraf_${safe}" ]] && continue - user=$(so-yaml.py get -r "$AGGREGATE_PILLAR" "postgres.auth.users.telegraf_${safe}.user" 2>/dev/null || true) - pass=$(so-yaml.py get -r "$AGGREGATE_PILLAR" "postgres.auth.users.telegraf_${safe}.pass" 2>/dev/null || true) - [[ -z "$user" || -z "$pass" ]] && continue - so-yaml.py replace "$pillar_file" postgres.telegraf.user "$user" >/dev/null - so-yaml.py replace "$pillar_file" postgres.telegraf.pass "$pass" >/dev/null - done - fi + # Backfill the Telegraf creds pillar for every accepted minion. so-telegraf-cred + # add is idempotent — it no-ops when an entry already exists — so this is safe + # to run on every soup. The subsequent state.apply creates/updates the matching + # Postgres roles from the reconciled pillar. + echo "Reconciling Telegraf Postgres creds for accepted minions." + for mid in $(salt-key --out=json --list=accepted 2>/dev/null | jq -r '.minions[]?' 
2>/dev/null); do + [[ -n "$mid" ]] || continue + /usr/sbin/so-telegraf-cred add "$mid" || echo " warning: so-telegraf-cred add $mid failed" >&2 + done + salt-call --local state.apply postgres.telegraf_users queue=True || true POSTVERSION=3.1.0 } diff --git a/salt/orch/deploy_newnode.sls b/salt/orch/deploy_newnode.sls index c05a812a3..ee241ef33 100644 --- a/salt/orch/deploy_newnode.sls +++ b/salt/orch/deploy_newnode.sls @@ -25,8 +25,33 @@ manager_run_es_soc: - salt: {{NEWNODE}}_update_mine {% endif %} +# so-minion has already added the new minion's entry to telegraf/creds.sls +# via so-telegraf-cred before this orch fires. Reconcile the Postgres role +# on the manager so the new minion can authenticate on its first highstate, +# then refresh the minion's pillar so its telegraf.conf renders with the +# freshly-written cred. +manager_create_postgres_telegraf_role: + salt.state: + - tgt: {{ MANAGER }} + - sls: + - postgres.telegraf_users + - queue: True + - require: + - salt: {{NEWNODE}}_update_mine + +{{NEWNODE}}_refresh_pillar: + salt.function: + - name: saltutil.refresh_pillar + - tgt: {{ NEWNODE }} + - kwarg: + wait: True + - require: + - salt: manager_create_postgres_telegraf_role + {{NEWNODE}}_run_highstate: salt.state: - tgt: {{ NEWNODE }} - highstate: True - queue: True + - require: + - salt: {{NEWNODE}}_refresh_pillar diff --git a/salt/postgres/auth.sls b/salt/postgres/auth.sls index 3da1bcde0..4f486ff02 100644 --- a/salt/postgres/auth.sls +++ b/salt/postgres/auth.sls @@ -13,24 +13,8 @@ {% set CHARS = DIGITS~LOWERCASE~UPPERCASE~SYMBOLS %} {% set so_postgres_user_pass = salt['pillar.get']('postgres:auth:users:so_postgres_user:pass', salt['random.get_str'](72, chars=CHARS)) %} - {# Per-minion Telegraf Postgres credentials. Merge currently-up minions with any #} - {# previously-known entries in pillar so existing passwords persist across runs. 
#} - {% set existing = salt['pillar.get']('postgres:auth:users', {}) %} - {% set up_minions = salt['saltutil.runner']('manage.up') or [] %} - {% set telegraf_users = {} %} - {% for key, entry in existing.items() %} - {%- if key.startswith('telegraf_') and entry.get('user') and entry.get('pass') %} - {%- do telegraf_users.update({key: entry}) %} - {%- endif %} - {% endfor %} - {% for mid in up_minions %} - {%- set safe = mid | replace('.','_') | replace('-','_') | lower %} - {%- set key = 'telegraf_' ~ safe %} - {%- if key not in telegraf_users %} - {%- do telegraf_users.update({key: {'user': 'so_telegraf_' ~ safe, 'pass': salt['random.get_str'](72, chars=CHARS)}}) %} - {%- endif %} - {% endfor %} - +# Admin cred only. Per-minion Telegraf creds live in telegraf/creds.sls, +# managed by /usr/sbin/so-telegraf-cred (called from so-minion). postgres_auth_pillar: file.managed: - name: /opt/so/saltstack/local/pillar/postgres/auth.sls @@ -43,11 +27,6 @@ postgres_auth_pillar: so_postgres_user: user: so_postgres pass: "{{ so_postgres_user_pass }}" - {% for key, entry in telegraf_users.items() %} - {{ key }}: - user: {{ entry.user }} - pass: "{{ entry.pass }}" - {% endfor %} - show_changes: False {% else %} diff --git a/salt/postgres/telegraf_users.sls b/salt/postgres/telegraf_users.sls index dbbc0f03e..62490ea52 100644 --- a/salt/postgres/telegraf_users.sls +++ b/salt/postgres/telegraf_users.sls @@ -10,7 +10,7 @@ {# postgres_wait_ready below requires `docker_container: so-postgres`, which is declared in postgres.enabled. Include it here so state.apply postgres.telegraf_users - on its own (from the reactor orch or from soup) still has that ID in scope. Salt + on its own (e.g. from orch.deploy_newnode) still has that ID in scope. Salt de-duplicates the circular include. 
#} include: - postgres.enabled @@ -96,9 +96,9 @@ postgres_telegraf_group_role: - require: - cmd: postgres_create_telegraf_db -{% set users = salt['pillar.get']('postgres:auth:users', {}) %} -{% for key, entry in users.items() %} -{% if key.startswith('telegraf_') and entry.get('user') and entry.get('pass') %} +{% set creds = salt['pillar.get']('telegraf:postgres_creds', {}) %} +{% for mid, entry in creds.items() %} +{% if entry.get('user') and entry.get('pass') %} {% set u = entry.user %} {% set p = entry.pass | replace("'", "''") %} diff --git a/salt/telegraf/etc/telegraf.conf b/salt/telegraf/etc/telegraf.conf index 53b96e4ab..02d969ff3 100644 --- a/salt/telegraf/etc/telegraf.conf +++ b/salt/telegraf/etc/telegraf.conf @@ -10,12 +10,12 @@ {%- set LOGSTASH_ENABLED = LOGSTASH_MERGED.enabled %} {%- set TG_OUT = TELEGRAFMERGED.output | upper %} {%- set PG_HOST = GLOBALS.manager_ip %} -{#- Per-minion telegraf creds are written into the minion's own pillar file - (/opt/so/saltstack/local/pillar/minions/.sls) by postgres.auth on the - manager. Each minion only sees its own password — the aggregate map in - postgres:auth:users is manager-scoped. #} -{%- set PG_USER = salt['pillar.get']('postgres:telegraf:user', '') %} -{%- set PG_PASS = salt['pillar.get']('postgres:telegraf:pass', '') %} +{#- Per-minion telegraf creds live in the grid-wide telegraf/creds.sls pillar, + written by /usr/sbin/so-telegraf-cred on the manager. Each minion looks up + its own entry by grains.id. #} +{%- set PG_ENTRY = salt['pillar.get']('telegraf:postgres_creds:' ~ grains.id, {}) %} +{%- set PG_USER = PG_ENTRY.get('user', '') %} +{%- set PG_PASS = PG_ENTRY.get('pass', '') %} # Global tags can be specified here in key="value" format. [global_tags] role = "{{ GLOBALS.role.split('-') | last }}"