Add telegraf_output selector for InfluxDB/Postgres dual-write

Introduces global.telegraf_output (INFLUXDB|POSTGRES|BOTH, default BOTH)
so Telegraf can write metrics to Postgres alongside or instead of
InfluxDB. Each minion authenticates with its own so_telegraf_<minion>
role and writes to a matching schema inside a shared so_telegraf
database, keeping blast radius per-credential to that minion's data.

- Per-minion credentials auto-generated and persisted in postgres/auth.sls
- postgres/telegraf_users.sls reconciles roles/schemas on every apply
- Firewall opens 5432 only to minion hostgroups when Postgres output is active
- Reactor on salt/auth + orch/telegraf_postgres_sync.sls provision new
  minions automatically on key accept
- soup post_to_3.1.0 backfills users for existing minions on upgrade
- so-show-stats prints latest CPU/mem/disk/load per minion for sanity checks
- so-telegraf-trim + nightly cron prune rows older than
  postgres.telegraf.retention_days (default 14)
This commit is contained in:
Mike Reeves
2026-04-15 14:32:10 -04:00
parent 9ccd0acb4f
commit cefbe01333
17 changed files with 440 additions and 2 deletions
+23
View File
@@ -13,6 +13,24 @@
{% set CHARS = DIGITS~LOWERCASE~UPPERCASE~SYMBOLS %}
{% set so_postgres_user_pass = salt['pillar.get']('postgres:auth:users:so_postgres_user:pass', salt['random.get_str'](72, chars=CHARS)) %}
{# Build the per-minion Telegraf credential map written into the pillar file. #}
{# Complete telegraf_* entries already in pillar are carried over unchanged so #}
{# their passwords stay stable; minions reported up by manage.up that have no #}
{# entry yet receive a freshly generated 72-character password. #}
{% set existing = salt['pillar.get']('postgres:auth:users', {}) %}
{% set up_minions = salt['saltutil.runner']('manage.up') or [] %}
{% set telegraf_users = {} %}
{# Carry over only entries that have both a user and a pass. #}
{% for key, entry in existing.items() if key.startswith('telegraf_') and entry.get('user') and entry.get('pass') %}
{%- do telegraf_users.update({key: entry}) %}
{% endfor %}
{# Minion IDs are lowercased with '.' and '-' mapped to '_' to form the pillar key and role name. #}
{% for minion_id in up_minions %}
{%- set slug = minion_id | replace('.','_') | replace('-','_') | lower %}
{%- set pillar_key = 'telegraf_' ~ slug %}
{%- if pillar_key not in telegraf_users %}
{%- set generated = salt['random.get_str'](72, chars=CHARS) %}
{%- do telegraf_users.update({pillar_key: {'user': 'so_telegraf_' ~ slug, 'pass': generated}}) %}
{%- endif %}
{% endfor %}
postgres_auth_pillar:
file.managed:
- name: /opt/so/saltstack/local/pillar/postgres/auth.sls
@@ -25,6 +43,11 @@ postgres_auth_pillar:
so_postgres_user:
user: so_postgres
pass: "{{ so_postgres_user_pass }}"
{% for key, entry in telegraf_users.items() %}
{{ key }}:
user: {{ entry.user }}
pass: "{{ entry.pass }}"
{% endfor %}
- show_changes: False
{% else %}
+2
View File
@@ -1,5 +1,7 @@
postgres:
enabled: True
telegraf:
retention_days: 14
config:
listen_addresses: '*'
port: 5432
+16
View File
@@ -16,6 +16,7 @@ include:
- postgres.ssl
- postgres.config
- postgres.sostatus
- postgres.telegraf_users
so-postgres:
docker_container.running:
@@ -79,6 +80,21 @@ delete_so-postgres_so-status.disabled:
- name: /opt/so/conf/so-status/so-status.conf
- regex: ^so-postgres$
# Nightly (03:17) cron entry that runs so-telegraf-trim and appends its output
# to the trim log. The entry is present while Telegraf writes to Postgres
# (POSTGRES or BOTH) and removed otherwise. The output selector is normalized
# the same way postgres/telegraf_users.sls does it (undefined -> INFLUXDB,
# case-insensitive) so both states always agree on whether Postgres is active.
so_telegraf_trim:
{% if (GLOBALS.telegraf_output | default('INFLUXDB') | upper) in ['POSTGRES', 'BOTH'] %}
  cron.present:
{% else %}
  cron.absent:
{% endif %}
    - name: /usr/sbin/so-telegraf-trim >> /opt/so/log/postgres/telegraf-trim.log 2>&1
    - identifier: so_telegraf_trim
    - user: root
    - minute: '17'
    - hour: '3'
    - daymonth: '*'
    - month: '*'
    - dayweek: '*'
{% else %}
{{sls}}_state_not_allowed:
+8
View File
@@ -16,3 +16,11 @@ psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-E
\$\$;
GRANT ALL PRIVILEGES ON DATABASE "$POSTGRES_DB" TO "$SO_POSTGRES_USER";
EOSQL
# Make sure the shared so_telegraf metrics database exists after first
# initialization. Per-minion roles and schemas are not created here; they are
# reconciled by postgres/telegraf_users.sls on every state.apply.
# The CREATE DATABASE text is only SELECTed when no so_telegraf row exists in
# pg_database, and \gexec then runs whatever the SELECT returned.
psql --set=ON_ERROR_STOP=1 \
    --username "$POSTGRES_USER" \
    --dbname "$POSTGRES_DB" <<EOSQL
SELECT 'CREATE DATABASE so_telegraf'
WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'so_telegraf')\gexec
EOSQL
+7
View File
@@ -0,0 +1,7 @@
# SOC configuration annotation for postgres.telegraf.retention_days.
postgres:
  telegraf:
    retention_days:
      description: Number of days of Telegraf metrics to keep in the so_telegraf database. Older rows are deleted nightly by so-telegraf-trim.
      forcedType: int
      advanced: True
      helpLink: influxdb
+49
View File
@@ -0,0 +1,49 @@
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'allowed_states.map.jinja' import allowed_states %}
{% if sls.split('.')[0] in allowed_states %}
{% from 'vars/globals.map.jinja' import GLOBALS %}

{# Reconcile one Postgres login role plus one same-named schema in the #}
{# so_telegraf database for every complete telegraf_* credential found in #}
{# pillar (postgres:auth:users). Nothing is rendered unless the Telegraf #}
{# output selector is POSTGRES or BOTH (undefined falls back to INFLUXDB). #}
{% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %}
{% if TG_OUT in ['POSTGRES', 'BOTH'] %}
{% set users = salt['pillar.get']('postgres:auth:users', {}) %}
{% for key, entry in users.items() %}
{% if key.startswith('telegraf_') and entry.get('user') and entry.get('pass') %}
{% set u = entry.user %}
{# Double any single quote so the password stays a valid SQL string literal. #}
{% set p = entry.pass | replace("'", "''") %}

# Creates the role on first run and resets its password on every later run,
# then grants CONNECT and ensures the role owns a schema named after itself.
postgres_telegraf_role_{{ u }}:
  cmd.run:
    # The DO body uses a named dollar-quote tag instead of a bare $$ so that a
    # password containing "$$" cannot terminate the block early.
    - name: |
        docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
        DO $so_telegraf_role$
        BEGIN
          IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{{ u }}') THEN
            EXECUTE format('CREATE ROLE %I WITH LOGIN PASSWORD %L', '{{ u }}', '{{ p }}');
          ELSE
            EXECUTE format('ALTER ROLE %I WITH PASSWORD %L', '{{ u }}', '{{ p }}');
          END IF;
        END
        $so_telegraf_role$;
        GRANT CONNECT ON DATABASE so_telegraf TO "{{ u }}";
        CREATE SCHEMA IF NOT EXISTS "{{ u }}" AUTHORIZATION "{{ u }}";
        EOSQL
    # The rendered command embeds the role password; 'quiet' stops Salt from
    # writing the command line to the minion log.
    - output_loglevel: quiet
    - require:
      - docker_container: so-postgres
{% endif %}
{% endfor %}
{% endif %}

{% else %}

{{sls}}_state_not_allowed:
  test.fail_without_changes:
    - name: {{sls}}_state_not_allowed

{% endif %}
+110
View File
@@ -0,0 +1,110 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
# Point-in-time host metrics from the Telegraf Postgres backend.
# Sanity-check tool for verifying metrics are landing before the grid
# dashboards consume them.
. /usr/sbin/so-common
# Print the help text for this tool and terminate with exit status 1.
usage() {
    cat <<USAGE_TEXT
Usage: $0 [minion_id]
Shows the most recent CPU, memory, disk, and load metrics for each minion
from the so_telegraf Postgres database. Without an argument, reports on
every minion that has data. With a minion_id, limits output to that one.
Requires: sudo, so-postgres running, global.telegraf_output set to
POSTGRES or BOTH.
USAGE_TEXT
    exit 1
}
# Only root may run this tool.
if [ "$(id -u)" -ne 0 ]; then
    echo "This script must be run using sudo!"
    exit 1
fi

# -h / --help as the first argument prints the usage text (which exits).
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
    usage
fi

# Optional first argument: the single minion to report on (empty = all).
FILTER_MINION="${1:-}"
# Run psql inside the so-postgres container against so_telegraf as the
# postgres user, with unaligned, tuples-only, tab-separated output. All
# arguments are forwarded to psql unchanged.
so_psql() {
    local -a psql_cmd=(docker exec so-postgres psql -U postgres -d so_telegraf -At -F $'\t')
    "${psql_cmd[@]}" "$@"
}
# Stop with exit status 2 unless psql lists a database named so_telegraf.
db_names=$(docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d\| -f1)
if ! grep -qw so_telegraf <<< "$db_names"; then
    echo "Database so_telegraf not found. Is global.telegraf_output set to POSTGRES or BOTH?"
    exit 2
fi

# Gather the per-minion schemas; each is named so_telegraf_<sanitized_minion_id>
# after the role that owns it. With none present there is nothing to report.
SCHEMAS=$(so_psql -c "SELECT schema_name FROM information_schema.schemata WHERE schema_name LIKE 'so_telegraf_%' ORDER BY schema_name;")
[ -n "$SCHEMAS" ] || { echo "No minion schemas found in so_telegraf."; exit 0; }
# Run a metric query, but only if the given table exists in the given schema.
#   $1  schema name
#   $2  table name
#   $3  SQL to execute when the table is present
# Returns 0 without output when the table is missing; otherwise returns the
# status of the query.
print_metric() {
    local schema_name="$1" table_name="$2" sql="$3"
    local found
    found=$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${schema_name}' AND table_name='${table_name}' LIMIT 1;")
    if [ -n "$found" ]; then
        so_psql -c "$sql"
    fi
}
# Print the latest CPU, memory, disk and load rows for every minion schema,
# or only for the minion named by FILTER_MINION.

# Normalize the filter once, outside the loop, into the sanitized form used in
# schema names ('.' and '-' become '_', lowercase). printf is used instead of
# echo so a value that looks like an echo option is passed through untouched.
want=""
if [ -n "$FILTER_MINION" ]; then
    want=$(printf '%s' "$FILTER_MINION" | tr '.-' '_' | tr '[:upper:]' '[:lower:]')
fi

matched=0
for schema in $SCHEMAS; do
    minion="${schema#so_telegraf_}"
    if [ -n "$FILTER_MINION" ] && [ "$minion" != "$want" ]; then
        continue
    fi
    matched=1

    echo "===================================================================="
    echo " Minion: $minion"
    echo "===================================================================="

    # Most recent aggregate CPU sample, shown as percent used.
    print_metric "$schema" "cpu" "
        SELECT 'cpu ' AS metric,
        to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
        round((100 - usage_idle)::numeric, 1) || '% used'
        FROM \"${schema}\".cpu
        WHERE cpu = 'cpu-total'
        ORDER BY time DESC LIMIT 1;"

    # Most recent memory sample.
    print_metric "$schema" "mem" "
        SELECT 'memory ' AS metric,
        to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
        round(used_percent::numeric, 1) || '% used (' ||
        pg_size_pretty(used) || ' of ' || pg_size_pretty(total) || ')'
        FROM \"${schema}\".mem
        ORDER BY time DESC LIMIT 1;"

    # Every path reported at the newest disk timestamp.
    print_metric "$schema" "disk" "
        SELECT 'disk ' || rpad(path, 8) AS metric,
        to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
        round(used_percent::numeric, 1) || '% used (' ||
        pg_size_pretty(used) || ' of ' || pg_size_pretty(total) || ')'
        FROM \"${schema}\".disk
        WHERE time = (SELECT max(time) FROM \"${schema}\".disk)
        ORDER BY path;"

    # Most recent 1/5/15 minute load averages.
    print_metric "$schema" "system" "
        SELECT 'load ' AS metric,
        to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
        load1 || ' / ' || load5 || ' / ' || load15 || ' (1/5/15m)'
        FROM \"${schema}\".system
        ORDER BY time DESC LIMIT 1;"

    echo ""
done

# A filter that matched no schema used to produce no output at all; say so.
if [ -n "$FILTER_MINION" ] && [ "$matched" -eq 0 ]; then
    echo "No Telegraf schema found for minion '${FILTER_MINION}' (looked for so_telegraf_${want})." >&2
fi
+103
View File
@@ -0,0 +1,103 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
# Deletes Telegraf metric rows older than the configured retention window from
# every minion schema in the so_telegraf database. Intended to run daily from
# cron. Retention comes from pillar (postgres.telegraf.retention_days),
# defaulting to 14 days. An explicit --days argument overrides the pillar.
. /usr/sbin/so-common
# Print the option summary for this tool and terminate with exit status 1.
usage() {
    cat <<USAGE_TEXT
Usage: $0 [--days N] [--dry-run]
--days N Override retention in days (default: pillar
postgres.telegraf.retention_days, fallback 14)
--dry-run Report what would be deleted without modifying anything
USAGE_TEXT
    exit 1
}
# Only root may run this tool.
current_uid=$(id -u)
if [ "$current_uid" -ne 0 ]; then
    echo "This script must be run using sudo!"
    exit 1
fi
# Parse command-line options.
#   DAYS     explicit retention override from --days (empty = not given)
#   DRY_RUN  1 when --dry-run was given, otherwise 0
# Anything unrecognized, and -h/--help, prints the usage text and exits.
DAYS=""
DRY_RUN=0
while [ $# -gt 0 ]; do
    case "$1" in
        --days)
            # Without a value, 'shift 2' fails and consumes nothing, which
            # would leave --days in place and spin this loop forever.
            [ $# -ge 2 ] || usage
            DAYS="$2"
            shift 2
            ;;
        --dry-run)
            DRY_RUN=1
            shift
            ;;
        -h|--help)
            usage
            ;;
        *)
            usage
            ;;
    esac
done
# Resolve the retention window in days.
# An explicit --days value must be a positive integer. It is rejected rather
# than silently replaced, because falling back to 14 could delete far more
# data than the operator asked for. With no explicit value the pillar setting
# postgres.telegraf.retention_days is used, and 14 is the fallback when the
# pillar value is missing or not a positive integer.
if [ -n "$DAYS" ]; then
    if ! [[ "$DAYS" =~ ^[0-9]+$ ]] || [ "$DAYS" -lt 1 ]; then
        echo "Invalid --days value '${DAYS}': expected a positive integer." >&2
        exit 1
    fi
else
    DAYS=$(salt-call --local --out=newline_values_only pillar.get postgres:telegraf:retention_days 2>/dev/null)
    if ! [[ "$DAYS" =~ ^[0-9]+$ ]] || [ "$DAYS" -lt 1 ]; then
        DAYS=14
    fi
fi
# Write a timestamped message, prefixed with the tool name, to stdout.
# All arguments are joined into a single message.
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s so-telegraf-trim: %s\n' "$stamp" "$*"
}
# Run psql inside the so-postgres container against so_telegraf as the
# postgres user, with unaligned, tuples-only, tab-separated output. All
# arguments are forwarded to psql unchanged.
so_psql() {
    local -a psql_cmd=(docker exec so-postgres psql -U postgres -d so_telegraf -At -F $'\t')
    "${psql_cmd[@]}" "$@"
}
# Nothing to trim (exit 0) unless psql lists a database named so_telegraf.
db_names=$(docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d\| -f1)
if ! grep -qw so_telegraf <<< "$db_names"; then
    log "Database so_telegraf not present; nothing to trim."
    exit 0
fi
log "Trimming rows older than ${DAYS} days (dry_run=${DRY_RUN})."
TOTAL_DELETED=0

# Build the list of trim candidates, one 'schema.table' per line: every table
# in a so_telegraf_* schema that has a timestamp-typed column named 'time'.
# NOTE(review): tag_* metadata tables (tags_as_foreign_keys) are meant to be
# skipped; the query does not exclude them by name, so this presumably relies
# on them having no 'time' column - confirm.
table_query="
SELECT table_schema || '.' || table_name
FROM information_schema.columns
WHERE column_name = 'time'
AND data_type IN ('timestamp with time zone', 'timestamp without time zone')
AND table_schema LIKE 'so_telegraf_%'
ORDER BY 1;"
ROWS=$(so_psql -c "$table_query")
[ -n "$ROWS" ] || { log "No telegraf metric tables found."; exit 0; }
# Trim (or, in dry-run mode, count) expired rows in every candidate table.
# ROWS is read line by line on fd 3 so names are neither word-split nor
# glob-expanded, and so commands in the loop body cannot consume the list
# from stdin.
while IFS= read -r qualified <&3; do
    [ -n "$qualified" ] || continue
    # Split on the FIRST dot for both halves so a table name that itself
    # contains a dot is kept whole.
    schema="${qualified%%.*}"
    table="${qualified#*.}"
    if [ "$DRY_RUN" -eq 1 ]; then
        count=$(so_psql -c "SELECT count(*) FROM \"${schema}\".\"${table}\" WHERE time < now() - interval '${DAYS} days';")
        log "would delete ${count:-0} rows from ${qualified}"
    else
        # RETURNING count via a CTE so we can log how much was trimmed per table
        deleted=$(so_psql -c "
            WITH d AS (
                DELETE FROM \"${schema}\".\"${table}\"
                WHERE time < now() - interval '${DAYS} days'
                RETURNING 1
            )
            SELECT count(*) FROM d;")
        # A successful run always prints a number; anything else means the
        # statement failed, so report it instead of counting it as 0 rows.
        if ! [[ "$deleted" =~ ^[0-9]+$ ]]; then
            log "ERROR: failed to trim ${qualified}; skipping"
            continue
        fi
        TOTAL_DELETED=$((TOTAL_DELETED + deleted))
        if [ "$deleted" -gt 0 ]; then
            log "deleted ${deleted} rows from ${qualified}"
        fi
    fi
done 3<<< "$ROWS"
# Log the grand total after a real (non-dry-run) pass.
case "$DRY_RUN" in
    0) log "Trim complete. Total rows deleted: ${TOTAL_DELETED}." ;;
esac