From 9ccd0acb4f74bf724534d01a57f0afcc36d4dc44 Mon Sep 17 00:00:00 2001 From: Mike Reeves Date: Fri, 10 Apr 2026 11:41:33 -0400 Subject: [PATCH 1/5] Add ES credentials to postgres module config for migration Postgres module now queries Elasticsearch directly via HTTP for the chat migration (bypasses RBAC that needs user context). Pass esHostUrl, esUsername, esPassword alongside postgres creds. --- salt/soc/defaults.map.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/salt/soc/defaults.map.jinja b/salt/soc/defaults.map.jinja index d99cf57f7..46ae7e8fd 100644 --- a/salt/soc/defaults.map.jinja +++ b/salt/soc/defaults.map.jinja @@ -26,7 +26,7 @@ {% if GLOBALS.postgres is defined and GLOBALS.postgres.auth is defined %} {% set PG_ADMIN_PASS = salt['pillar.get']('secrets:postgres_pass', '') %} -{% do SOCDEFAULTS.soc.config.server.modules.update({'postgres': {'hostUrl': GLOBALS.manager_ip, 'port': 5432, 'username': GLOBALS.postgres.auth.users.so_postgres_user.user, 'password': GLOBALS.postgres.auth.users.so_postgres_user.pass, 'adminUser': 'postgres', 'adminPassword': PG_ADMIN_PASS, 'dbname': 'securityonion', 'sslMode': 'require', 'assistantEnabled': true}}) %} +{% do SOCDEFAULTS.soc.config.server.modules.update({'postgres': {'hostUrl': GLOBALS.manager_ip, 'port': 5432, 'username': GLOBALS.postgres.auth.users.so_postgres_user.user, 'password': GLOBALS.postgres.auth.users.so_postgres_user.pass, 'adminUser': 'postgres', 'adminPassword': PG_ADMIN_PASS, 'dbname': 'securityonion', 'sslMode': 'require', 'assistantEnabled': true, 'esHostUrl': 'https://' ~ GLOBALS.manager_ip ~ ':9200', 'esUsername': GLOBALS.elasticsearch.auth.users.so_elastic_user.user, 'esPassword': GLOBALS.elasticsearch.auth.users.so_elastic_user.pass}}) %} {% endif %} {% do SOCDEFAULTS.soc.config.server.modules.influxdb.update({'hostUrl': 'https://' ~ GLOBALS.influxdb_host ~ ':8086'}) %} From cefbe01333ee9a1894aa623fdc7b5aa928b507ef Mon Sep 17 00:00:00 2001 From: Mike Reeves Date: Wed, 15 
Apr 2026 14:32:10 -0400 Subject: [PATCH 2/5] Add telegraf_output selector for InfluxDB/Postgres dual-write Introduces global.telegraf_output (INFLUXDB|POSTGRES|BOTH, default BOTH) so Telegraf can write metrics to Postgres alongside or instead of InfluxDB. Each minion authenticates with its own so_telegraf_{minion} role and writes to a matching schema inside a shared so_telegraf database, limiting the blast radius of each credential to that minion's data. - Per-minion credentials auto-generated and persisted in postgres/auth.sls - postgres/telegraf_users.sls reconciles roles/schemas on every apply - Firewall opens 5432 only to minion hostgroups when Postgres output is active - Reactor on salt/auth + orch/telegraf_postgres_sync.sls provision new minions automatically on key accept - soup post_to_3.1.0 backfills users for existing minions on upgrade - so-show-stats prints latest CPU/mem/disk/load per minion for sanity checks - so-telegraf-trim + nightly cron prune rows older than postgres.telegraf.retention_days (default 14) --- salt/firewall/map.jinja | 12 +++ salt/global/defaults.yaml | 3 +- salt/global/soc_global.yaml | 11 +++ salt/manager/tools/sbin/soup | 25 ++++- salt/orch/telegraf_postgres_sync.sls | 26 +++++ salt/postgres/auth.sls | 23 +++++ salt/postgres/defaults.yaml | 2 + salt/postgres/enabled.sls | 16 ++++ salt/postgres/files/init-users.sh | 8 ++ salt/postgres/soc_postgres.yaml | 7 ++ salt/postgres/telegraf_users.sls | 49 ++++++++++ salt/postgres/tools/sbin/so-show-stats | 110 ++++++++++++++++++++++ salt/postgres/tools/sbin/so-telegraf-trim | 103 ++++++++++++++++++++ salt/reactor/telegraf_user_sync.sls | 18 ++++ salt/salt/master.sls | 13 +++ salt/telegraf/etc/telegraf.conf | 15 +++ salt/vars/globals.map.jinja | 1 + 17 files changed, 440 insertions(+), 2 deletions(-) create mode 100644 salt/orch/telegraf_postgres_sync.sls create mode 100644 salt/postgres/soc_postgres.yaml create mode 100644 salt/postgres/telegraf_users.sls create mode 100644 
salt/postgres/tools/sbin/so-show-stats create mode 100644 salt/postgres/tools/sbin/so-telegraf-trim create mode 100644 salt/reactor/telegraf_user_sync.sls diff --git a/salt/firewall/map.jinja b/salt/firewall/map.jinja index 58d8c189d..2821f62b4 100644 --- a/salt/firewall/map.jinja +++ b/salt/firewall/map.jinja @@ -55,4 +55,16 @@ {% endif %} +{# Open Postgres (5432) to minion hostgroups when Telegraf is configured to write to Postgres #} +{% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %} +{% if TG_OUT in ['POSTGRES', 'BOTH'] %} +{% if role.startswith('manager') or role == 'standalone' or role == 'eval' %} +{% for r in ['sensor', 'searchnode', 'heavynode', 'receiver', 'fleet', 'idh', 'desktop', 'import'] %} +{% if FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[r] is defined %} +{% do FIREWALL_DEFAULT.firewall.role[role].chain["DOCKER-USER"].hostgroups[r].portgroups.append('postgres') %} +{% endif %} +{% endfor %} +{% endif %} +{% endif %} + {% set FIREWALL_MERGED = salt['pillar.get']('firewall', FIREWALL_DEFAULT.firewall, merge=True) %} diff --git a/salt/global/defaults.yaml b/salt/global/defaults.yaml index 92b9c1c1a..d041306a7 100644 --- a/salt/global/defaults.yaml +++ b/salt/global/defaults.yaml @@ -1,3 +1,4 @@ global: pcapengine: SURICATA - pipeline: REDIS \ No newline at end of file + pipeline: REDIS + telegraf_output: BOTH \ No newline at end of file diff --git a/salt/global/soc_global.yaml b/salt/global/soc_global.yaml index 33abbf690..a01d33cb8 100644 --- a/salt/global/soc_global.yaml +++ b/salt/global/soc_global.yaml @@ -65,4 +65,15 @@ global: description: Allows use of Endgame with Security Onion. This feature requires a license from Endgame. global: True advanced: True + telegraf_output: + description: Selects the backend(s) Telegraf writes metrics to. INFLUXDB keeps the current behavior; POSTGRES writes to the grid's Postgres instance; BOTH dual-writes for migration validation. 
+ regex: ^(INFLUXDB|POSTGRES|BOTH)$ + options: + - INFLUXDB + - POSTGRES + - BOTH + regexFailureMessage: You must enter INFLUXDB, POSTGRES, or BOTH. + global: True + advanced: True + helpLink: influxdb diff --git a/salt/manager/tools/sbin/soup b/salt/manager/tools/sbin/soup index d25153863..d5ade0fab 100755 --- a/salt/manager/tools/sbin/soup +++ b/salt/manager/tools/sbin/soup @@ -362,7 +362,8 @@ preupgrade_changes() { # This function is to add any new pillar items if needed. echo "Checking to see if changes are needed." - [[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0 + [[ "$INSTALLEDVERSION" =~ ^2\.4\.21[0-9]+$ ]] && up_to_3.0.0 + [[ "$INSTALLEDVERSION" =~ ^3\.0\.[0-9]+$ ]] && up_to_3.1.0 true } @@ -371,6 +372,7 @@ postupgrade_changes() { echo "Running post upgrade processes." [[ "$POSTVERSION" =~ ^2\.4\.21[0-9]+$ ]] && post_to_3.0.0 + [[ "$POSTVERSION" =~ ^3\.0\.[0-9]+$ ]] && post_to_3.1.0 true } @@ -469,6 +471,27 @@ post_to_3.0.0() { ### 3.0.0 End ### +### 3.1.0 Start ### + +up_to_3.1.0() { + INSTALLEDVERSION=3.1.0 +} + +post_to_3.1.0() { + # Provision per-minion Telegraf Postgres users for every minion known to the + # manager. postgres.auth iterates manage.up to generate any missing passwords; + # postgres.telegraf_users reconciles the roles and schemas inside the so-postgres + # container. Then push a telegraf state to every minion so their telegraf.conf + # picks up the new credentials on the first apply after soup. + echo "Provisioning Telegraf Postgres users for existing minions." + salt-call --local state.apply postgres.auth postgres.telegraf_users || true + salt '*' state.sls telegraf || true + + POSTVERSION=3.1.0 +} + +### 3.1.0 End ### + repo_sync() { echo "Sync the local repo." su socore -c '/usr/sbin/so-repo-sync' || fail "Unable to complete so-repo-sync." 
diff --git a/salt/orch/telegraf_postgres_sync.sls b/salt/orch/telegraf_postgres_sync.sls new file mode 100644 index 000000000..90c42fc07 --- /dev/null +++ b/salt/orch/telegraf_postgres_sync.sls @@ -0,0 +1,26 @@ +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +{% set MINION = salt['pillar.get']('minion_id') %} +{% set MANAGER = salt['pillar.get']('setup:manager') or salt['grains.get']('master') %} + +manager_sync_telegraf_pg_users: + salt.state: + - tgt: {{ MANAGER }} + - sls: + - postgres.auth + - postgres.telegraf_users + - queue: True + +{% if MINION and MINION != MANAGER %} +{{ MINION }}_apply_telegraf: + salt.state: + - tgt: {{ MINION }} + - sls: + - telegraf + - queue: True + - require: + - salt: manager_sync_telegraf_pg_users +{% endif %} diff --git a/salt/postgres/auth.sls b/salt/postgres/auth.sls index a19b2341a..3da1bcde0 100644 --- a/salt/postgres/auth.sls +++ b/salt/postgres/auth.sls @@ -13,6 +13,24 @@ {% set CHARS = DIGITS~LOWERCASE~UPPERCASE~SYMBOLS %} {% set so_postgres_user_pass = salt['pillar.get']('postgres:auth:users:so_postgres_user:pass', salt['random.get_str'](72, chars=CHARS)) %} + {# Per-minion Telegraf Postgres credentials. Merge currently-up minions with any #} + {# previously-known entries in pillar so existing passwords persist across runs. 
#} + {% set existing = salt['pillar.get']('postgres:auth:users', {}) %} + {% set up_minions = salt['saltutil.runner']('manage.up') or [] %} + {% set telegraf_users = {} %} + {% for key, entry in existing.items() %} + {%- if key.startswith('telegraf_') and entry.get('user') and entry.get('pass') %} + {%- do telegraf_users.update({key: entry}) %} + {%- endif %} + {% endfor %} + {% for mid in up_minions %} + {%- set safe = mid | replace('.','_') | replace('-','_') | lower %} + {%- set key = 'telegraf_' ~ safe %} + {%- if key not in telegraf_users %} + {%- do telegraf_users.update({key: {'user': 'so_telegraf_' ~ safe, 'pass': salt['random.get_str'](72, chars=CHARS)}}) %} + {%- endif %} + {% endfor %} + postgres_auth_pillar: file.managed: - name: /opt/so/saltstack/local/pillar/postgres/auth.sls @@ -25,6 +43,11 @@ postgres_auth_pillar: so_postgres_user: user: so_postgres pass: "{{ so_postgres_user_pass }}" + {% for key, entry in telegraf_users.items() %} + {{ key }}: + user: {{ entry.user }} + pass: "{{ entry.pass }}" + {% endfor %} - show_changes: False {% else %} diff --git a/salt/postgres/defaults.yaml b/salt/postgres/defaults.yaml index c24a07f56..dd7994044 100644 --- a/salt/postgres/defaults.yaml +++ b/salt/postgres/defaults.yaml @@ -1,5 +1,7 @@ postgres: enabled: True + telegraf: + retention_days: 14 config: listen_addresses: '*' port: 5432 diff --git a/salt/postgres/enabled.sls b/salt/postgres/enabled.sls index c103245ea..24e348365 100644 --- a/salt/postgres/enabled.sls +++ b/salt/postgres/enabled.sls @@ -16,6 +16,7 @@ include: - postgres.ssl - postgres.config - postgres.sostatus + - postgres.telegraf_users so-postgres: docker_container.running: @@ -79,6 +80,21 @@ delete_so-postgres_so-status.disabled: - name: /opt/so/conf/so-status/so-status.conf - regex: ^so-postgres$ +so_telegraf_trim: +{% if GLOBALS.telegraf_output in ['POSTGRES', 'BOTH'] %} + cron.present: +{% else %} + cron.absent: +{% endif %} + - name: /usr/sbin/so-telegraf-trim >> 
/opt/so/log/postgres/telegraf-trim.log 2>&1 + - identifier: so_telegraf_trim + - user: root + - minute: '17' + - hour: '3' + - daymonth: '*' + - month: '*' + - dayweek: '*' + {% else %} {{sls}}_state_not_allowed: diff --git a/salt/postgres/files/init-users.sh b/salt/postgres/files/init-users.sh index 7451e0bf8..b07dfcdb0 100644 --- a/salt/postgres/files/init-users.sh +++ b/salt/postgres/files/init-users.sh @@ -16,3 +16,11 @@ psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-E \$\$; GRANT ALL PRIVILEGES ON DATABASE "$POSTGRES_DB" TO "$SO_POSTGRES_USER"; EOSQL + +# Bootstrap the Telegraf metrics database. Per-minion roles + schemas are +# reconciled on every state.apply by postgres/telegraf_users.sls; this block +# only ensures the shared database exists on first initialization. +psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL + SELECT 'CREATE DATABASE so_telegraf' + WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'so_telegraf')\gexec +EOSQL diff --git a/salt/postgres/soc_postgres.yaml b/salt/postgres/soc_postgres.yaml new file mode 100644 index 000000000..167772e3f --- /dev/null +++ b/salt/postgres/soc_postgres.yaml @@ -0,0 +1,7 @@ +postgres: + telegraf: + retention_days: + description: Number of days of Telegraf metrics to keep in the so_telegraf database. Older rows are deleted nightly by so-telegraf-trim. + forcedType: int + advanced: True + helpLink: influxdb diff --git a/salt/postgres/telegraf_users.sls b/salt/postgres/telegraf_users.sls new file mode 100644 index 000000000..d510af9e5 --- /dev/null +++ b/salt/postgres/telegraf_users.sls @@ -0,0 +1,49 @@ +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. 
+ +{% from 'allowed_states.map.jinja' import allowed_states %} +{% if sls.split('.')[0] in allowed_states %} +{% from 'vars/globals.map.jinja' import GLOBALS %} + +{% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %} +{% if TG_OUT in ['POSTGRES', 'BOTH'] %} + +{% set users = salt['pillar.get']('postgres:auth:users', {}) %} +{% for key, entry in users.items() %} +{% if key.startswith('telegraf_') and entry.get('user') and entry.get('pass') %} +{% set u = entry.user %} +{% set p = entry.pass | replace("'", "''") %} + +postgres_telegraf_role_{{ u }}: + cmd.run: + - name: | + docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL' + DO $$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{{ u }}') THEN + EXECUTE format('CREATE ROLE %I WITH LOGIN PASSWORD %L', '{{ u }}', '{{ p }}'); + ELSE + EXECUTE format('ALTER ROLE %I WITH PASSWORD %L', '{{ u }}', '{{ p }}'); + END IF; + END + $$; + GRANT CONNECT ON DATABASE so_telegraf TO "{{ u }}"; + CREATE SCHEMA IF NOT EXISTS "{{ u }}" AUTHORIZATION "{{ u }}"; + EOSQL + - require: + - docker_container: so-postgres + +{% endif %} +{% endfor %} + +{% endif %} + +{% else %} + +{{sls}}_state_not_allowed: + test.fail_without_changes: + - name: {{sls}}_state_not_allowed + +{% endif %} diff --git a/salt/postgres/tools/sbin/so-show-stats b/salt/postgres/tools/sbin/so-show-stats new file mode 100644 index 000000000..a512ffb0c --- /dev/null +++ b/salt/postgres/tools/sbin/so-show-stats @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +# Point-in-time host metrics from the Telegraf Postgres backend. 
+# Sanity-check tool for verifying metrics are landing before the grid +# dashboards consume them. + +. /usr/sbin/so-common + +usage() { + cat </dev/null | cut -d\| -f1 | grep -qw so_telegraf; then + echo "Database so_telegraf not found. Is global.telegraf_output set to POSTGRES or BOTH?" + exit 2 +fi + +# List telegraf schemas (role-per-minion naming convention: so_telegraf_) +SCHEMAS=$(so_psql -c "SELECT schema_name FROM information_schema.schemata WHERE schema_name LIKE 'so_telegraf_%' ORDER BY schema_name;") + +if [ -z "$SCHEMAS" ]; then + echo "No minion schemas found in so_telegraf." + exit 0 +fi + +print_metric() { + local schema="$1" table="$2" query="$3" + # Confirm table exists in this schema before querying + local exists + exists=$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${schema}' AND table_name='${table}' LIMIT 1;") + [ -z "$exists" ] && return 0 + so_psql -c "$query" +} + +for schema in $SCHEMAS; do + minion="${schema#so_telegraf_}" + if [ -n "$FILTER_MINION" ]; then + # Compare against the sanitized form used in schema names + want=$(echo "$FILTER_MINION" | tr '.-' '_' | tr '[:upper:]' '[:lower:]') + [ "$minion" != "$want" ] && continue + fi + + echo "====================================================================" + echo " Minion: $minion" + echo "====================================================================" + + print_metric "$schema" "cpu" " + SELECT 'cpu ' AS metric, + to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + round((100 - usage_idle)::numeric, 1) || '% used' + FROM \"${schema}\".cpu + WHERE cpu = 'cpu-total' + ORDER BY time DESC LIMIT 1;" + + print_metric "$schema" "mem" " + SELECT 'memory ' AS metric, + to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + round(used_percent::numeric, 1) || '% used (' || + pg_size_pretty(used) || ' of ' || pg_size_pretty(total) || ')' + FROM \"${schema}\".mem + ORDER BY time DESC LIMIT 1;" + + print_metric "$schema" "disk" " + SELECT 'disk ' || rpad(path, 8) AS 
metric, + to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + round(used_percent::numeric, 1) || '% used (' || + pg_size_pretty(used) || ' of ' || pg_size_pretty(total) || ')' + FROM \"${schema}\".disk + WHERE time = (SELECT max(time) FROM \"${schema}\".disk) + ORDER BY path;" + + print_metric "$schema" "system" " + SELECT 'load ' AS metric, + to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + load1 || ' / ' || load5 || ' / ' || load15 || ' (1/5/15m)' + FROM \"${schema}\".system + ORDER BY time DESC LIMIT 1;" + + echo "" +done diff --git a/salt/postgres/tools/sbin/so-telegraf-trim b/salt/postgres/tools/sbin/so-telegraf-trim new file mode 100644 index 000000000..0bf53c1d8 --- /dev/null +++ b/salt/postgres/tools/sbin/so-telegraf-trim @@ -0,0 +1,103 @@ +#!/bin/bash + +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +# Deletes Telegraf metric rows older than the configured retention window from +# every minion schema in the so_telegraf database. Intended to run daily from +# cron. Retention comes from pillar (postgres.telegraf.retention_days), +# defaulting to 14 days. An explicit --days argument overrides the pillar. + +. /usr/sbin/so-common + +usage() { + cat </dev/null) +fi +if ! [[ "$DAYS" =~ ^[0-9]+$ ]] || [ "$DAYS" -lt 1 ]; then + DAYS=14 +fi + +log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') so-telegraf-trim: $*" +} + +so_psql() { + docker exec so-postgres psql -U postgres -d so_telegraf -At -F $'\t' "$@" +} + +if ! docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d\| -f1 | grep -qw so_telegraf; then + log "Database so_telegraf not present; nothing to trim." + exit 0 +fi + +log "Trimming rows older than ${DAYS} days (dry_run=${DRY_RUN})." 
+ +TOTAL_DELETED=0 + +# One row per (schema, table) we might want to trim. +# Column name is 'time' for all telegraf output plugin tables; skip metadata +# tables (tag_* used for tags_as_foreign_keys). +ROWS=$(so_psql -c " + SELECT table_schema || '.' || table_name + FROM information_schema.columns + WHERE column_name = 'time' + AND data_type IN ('timestamp with time zone', 'timestamp without time zone') + AND table_schema LIKE 'so_telegraf_%' + ORDER BY 1;") + +if [ -z "$ROWS" ]; then + log "No telegraf metric tables found." + exit 0 +fi + +for qualified in $ROWS; do + if [ "$DRY_RUN" -eq 1 ]; then + count=$(so_psql -c "SELECT count(*) FROM \"${qualified%.*}\".\"${qualified#*.}\" WHERE time < now() - interval '${DAYS} days';") + log "would delete ${count:-0} rows from ${qualified}" + else + # RETURNING count via a CTE so we can log how much was trimmed per table + deleted=$(so_psql -c " + WITH d AS ( + DELETE FROM \"${qualified%.*}\".\"${qualified#*.}\" + WHERE time < now() - interval '${DAYS} days' + RETURNING 1 + ) + SELECT count(*) FROM d;") + deleted=${deleted:-0} + TOTAL_DELETED=$((TOTAL_DELETED + deleted)) + [ "$deleted" -gt 0 ] && log "deleted ${deleted} rows from ${qualified}" + fi +done + +if [ "$DRY_RUN" -eq 0 ]; then + log "Trim complete. Total rows deleted: ${TOTAL_DELETED}." +fi diff --git a/salt/reactor/telegraf_user_sync.sls b/salt/reactor/telegraf_user_sync.sls new file mode 100644 index 000000000..abf35d3b2 --- /dev/null +++ b/salt/reactor/telegraf_user_sync.sls @@ -0,0 +1,18 @@ +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +{# Fires on salt/auth. Only act on accepted keys — ignore pending/reject. 
#} +{% if data.get('act') == 'accept' and data.get('id') %} + +{{ data['id'] }}_telegraf_pg_sync: + runner.state.orchestrate: + - args: + - mods: orch.telegraf_postgres_sync + - pillar: + minion_id: {{ data['id'] }} + +{% do salt.log.info('telegraf_user_sync reactor: syncing telegraf PG user for minion %s' % data['id']) %} + +{% endif %} diff --git a/salt/salt/master.sls b/salt/salt/master.sls index 895150cd7..7e3e48074 100644 --- a/salt/salt/master.sls +++ b/salt/salt/master.sls @@ -62,6 +62,19 @@ engines_config: - name: /etc/salt/master.d/engines.conf - source: salt://salt/files/engines.conf +reactor_config_telegraf: + file.managed: + - name: /etc/salt/master.d/reactor_telegraf.conf + - contents: | + reactor: + - 'salt/auth': + - /opt/so/saltstack/default/salt/reactor/telegraf_user_sync.sls + - user: root + - group: root + - mode: 644 + - watch_in: + - service: salt_master_service + # update the bootstrap script when used for salt-cloud salt_bootstrap_cloud: file.managed: diff --git a/salt/telegraf/etc/telegraf.conf b/salt/telegraf/etc/telegraf.conf index aafcf6d77..4cdd81f20 100644 --- a/salt/telegraf/etc/telegraf.conf +++ b/salt/telegraf/etc/telegraf.conf @@ -8,6 +8,11 @@ {%- set ZEEK_ENABLED = salt['pillar.get']('zeek:enabled', True) %} {%- set MDENGINE = GLOBALS.md_engine %} {%- set LOGSTASH_ENABLED = LOGSTASH_MERGED.enabled %} +{%- set TG_OUT = GLOBALS.telegraf_output | upper %} +{%- set PG_HOST = GLOBALS.manager_ip %} +{%- set PG_SAFE = GLOBALS.minion_id | replace('.','_') | replace('-','_') | lower %} +{%- set PG_USER = 'so_telegraf_' ~ PG_SAFE %} +{%- set PG_PASS = salt['pillar.get']('postgres:auth:users:telegraf_' ~ PG_SAFE ~ ':pass', '') %} # Global tags can be specified here in key="value" format. 
[global_tags] role = "{{ GLOBALS.role.split('-') | last }}" @@ -72,6 +77,7 @@ # OUTPUT PLUGINS # ############################################################################### +{%- if TG_OUT in ['INFLUXDB', 'BOTH'] %} # Configuration for sending metrics to InfluxDB [[outputs.influxdb_v2]] urls = ["https://{{ INFLUXDBHOST }}:8086"] @@ -85,6 +91,15 @@ tls_key = "/etc/telegraf/telegraf.key" ## Use TLS but skip chain & host verification # insecure_skip_verify = false +{%- endif %} + +{%- if TG_OUT in ['POSTGRES', 'BOTH'] %} +# Configuration for sending metrics to PostgreSQL +[[outputs.postgresql]] + connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt" + schema = "{{ PG_USER }}" + tags_as_foreign_keys = true +{%- endif %} ############################################################################### # PROCESSOR PLUGINS # diff --git a/salt/vars/globals.map.jinja b/salt/vars/globals.map.jinja index 385db02ae..787691b13 100644 --- a/salt/vars/globals.map.jinja +++ b/salt/vars/globals.map.jinja @@ -24,6 +24,7 @@ 'md_engine': INIT.PILLAR.global.mdengine, 'pcap_engine': GLOBALMERGED.pcapengine, 'pipeline': GLOBALMERGED.pipeline, + 'telegraf_output': GLOBALMERGED.telegraf_output, 'so_version': INIT.PILLAR.global.soversion, 'so_docker_gateway': DOCKERMERGED.gateway, 'so_docker_range': DOCKERMERGED.range, From d24808ff9854376f5e842312e6d69c27c572d39f Mon Sep 17 00:00:00 2001 From: Mike Reeves Date: Wed, 15 Apr 2026 19:28:10 -0400 Subject: [PATCH 3/5] Fix so-show-stats tag column resolution Telegraf's postgresql output stores tag values either as individual columns on _tag or as a single JSONB 'tags' column, depending on plugin version. Introspect information_schema.columns and build the right accessor per tag instead of assuming one layout. 
--- salt/postgres/tools/sbin/so-show-stats | 83 ++++++++++++++++++-------- 1 file changed, 59 insertions(+), 24 deletions(-) diff --git a/salt/postgres/tools/sbin/so-show-stats b/salt/postgres/tools/sbin/so-show-stats index a512ffb0c..68fd52d00 100644 --- a/salt/postgres/tools/sbin/so-show-stats +++ b/salt/postgres/tools/sbin/so-show-stats @@ -62,6 +62,33 @@ print_metric() { so_psql -c "$query" } +# Telegraf's postgresql output stores tag values either as individual columns +# on the _tag table or as a single JSONB "tags" column, depending on +# plugin version. Returns a SQL expression that extracts the named tag +# regardless of layout. Empty string if the tag table doesn't exist. +tag_expr() { + local schema="$1" table="$2" tag="$3" alias="$4" + local has_col + has_col=$(so_psql -c " + SELECT 1 FROM information_schema.columns + WHERE table_schema='${schema}' AND table_name='${table}_tag' AND column_name='${tag}' + LIMIT 1;") + if [ -n "$has_col" ]; then + echo "${alias}.${tag}" + return + fi + local has_tags + has_tags=$(so_psql -c " + SELECT 1 FROM information_schema.columns + WHERE table_schema='${schema}' AND table_name='${table}_tag' AND column_name='tags' + LIMIT 1;") + if [ -n "$has_tags" ]; then + echo "(${alias}.tags->>'${tag}')" + return + fi + echo "" +} + for schema in $SCHEMAS; do minion="${schema#so_telegraf_}" if [ -n "$FILTER_MINION" ]; then @@ -74,37 +101,45 @@ for schema in $SCHEMAS; do echo " Minion: $minion" echo "====================================================================" - print_metric "$schema" "cpu" " - SELECT 'cpu ' AS metric, - to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - round((100 - usage_idle)::numeric, 1) || '% used' - FROM \"${schema}\".cpu - WHERE cpu = 'cpu-total' - ORDER BY time DESC LIMIT 1;" + cpu_tag=$(tag_expr "$schema" "cpu" "cpu" "t") + if [ -n "$cpu_tag" ]; then + print_metric "$schema" "cpu" " + SELECT 'cpu ' AS metric, + to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + round((100 - c.usage_idle)::numeric, 1) 
|| '% used' + FROM \"${schema}\".cpu c + JOIN \"${schema}\".cpu_tag t USING (tag_id) + WHERE ${cpu_tag} = 'cpu-total' + ORDER BY c.time DESC LIMIT 1;" + fi print_metric "$schema" "mem" " SELECT 'memory ' AS metric, - to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - round(used_percent::numeric, 1) || '% used (' || - pg_size_pretty(used) || ' of ' || pg_size_pretty(total) || ')' - FROM \"${schema}\".mem - ORDER BY time DESC LIMIT 1;" + to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + round(m.used_percent::numeric, 1) || '% used (' || + pg_size_pretty(m.used) || ' of ' || pg_size_pretty(m.total) || ')' + FROM \"${schema}\".mem m + ORDER BY m.time DESC LIMIT 1;" - print_metric "$schema" "disk" " - SELECT 'disk ' || rpad(path, 8) AS metric, - to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - round(used_percent::numeric, 1) || '% used (' || - pg_size_pretty(used) || ' of ' || pg_size_pretty(total) || ')' - FROM \"${schema}\".disk - WHERE time = (SELECT max(time) FROM \"${schema}\".disk) - ORDER BY path;" + disk_path=$(tag_expr "$schema" "disk" "path" "t") + if [ -n "$disk_path" ]; then + print_metric "$schema" "disk" " + SELECT 'disk ' || rpad(${disk_path}, 12) AS metric, + to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + round(d.used_percent::numeric, 1) || '% used (' || + pg_size_pretty(d.used) || ' of ' || pg_size_pretty(d.total) || ')' + FROM \"${schema}\".disk d + JOIN \"${schema}\".disk_tag t USING (tag_id) + WHERE d.time = (SELECT max(time) FROM \"${schema}\".disk) + ORDER BY ${disk_path};" + fi print_metric "$schema" "system" " SELECT 'load ' AS metric, - to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - load1 || ' / ' || load5 || ' / ' || load15 || ' (1/5/15m)' - FROM \"${schema}\".system - ORDER BY time DESC LIMIT 1;" + to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + s.load1 || ' / ' || s.load5 || ' / ' || s.load15 || ' (1/5/15m)' + FROM \"${schema}\".system s + ORDER BY s.time DESC LIMIT 1;" echo "" done From c12418698975007d3f63c4bc4002d2d876a9a629 Mon Sep 17 
00:00:00 2001 From: Mike Reeves Date: Wed, 15 Apr 2026 19:45:42 -0400 Subject: [PATCH 4/5] so-log-check: exclude psql ON_ERROR_STOP flag The psql invocation flag '-v ON_ERROR_STOP=1' used by the so-postgres init script gets flagged by so-log-check because the token 'ERROR' matches its error regex. Add to the exclusion list. --- salt/common/tools/sbin/so-log-check | 1 + 1 file changed, 1 insertion(+) diff --git a/salt/common/tools/sbin/so-log-check b/salt/common/tools/sbin/so-log-check index 8c8bbf35c..d8446d6fe 100755 --- a/salt/common/tools/sbin/so-log-check +++ b/salt/common/tools/sbin/so-log-check @@ -229,6 +229,7 @@ if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then EXCLUDED_ERRORS="$EXCLUDED_ERRORS|tcp 127.0.0.1:6791: bind: address already in use" # so-elastic-fleet agent restarting. Seen starting w/ 8.18.8 https://github.com/elastic/kibana/issues/201459 EXCLUDED_ERRORS="$EXCLUDED_ERRORS|TransformTask\] \[logs-(tychon|aws_billing|microsoft_defender_endpoint).*user so_kibana lacks the required permissions \[logs-\1" # Known issue with 3 integrations using kibana_system role vs creating unique api creds with proper permissions. EXCLUDED_ERRORS="$EXCLUDED_ERRORS|manifest unknown" # appears in so-dockerregistry log for so-tcpreplay following docker upgrade to 29.2.1-1 + EXCLUDED_ERRORS="$EXCLUDED_ERRORS|-v ON_ERROR_STOP=1" # psql invocation flag from so-postgres init script, not an actual error fi RESULT=0 From 470b3bd4da726e7cce13cbc82aab9b897da4c342 Mon Sep 17 00:00:00 2001 From: Mike Reeves Date: Thu, 16 Apr 2026 15:40:54 -0400 Subject: [PATCH 5/5] Comingle Telegraf metrics into shared schema Per-minion schemas cause table count to explode (N minions * M metrics) and the per-minion revocation story isn't worth it when retention is short. Move all minions to a shared 'telegraf' schema while keeping per-minion login credentials for audit. 
- New so_telegraf NOLOGIN group role owns the telegraf schema; each per-minion role is a member and inherits insert/select via role inheritance - Telegraf connection string uses options='-c role=so_telegraf' so tables auto-created on first write belong to the group role - so-telegraf-trim walks the flat telegraf.* table set instead of per-minion schemas - so-stats-show filters by host tag; CLI arg is now the hostname as tagged by Telegraf rather than a sanitized schema suffix - Also renames so-show-stats -> so-stats-show --- salt/postgres/telegraf_users.sls | 26 +++- salt/postgres/tools/sbin/so-show-stats | 145 ------------------ salt/postgres/tools/sbin/so-stats-show | 170 ++++++++++++++++++++++ salt/postgres/tools/sbin/so-telegraf-trim | 8 +- salt/telegraf/etc/telegraf.conf | 9 +- 5 files changed, 204 insertions(+), 154 deletions(-) delete mode 100644 salt/postgres/tools/sbin/so-show-stats create mode 100644 salt/postgres/tools/sbin/so-stats-show diff --git a/salt/postgres/telegraf_users.sls b/salt/postgres/telegraf_users.sls index d510af9e5..7d62ee7f0 100644 --- a/salt/postgres/telegraf_users.sls +++ b/salt/postgres/telegraf_users.sls @@ -10,6 +10,28 @@ {% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %} {% if TG_OUT in ['POSTGRES', 'BOTH'] %} +# Provision the shared group role and schema once. Every per-minion role is a +# member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf +# (via options='-c role=so_telegraf' in the connection string) so tables created +# on first write are owned by the group role and every member can INSERT/SELECT. 
+postgres_telegraf_group_role: + cmd.run: + - name: | + docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL' + DO $$ + BEGIN + IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'so_telegraf') THEN + CREATE ROLE so_telegraf NOLOGIN; + END IF; + END + $$; + GRANT CONNECT ON DATABASE so_telegraf TO so_telegraf; + CREATE SCHEMA IF NOT EXISTS telegraf AUTHORIZATION so_telegraf; + GRANT USAGE, CREATE ON SCHEMA telegraf TO so_telegraf; + EOSQL + - require: + - docker_container: so-postgres + {% set users = salt['pillar.get']('postgres:auth:users', {}) %} {% for key, entry in users.items() %} {% if key.startswith('telegraf_') and entry.get('user') and entry.get('pass') %} @@ -30,10 +52,10 @@ postgres_telegraf_role_{{ u }}: END $$; GRANT CONNECT ON DATABASE so_telegraf TO "{{ u }}"; - CREATE SCHEMA IF NOT EXISTS "{{ u }}" AUTHORIZATION "{{ u }}"; + GRANT so_telegraf TO "{{ u }}"; EOSQL - require: - - docker_container: so-postgres + - cmd: postgres_telegraf_group_role {% endif %} {% endfor %} diff --git a/salt/postgres/tools/sbin/so-show-stats b/salt/postgres/tools/sbin/so-show-stats deleted file mode 100644 index 68fd52d00..000000000 --- a/salt/postgres/tools/sbin/so-show-stats +++ /dev/null @@ -1,145 +0,0 @@ -#!/bin/bash - -# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one -# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at -# https://securityonion.net/license; you may not use this file except in compliance with the -# Elastic License 2.0. - -# Point-in-time host metrics from the Telegraf Postgres backend. -# Sanity-check tool for verifying metrics are landing before the grid -# dashboards consume them. - -. /usr/sbin/so-common - -usage() { - cat </dev/null | cut -d\| -f1 | grep -qw so_telegraf; then - echo "Database so_telegraf not found. Is global.telegraf_output set to POSTGRES or BOTH?" 
- exit 2 -fi - -# List telegraf schemas (role-per-minion naming convention: so_telegraf_) -SCHEMAS=$(so_psql -c "SELECT schema_name FROM information_schema.schemata WHERE schema_name LIKE 'so_telegraf_%' ORDER BY schema_name;") - -if [ -z "$SCHEMAS" ]; then - echo "No minion schemas found in so_telegraf." - exit 0 -fi - -print_metric() { - local schema="$1" table="$2" query="$3" - # Confirm table exists in this schema before querying - local exists - exists=$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${schema}' AND table_name='${table}' LIMIT 1;") - [ -z "$exists" ] && return 0 - so_psql -c "$query" -} - -# Telegraf's postgresql output stores tag values either as individual columns -# on the _tag table or as a single JSONB "tags" column, depending on -# plugin version. Returns a SQL expression that extracts the named tag -# regardless of layout. Empty string if the tag table doesn't exist. -tag_expr() { - local schema="$1" table="$2" tag="$3" alias="$4" - local has_col - has_col=$(so_psql -c " - SELECT 1 FROM information_schema.columns - WHERE table_schema='${schema}' AND table_name='${table}_tag' AND column_name='${tag}' - LIMIT 1;") - if [ -n "$has_col" ]; then - echo "${alias}.${tag}" - return - fi - local has_tags - has_tags=$(so_psql -c " - SELECT 1 FROM information_schema.columns - WHERE table_schema='${schema}' AND table_name='${table}_tag' AND column_name='tags' - LIMIT 1;") - if [ -n "$has_tags" ]; then - echo "(${alias}.tags->>'${tag}')" - return - fi - echo "" -} - -for schema in $SCHEMAS; do - minion="${schema#so_telegraf_}" - if [ -n "$FILTER_MINION" ]; then - # Compare against the sanitized form used in schema names - want=$(echo "$FILTER_MINION" | tr '.-' '_' | tr '[:upper:]' '[:lower:]') - [ "$minion" != "$want" ] && continue - fi - - echo "====================================================================" - echo " Minion: $minion" - echo "====================================================================" - - 
cpu_tag=$(tag_expr "$schema" "cpu" "cpu" "t") - if [ -n "$cpu_tag" ]; then - print_metric "$schema" "cpu" " - SELECT 'cpu ' AS metric, - to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - round((100 - c.usage_idle)::numeric, 1) || '% used' - FROM \"${schema}\".cpu c - JOIN \"${schema}\".cpu_tag t USING (tag_id) - WHERE ${cpu_tag} = 'cpu-total' - ORDER BY c.time DESC LIMIT 1;" - fi - - print_metric "$schema" "mem" " - SELECT 'memory ' AS metric, - to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - round(m.used_percent::numeric, 1) || '% used (' || - pg_size_pretty(m.used) || ' of ' || pg_size_pretty(m.total) || ')' - FROM \"${schema}\".mem m - ORDER BY m.time DESC LIMIT 1;" - - disk_path=$(tag_expr "$schema" "disk" "path" "t") - if [ -n "$disk_path" ]; then - print_metric "$schema" "disk" " - SELECT 'disk ' || rpad(${disk_path}, 12) AS metric, - to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - round(d.used_percent::numeric, 1) || '% used (' || - pg_size_pretty(d.used) || ' of ' || pg_size_pretty(d.total) || ')' - FROM \"${schema}\".disk d - JOIN \"${schema}\".disk_tag t USING (tag_id) - WHERE d.time = (SELECT max(time) FROM \"${schema}\".disk) - ORDER BY ${disk_path};" - fi - - print_metric "$schema" "system" " - SELECT 'load ' AS metric, - to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - s.load1 || ' / ' || s.load5 || ' / ' || s.load15 || ' (1/5/15m)' - FROM \"${schema}\".system s - ORDER BY s.time DESC LIMIT 1;" - - echo "" -done diff --git a/salt/postgres/tools/sbin/so-stats-show b/salt/postgres/tools/sbin/so-stats-show new file mode 100644 index 000000000..fd8dff39f --- /dev/null +++ b/salt/postgres/tools/sbin/so-stats-show @@ -0,0 +1,170 @@ +#!/bin/bash + +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. 
+ +# Point-in-time host metrics from the Telegraf Postgres backend. +# Sanity-check tool for verifying metrics are landing before the grid +# dashboards consume them. + +. /usr/sbin/so-common + +usage() { + cat </dev/null | cut -d\| -f1 | grep -qw so_telegraf; then + echo "Database so_telegraf not found. Is global.telegraf_output set to POSTGRES or BOTH?" + exit 2 +fi + +# Telegraf's postgresql output stores tag values either as individual columns +# on the measurement's _tag table (e.g. cpu_tag) or as a single JSONB "tags" column, depending on +# plugin version. Returns a SQL expression that extracts the named tag +# regardless of layout. Echoes an empty string when the tag table is missing or has neither column. +tag_expr() { + local table="$1" tag="$2" alias="$3" + local has_col + has_col=$(so_psql -c " + SELECT 1 FROM information_schema.columns + WHERE table_schema='${SCHEMA}' AND table_name='${table}_tag' AND column_name='${tag}' + LIMIT 1;") + if [ -n "$has_col" ]; then + echo "${alias}.${tag}" + return + fi + local has_tags + has_tags=$(so_psql -c " + SELECT 1 FROM information_schema.columns + WHERE table_schema='${SCHEMA}' AND table_name='${table}_tag' AND column_name='tags' + LIMIT 1;") + if [ -n "$has_tags" ]; then + echo "(${alias}.tags->>'${tag}')" + return + fi + echo "" +} + +table_exists() { + local table="$1" + [ -n "$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${SCHEMA}' AND table_name='${table}' LIMIT 1;")" ] +} + +# Discover hosts from cpu_tag (every minion reports cpu). +host_expr=$(tag_expr "cpu" "host" "t") +if [ -z "$host_expr" ]; then + echo "Unable to determine host tag column on ${SCHEMA}.cpu_tag. Has Telegraf written any rows yet?" + exit 0 +fi + +HOSTS=$(so_psql -c " + SELECT DISTINCT ${host_expr} + FROM \"${SCHEMA}\".cpu_tag t + WHERE ${host_expr} IS NOT NULL + ORDER BY 1;") + +if [ -z "$HOSTS" ]; then + echo "No hosts found in ${SCHEMA}. Is Telegraf configured to write to Postgres?" 
+ exit 0 +fi + +print_metric() { + local query="$1" + so_psql -c "$query" +} + +for host in $HOSTS; do + if [ -n "$FILTER_HOST" ] && [ "$host" != "$FILTER_HOST" ]; then + continue + fi + + echo "====================================================================" + echo " Host: $host" + echo "====================================================================" + + cpu_host=$(tag_expr "cpu" "host" "t") + cpu_tag=$(tag_expr "cpu" "cpu" "t") + if [ -n "$cpu_host" ] && [ -n "$cpu_tag" ]; then + print_metric " + SELECT 'cpu ' AS metric, + to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + round((100 - c.usage_idle)::numeric, 1) || '% used' + FROM \"${SCHEMA}\".cpu c + JOIN \"${SCHEMA}\".cpu_tag t USING (tag_id) + WHERE ${cpu_host} = '${host}' AND ${cpu_tag} = 'cpu-total' + ORDER BY c.time DESC LIMIT 1;" + fi + + mem_host=$(tag_expr "mem" "host" "t") + if [ -n "$mem_host" ] && table_exists "mem"; then + print_metric " + SELECT 'memory ' AS metric, + to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + round(m.used_percent::numeric, 1) || '% used (' || + pg_size_pretty(m.used) || ' of ' || pg_size_pretty(m.total) || ')' + FROM \"${SCHEMA}\".mem m + JOIN \"${SCHEMA}\".mem_tag t USING (tag_id) + WHERE ${mem_host} = '${host}' + ORDER BY m.time DESC LIMIT 1;" + fi + + disk_host=$(tag_expr "disk" "host" "t") + disk_path=$(tag_expr "disk" "path" "t") + if [ -n "$disk_host" ] && [ -n "$disk_path" ] && table_exists "disk"; then + print_metric " + SELECT 'disk ' || rpad(${disk_path}, 12) AS metric, + to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + round(d.used_percent::numeric, 1) || '% used (' || + pg_size_pretty(d.used) || ' of ' || pg_size_pretty(d.total) || ')' + FROM \"${SCHEMA}\".disk d + JOIN \"${SCHEMA}\".disk_tag t USING (tag_id) + WHERE ${disk_host} = '${host}' + AND d.time = (SELECT max(d2.time) + FROM \"${SCHEMA}\".disk d2 + JOIN \"${SCHEMA}\".disk_tag t2 USING (tag_id) + WHERE ${disk_host/t./t2.} = '${host}') + ORDER BY ${disk_path};" + fi + + sys_host=$(tag_expr 
"system" "host" "t") + if [ -n "$sys_host" ] && table_exists "system"; then + print_metric " + SELECT 'load ' AS metric, + to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, + s.load1 || ' / ' || s.load5 || ' / ' || s.load15 || ' (1/5/15m)' + FROM \"${SCHEMA}\".system s + JOIN \"${SCHEMA}\".system_tag t USING (tag_id) + WHERE ${sys_host} = '${host}' + ORDER BY s.time DESC LIMIT 1;" + fi + + echo "" +done diff --git a/salt/postgres/tools/sbin/so-telegraf-trim b/salt/postgres/tools/sbin/so-telegraf-trim index 0bf53c1d8..664469d0c 100644 --- a/salt/postgres/tools/sbin/so-telegraf-trim +++ b/salt/postgres/tools/sbin/so-telegraf-trim @@ -63,15 +63,15 @@ log "Trimming rows older than ${DAYS} days (dry_run=${DRY_RUN})." TOTAL_DELETED=0 -# One row per (schema, table) we might want to trim. -# Column name is 'time' for all telegraf output plugin tables; skip metadata -# tables (tag_* used for tags_as_foreign_keys). +# Every metric table in the shared telegraf schema has a 'time' column. +# Tag tables (e.g. cpu_tag) don't, so filtering on the column presence is +# enough to scope the trim to metric tables only. ROWS=$(so_psql -c " SELECT table_schema || '.' || table_name FROM information_schema.columns WHERE column_name = 'time' AND data_type IN ('timestamp with time zone', 'timestamp without time zone') - AND table_schema LIKE 'so_telegraf_%' + AND table_schema = 'telegraf' ORDER BY 1;") if [ -z "$ROWS" ]; then diff --git a/salt/telegraf/etc/telegraf.conf b/salt/telegraf/etc/telegraf.conf index 4cdd81f20..4f0c279cc 100644 --- a/salt/telegraf/etc/telegraf.conf +++ b/salt/telegraf/etc/telegraf.conf @@ -94,10 +94,13 @@ {%- endif %} {%- if TG_OUT in ['POSTGRES', 'BOTH'] %} -# Configuration for sending metrics to PostgreSQL +# Configuration for sending metrics to PostgreSQL. 
+# options='-c role=so_telegraf' makes every connection SET ROLE to the shared +# group role so tables created on first write are owned by so_telegraf, and +# all per-minion members can INSERT/SELECT them via role inheritance. [[outputs.postgresql]] - connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt" - schema = "{{ PG_USER }}" + connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt options='-c role=so_telegraf'" + schema = "telegraf" tags_as_foreign_keys = true {%- endif %}