From 9fe53d9cccc2242ca056d927df66852f33d9be6b Mon Sep 17 00:00:00 2001 From: Mike Reeves Date: Thu, 16 Apr 2026 17:02:21 -0400 Subject: [PATCH] Use JSONB for Telegraf fields/tags to avoid 1600-column limit High-cardinality inputs (docker, procstat, kafka) trigger ALTER TABLE ADD COLUMN on every new field name, and with all minions writing into a shared 'telegraf' schema the metric tables hit Postgres's 1600-column per-table ceiling quickly. Setting fields_as_jsonb and tags_as_jsonb on the postgresql output keeps metric tables fixed at (time, tag_id, fields jsonb) and tag tables at (tag_id, tags jsonb). - so-stats-show rewritten to use JSONB accessors ((fields->>'x')::numeric, tags->>'host', etc.) and cast memory/disk sizes to bigint so pg_size_pretty works - Drop regex/regexFailureMessage from telegraf_output SOC UI entry to match the convention upstream used when removing them from mdengine/pcapengine/pipeline; options: list drives validation --- salt/global/soc_global.yaml | 2 - salt/postgres/tools/sbin/so-stats-show | 96 ++++++++++---------------- salt/telegraf/etc/telegraf.conf | 5 ++ 3 files changed, 40 insertions(+), 63 deletions(-) diff --git a/salt/global/soc_global.yaml b/salt/global/soc_global.yaml index 3430ef777..61646168f 100644 --- a/salt/global/soc_global.yaml +++ b/salt/global/soc_global.yaml @@ -61,12 +61,10 @@ global: advanced: True telegraf_output: description: Selects the backend(s) Telegraf writes metrics to. INFLUXDB keeps the current behavior; POSTGRES writes to the grid's Postgres instance; BOTH dual-writes for migration validation. - regex: ^(INFLUXDB|POSTGRES|BOTH)$ options: - INFLUXDB - POSTGRES - BOTH - regexFailureMessage: You must enter INFLUXDB, POSTGRES, or BOTH. global: True advanced: True helpLink: influxdb diff --git a/salt/postgres/tools/sbin/so-stats-show b/salt/postgres/tools/sbin/so-stats-show index fd8dff39f..bfc81887a 100644 --- a/salt/postgres/tools/sbin/so-stats-show +++ b/salt/postgres/tools/sbin/so-stats-show @@ -8,16 +8,21 @@ # Point-in-time host metrics from the Telegraf Postgres backend. # Sanity-check tool for verifying metrics are landing before the grid # dashboards consume them. +# +# Assumes Telegraf's postgresql output is configured with +# tags_as_foreign_keys = true, tags_as_jsonb = true, fields_as_jsonb = true, +# so metric tables are (time, tag_id, fields jsonb) and tag tables are +# (tag_id, tags jsonb). . /usr/sbin/so-common usage() { cat </dev/null | cut -d\| -f1 | exit 2 fi -# Telegraf's postgresql output stores tag values either as individual columns -# on the _tag table or as a single JSONB "tags" column, depending on -# plugin version. Returns a SQL expression that extracts the named tag -# regardless of layout. Empty string if the tag table doesn't exist. -tag_expr() { - local table="$1" tag="$2" alias="$3" - local has_col - has_col=$(so_psql -c " - SELECT 1 FROM information_schema.columns - WHERE table_schema='${SCHEMA}' AND table_name='${table}_tag' AND column_name='${tag}' - LIMIT 1;") - if [ -n "$has_col" ]; then - echo "${alias}.${tag}" - return - fi - local has_tags - has_tags=$(so_psql -c " - SELECT 1 FROM information_schema.columns - WHERE table_schema='${SCHEMA}' AND table_name='${table}_tag' AND column_name='tags' - LIMIT 1;") - if [ -n "$has_tags" ]; then - echo "(${alias}.tags->>'${tag}')" - return - fi - echo "" -} - table_exists() { local table="$1" [ -n "$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${SCHEMA}' AND table_name='${table}' LIMIT 1;")" ] } # Discover hosts from cpu_tag (every minion reports cpu). -host_expr=$(tag_expr "cpu" "host" "t") -if [ -z "$host_expr" ]; then - echo "Unable to determine host tag column on ${SCHEMA}.cpu_tag. Has Telegraf written any rows yet?" +if ! table_exists "cpu_tag"; then + echo "${SCHEMA}.cpu_tag not found. Has Telegraf written any rows yet?" exit 0 fi HOSTS=$(so_psql -c " - SELECT DISTINCT ${host_expr} - FROM \"${SCHEMA}\".cpu_tag t - WHERE ${host_expr} IS NOT NULL + SELECT DISTINCT tags->>'host' + FROM \"${SCHEMA}\".cpu_tag + WHERE tags ? 'host' ORDER BY 1;") if [ -z "$HOSTS" ]; then @@ -97,8 +74,7 @@ if [ -z "$HOSTS" ]; then fi print_metric() { - local query="$1" - so_psql -c "$query" + so_psql -c "$1" } for host in $HOSTS; do @@ -110,59 +86,57 @@ for host in $HOSTS; do echo " Host: $host" echo "====================================================================" - cpu_host=$(tag_expr "cpu" "host" "t") - cpu_tag=$(tag_expr "cpu" "cpu" "t") - if [ -n "$cpu_host" ] && [ -n "$cpu_tag" ]; then + if table_exists "cpu"; then print_metric " SELECT 'cpu ' AS metric, to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - round((100 - c.usage_idle)::numeric, 1) || '% used' + round((100 - (c.fields->>'usage_idle')::numeric), 1) || '% used' FROM \"${SCHEMA}\".cpu c JOIN \"${SCHEMA}\".cpu_tag t USING (tag_id) - WHERE ${cpu_host} = '${host}' AND ${cpu_tag} = 'cpu-total' + WHERE t.tags->>'host' = '${host}' AND t.tags->>'cpu' = 'cpu-total' ORDER BY c.time DESC LIMIT 1;" fi - mem_host=$(tag_expr "mem" "host" "t") - if [ -n "$mem_host" ] && table_exists "mem"; then + if table_exists "mem"; then print_metric " SELECT 'memory ' AS metric, to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - round(m.used_percent::numeric, 1) || '% used (' || - pg_size_pretty(m.used) || ' of ' || pg_size_pretty(m.total) || ')' + round((m.fields->>'used_percent')::numeric, 1) || '% used (' || + pg_size_pretty((m.fields->>'used')::bigint) || ' of ' || + pg_size_pretty((m.fields->>'total')::bigint) || ')' FROM \"${SCHEMA}\".mem m JOIN \"${SCHEMA}\".mem_tag t USING (tag_id) - WHERE ${mem_host} = '${host}' + WHERE t.tags->>'host' = '${host}' ORDER BY m.time DESC LIMIT 1;" fi - disk_host=$(tag_expr "disk" "host" "t") - disk_path=$(tag_expr "disk" "path" "t") - if [ -n "$disk_host" ] && [ -n "$disk_path" ] && table_exists "disk"; then + if table_exists "disk"; then print_metric " - SELECT 'disk ' || rpad(${disk_path}, 12) AS metric, + SELECT 'disk ' || rpad(t.tags->>'path', 12) AS metric, to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - round(d.used_percent::numeric, 1) || '% used (' || - pg_size_pretty(d.used) || ' of ' || pg_size_pretty(d.total) || ')' + round((d.fields->>'used_percent')::numeric, 1) || '% used (' || + pg_size_pretty((d.fields->>'used')::bigint) || ' of ' || + pg_size_pretty((d.fields->>'total')::bigint) || ')' FROM \"${SCHEMA}\".disk d JOIN \"${SCHEMA}\".disk_tag t USING (tag_id) - WHERE ${disk_host} = '${host}' + WHERE t.tags->>'host' = '${host}' AND d.time = (SELECT max(d2.time) FROM \"${SCHEMA}\".disk d2 JOIN \"${SCHEMA}\".disk_tag t2 USING (tag_id) - WHERE ${disk_host/t./t2.} = '${host}') - ORDER BY ${disk_path};" + WHERE t2.tags->>'host' = '${host}') + ORDER BY t.tags->>'path';" fi - sys_host=$(tag_expr "system" "host" "t") - if [ -n "$sys_host" ] && table_exists "system"; then + if table_exists "system"; then print_metric " SELECT 'load ' AS metric, to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts, - s.load1 || ' / ' || s.load5 || ' / ' || s.load15 || ' (1/5/15m)' + (s.fields->>'load1') || ' / ' || + (s.fields->>'load5') || ' / ' || + (s.fields->>'load15') || ' (1/5/15m)' FROM \"${SCHEMA}\".system s JOIN \"${SCHEMA}\".system_tag t USING (tag_id) - WHERE ${sys_host} = '${host}' + WHERE t.tags->>'host' = '${host}' ORDER BY s.time DESC LIMIT 1;" fi diff --git a/salt/telegraf/etc/telegraf.conf b/salt/telegraf/etc/telegraf.conf index 4f0c279cc..aa5f2a007 100644 --- a/salt/telegraf/etc/telegraf.conf +++ b/salt/telegraf/etc/telegraf.conf @@ -98,10 +98,15 @@ # options='-c role=so_telegraf' makes every connection SET ROLE to the shared # group role so tables created on first write are owned by so_telegraf, and # all per-minion members can INSERT/SELECT them via role inheritance. +# fields_as_jsonb/tags_as_jsonb keep metric tables at a fixed column count so +# high-cardinality inputs (docker, procstat, kafka) don't blow past the +# Postgres 1600-column-per-table limit. [[outputs.postgresql]] connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt options='-c role=so_telegraf'" schema = "telegraf" tags_as_foreign_keys = true + tags_as_jsonb = true + fields_as_jsonb = true {%- endif %} ###############################################################################