mirror of
https://github.com/Security-Onion-Solutions/securityonion.git
synced 2026-05-29 14:35:22 +02:00
Use JSONB for Telegraf fields/tags to avoid 1600-column limit
High-cardinality inputs (docker, procstat, kafka) trigger ALTER TABLE ADD COLUMN on every new field name, and because all minions write into a shared 'telegraf' schema, the metric tables quickly hit Postgres's 1600-column per-table ceiling. Setting fields_as_jsonb and tags_as_jsonb on the postgresql output keeps metric tables fixed at (time, tag_id, fields jsonb) and tag tables at (tag_id, tags jsonb).

- so-stats-show is rewritten to use JSONB accessors ((fields->>'x')::numeric, tags->>'host', etc.) and casts memory/disk sizes to bigint so pg_size_pretty works.
- Drop regex/regexFailureMessage from the telegraf_output SOC UI entry, matching the convention upstream used when removing them from mdengine/pcapengine/pipeline; the options: list drives validation.
This commit is contained in:
@@ -61,12 +61,10 @@ global:
|
|||||||
advanced: True
|
advanced: True
|
||||||
telegraf_output:
|
telegraf_output:
|
||||||
description: Selects the backend(s) Telegraf writes metrics to. INFLUXDB keeps the current behavior; POSTGRES writes to the grid's Postgres instance; BOTH dual-writes for migration validation.
|
description: Selects the backend(s) Telegraf writes metrics to. INFLUXDB keeps the current behavior; POSTGRES writes to the grid's Postgres instance; BOTH dual-writes for migration validation.
|
||||||
regex: ^(INFLUXDB|POSTGRES|BOTH)$
|
|
||||||
options:
|
options:
|
||||||
- INFLUXDB
|
- INFLUXDB
|
||||||
- POSTGRES
|
- POSTGRES
|
||||||
- BOTH
|
- BOTH
|
||||||
regexFailureMessage: You must enter INFLUXDB, POSTGRES, or BOTH.
|
|
||||||
global: True
|
global: True
|
||||||
advanced: True
|
advanced: True
|
||||||
helpLink: influxdb
|
helpLink: influxdb
|
||||||
|
|||||||
@@ -8,16 +8,21 @@
|
|||||||
# Point-in-time host metrics from the Telegraf Postgres backend.
|
# Point-in-time host metrics from the Telegraf Postgres backend.
|
||||||
# Sanity-check tool for verifying metrics are landing before the grid
|
# Sanity-check tool for verifying metrics are landing before the grid
|
||||||
# dashboards consume them.
|
# dashboards consume them.
|
||||||
|
#
|
||||||
|
# Assumes Telegraf's postgresql output is configured with
|
||||||
|
# tags_as_foreign_keys = true, tags_as_jsonb = true, fields_as_jsonb = true,
|
||||||
|
# so metric tables are (time, tag_id, fields jsonb) and tag tables are
|
||||||
|
# (tag_id, tags jsonb).
|
||||||
|
|
||||||
. /usr/sbin/so-common
|
. /usr/sbin/so-common
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
Usage: $0 [minion_id]
|
Usage: $0 [host]
|
||||||
|
|
||||||
Shows the most recent CPU, memory, disk, and load metrics for each minion
|
Shows the most recent CPU, memory, disk, and load metrics for each host
|
||||||
from the so_telegraf Postgres database. Without an argument, reports on
|
from the so_telegraf Postgres database. Without an argument, reports on
|
||||||
every minion that has data. With a minion_id, limits output to that one.
|
every host that has data. With a host, limits output to that one.
|
||||||
|
|
||||||
Requires: sudo, so-postgres running, global.telegraf_output set to
|
Requires: sudo, so-postgres running, global.telegraf_output set to
|
||||||
POSTGRES or BOTH.
|
POSTGRES or BOTH.
|
||||||
@@ -46,49 +51,21 @@ if ! docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d\| -f1 |
|
|||||||
exit 2
|
exit 2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Telegraf's postgresql output stores tag values either as individual columns
|
|
||||||
# on the <metric>_tag table or as a single JSONB "tags" column, depending on
|
|
||||||
# plugin version. Returns a SQL expression that extracts the named tag
|
|
||||||
# regardless of layout. Empty string if the tag table doesn't exist.
|
|
||||||
tag_expr() {
|
|
||||||
local table="$1" tag="$2" alias="$3"
|
|
||||||
local has_col
|
|
||||||
has_col=$(so_psql -c "
|
|
||||||
SELECT 1 FROM information_schema.columns
|
|
||||||
WHERE table_schema='${SCHEMA}' AND table_name='${table}_tag' AND column_name='${tag}'
|
|
||||||
LIMIT 1;")
|
|
||||||
if [ -n "$has_col" ]; then
|
|
||||||
echo "${alias}.${tag}"
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
local has_tags
|
|
||||||
has_tags=$(so_psql -c "
|
|
||||||
SELECT 1 FROM information_schema.columns
|
|
||||||
WHERE table_schema='${SCHEMA}' AND table_name='${table}_tag' AND column_name='tags'
|
|
||||||
LIMIT 1;")
|
|
||||||
if [ -n "$has_tags" ]; then
|
|
||||||
echo "(${alias}.tags->>'${tag}')"
|
|
||||||
return
|
|
||||||
fi
|
|
||||||
echo ""
|
|
||||||
}
|
|
||||||
|
|
||||||
table_exists() {
|
table_exists() {
|
||||||
local table="$1"
|
local table="$1"
|
||||||
[ -n "$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${SCHEMA}' AND table_name='${table}' LIMIT 1;")" ]
|
[ -n "$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${SCHEMA}' AND table_name='${table}' LIMIT 1;")" ]
|
||||||
}
|
}
|
||||||
|
|
||||||
# Discover hosts from cpu_tag (every minion reports cpu).
|
# Discover hosts from cpu_tag (every minion reports cpu).
|
||||||
host_expr=$(tag_expr "cpu" "host" "t")
|
if ! table_exists "cpu_tag"; then
|
||||||
if [ -z "$host_expr" ]; then
|
echo "${SCHEMA}.cpu_tag not found. Has Telegraf written any rows yet?"
|
||||||
echo "Unable to determine host tag column on ${SCHEMA}.cpu_tag. Has Telegraf written any rows yet?"
|
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
HOSTS=$(so_psql -c "
|
HOSTS=$(so_psql -c "
|
||||||
SELECT DISTINCT ${host_expr}
|
SELECT DISTINCT tags->>'host'
|
||||||
FROM \"${SCHEMA}\".cpu_tag t
|
FROM \"${SCHEMA}\".cpu_tag
|
||||||
WHERE ${host_expr} IS NOT NULL
|
WHERE tags ? 'host'
|
||||||
ORDER BY 1;")
|
ORDER BY 1;")
|
||||||
|
|
||||||
if [ -z "$HOSTS" ]; then
|
if [ -z "$HOSTS" ]; then
|
||||||
@@ -97,8 +74,7 @@ if [ -z "$HOSTS" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
print_metric() {
|
print_metric() {
|
||||||
local query="$1"
|
so_psql -c "$1"
|
||||||
so_psql -c "$query"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for host in $HOSTS; do
|
for host in $HOSTS; do
|
||||||
@@ -110,59 +86,57 @@ for host in $HOSTS; do
|
|||||||
echo " Host: $host"
|
echo " Host: $host"
|
||||||
echo "===================================================================="
|
echo "===================================================================="
|
||||||
|
|
||||||
cpu_host=$(tag_expr "cpu" "host" "t")
|
if table_exists "cpu"; then
|
||||||
cpu_tag=$(tag_expr "cpu" "cpu" "t")
|
|
||||||
if [ -n "$cpu_host" ] && [ -n "$cpu_tag" ]; then
|
|
||||||
print_metric "
|
print_metric "
|
||||||
SELECT 'cpu ' AS metric,
|
SELECT 'cpu ' AS metric,
|
||||||
to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
||||||
round((100 - c.usage_idle)::numeric, 1) || '% used'
|
round((100 - (c.fields->>'usage_idle')::numeric), 1) || '% used'
|
||||||
FROM \"${SCHEMA}\".cpu c
|
FROM \"${SCHEMA}\".cpu c
|
||||||
JOIN \"${SCHEMA}\".cpu_tag t USING (tag_id)
|
JOIN \"${SCHEMA}\".cpu_tag t USING (tag_id)
|
||||||
WHERE ${cpu_host} = '${host}' AND ${cpu_tag} = 'cpu-total'
|
WHERE t.tags->>'host' = '${host}' AND t.tags->>'cpu' = 'cpu-total'
|
||||||
ORDER BY c.time DESC LIMIT 1;"
|
ORDER BY c.time DESC LIMIT 1;"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
mem_host=$(tag_expr "mem" "host" "t")
|
if table_exists "mem"; then
|
||||||
if [ -n "$mem_host" ] && table_exists "mem"; then
|
|
||||||
print_metric "
|
print_metric "
|
||||||
SELECT 'memory ' AS metric,
|
SELECT 'memory ' AS metric,
|
||||||
to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
||||||
round(m.used_percent::numeric, 1) || '% used (' ||
|
round((m.fields->>'used_percent')::numeric, 1) || '% used (' ||
|
||||||
pg_size_pretty(m.used) || ' of ' || pg_size_pretty(m.total) || ')'
|
pg_size_pretty((m.fields->>'used')::bigint) || ' of ' ||
|
||||||
|
pg_size_pretty((m.fields->>'total')::bigint) || ')'
|
||||||
FROM \"${SCHEMA}\".mem m
|
FROM \"${SCHEMA}\".mem m
|
||||||
JOIN \"${SCHEMA}\".mem_tag t USING (tag_id)
|
JOIN \"${SCHEMA}\".mem_tag t USING (tag_id)
|
||||||
WHERE ${mem_host} = '${host}'
|
WHERE t.tags->>'host' = '${host}'
|
||||||
ORDER BY m.time DESC LIMIT 1;"
|
ORDER BY m.time DESC LIMIT 1;"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
disk_host=$(tag_expr "disk" "host" "t")
|
if table_exists "disk"; then
|
||||||
disk_path=$(tag_expr "disk" "path" "t")
|
|
||||||
if [ -n "$disk_host" ] && [ -n "$disk_path" ] && table_exists "disk"; then
|
|
||||||
print_metric "
|
print_metric "
|
||||||
SELECT 'disk ' || rpad(${disk_path}, 12) AS metric,
|
SELECT 'disk ' || rpad(t.tags->>'path', 12) AS metric,
|
||||||
to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
||||||
round(d.used_percent::numeric, 1) || '% used (' ||
|
round((d.fields->>'used_percent')::numeric, 1) || '% used (' ||
|
||||||
pg_size_pretty(d.used) || ' of ' || pg_size_pretty(d.total) || ')'
|
pg_size_pretty((d.fields->>'used')::bigint) || ' of ' ||
|
||||||
|
pg_size_pretty((d.fields->>'total')::bigint) || ')'
|
||||||
FROM \"${SCHEMA}\".disk d
|
FROM \"${SCHEMA}\".disk d
|
||||||
JOIN \"${SCHEMA}\".disk_tag t USING (tag_id)
|
JOIN \"${SCHEMA}\".disk_tag t USING (tag_id)
|
||||||
WHERE ${disk_host} = '${host}'
|
WHERE t.tags->>'host' = '${host}'
|
||||||
AND d.time = (SELECT max(d2.time)
|
AND d.time = (SELECT max(d2.time)
|
||||||
FROM \"${SCHEMA}\".disk d2
|
FROM \"${SCHEMA}\".disk d2
|
||||||
JOIN \"${SCHEMA}\".disk_tag t2 USING (tag_id)
|
JOIN \"${SCHEMA}\".disk_tag t2 USING (tag_id)
|
||||||
WHERE ${disk_host/t./t2.} = '${host}')
|
WHERE t2.tags->>'host' = '${host}')
|
||||||
ORDER BY ${disk_path};"
|
ORDER BY t.tags->>'path';"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
sys_host=$(tag_expr "system" "host" "t")
|
if table_exists "system"; then
|
||||||
if [ -n "$sys_host" ] && table_exists "system"; then
|
|
||||||
print_metric "
|
print_metric "
|
||||||
SELECT 'load ' AS metric,
|
SELECT 'load ' AS metric,
|
||||||
to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
||||||
s.load1 || ' / ' || s.load5 || ' / ' || s.load15 || ' (1/5/15m)'
|
(s.fields->>'load1') || ' / ' ||
|
||||||
|
(s.fields->>'load5') || ' / ' ||
|
||||||
|
(s.fields->>'load15') || ' (1/5/15m)'
|
||||||
FROM \"${SCHEMA}\".system s
|
FROM \"${SCHEMA}\".system s
|
||||||
JOIN \"${SCHEMA}\".system_tag t USING (tag_id)
|
JOIN \"${SCHEMA}\".system_tag t USING (tag_id)
|
||||||
WHERE ${sys_host} = '${host}'
|
WHERE t.tags->>'host' = '${host}'
|
||||||
ORDER BY s.time DESC LIMIT 1;"
|
ORDER BY s.time DESC LIMIT 1;"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -98,10 +98,15 @@
|
|||||||
# options='-c role=so_telegraf' makes every connection SET ROLE to the shared
|
# options='-c role=so_telegraf' makes every connection SET ROLE to the shared
|
||||||
# group role so tables created on first write are owned by so_telegraf, and
|
# group role so tables created on first write are owned by so_telegraf, and
|
||||||
# all per-minion members can INSERT/SELECT them via role inheritance.
|
# all per-minion members can INSERT/SELECT them via role inheritance.
|
||||||
|
# fields_as_jsonb/tags_as_jsonb keep metric tables at a fixed column count so
|
||||||
|
# high-cardinality inputs (docker, procstat, kafka) don't blow past the
|
||||||
|
# Postgres 1600-column-per-table limit.
|
||||||
[[outputs.postgresql]]
|
[[outputs.postgresql]]
|
||||||
connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt options='-c role=so_telegraf'"
|
connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt options='-c role=so_telegraf'"
|
||||||
schema = "telegraf"
|
schema = "telegraf"
|
||||||
tags_as_foreign_keys = true
|
tags_as_foreign_keys = true
|
||||||
|
tags_as_jsonb = true
|
||||||
|
fields_as_jsonb = true
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|||||||
Reference in New Issue
Block a user