Files
securityonion/salt/postgres/tools/sbin/so-stats-show
T
Mike Reeves 9fe53d9ccc Use JSONB for Telegraf fields/tags to avoid 1600-column limit
High-cardinality inputs (docker, procstat, kafka) trigger ALTER TABLE
ADD COLUMN on every new field name, and with all minions writing into
a shared 'telegraf' schema the metric tables hit Postgres's 1600-column
per-table ceiling quickly. Setting fields_as_jsonb and tags_as_jsonb on
the postgresql output keeps metric tables fixed at (time, tag_id,
fields jsonb) and tag tables at (tag_id, tags jsonb).

- so-stats-show rewritten to use JSONB accessors
  ((fields->>'x')::numeric, tags->>'host', etc.) and cast memory/disk
  sizes to bigint so pg_size_pretty works
- Drop regex/regexFailureMessage from telegraf_output SOC UI entry to
  match the convention upstream used when removing them from
  mdengine/pcapengine/pipeline; options: list drives validation
2026-04-16 17:02:21 -04:00

145 lines
4.6 KiB
Bash

#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
# Point-in-time host metrics from the Telegraf Postgres backend.
# Sanity-check tool for verifying metrics are landing before the grid
# dashboards consume them.
#
# Assumes Telegraf's postgresql output is configured with
# tags_as_foreign_keys = true, tags_as_jsonb = true, fields_as_jsonb = true,
# so metric tables are (time, tag_id, fields jsonb) and tag tables are
# (tag_id, tags jsonb).
. /usr/sbin/so-common
usage() {
cat <<EOF
Usage: $0 [host]
Shows the most recent CPU, memory, disk, and load metrics for each host
from the so_telegraf Postgres database. Without an argument, reports on
every host that has data. With a host, limits output to that one.
Requires: sudo, so-postgres running, global.telegraf_output set to
POSTGRES or BOTH.
EOF
exit 1
}
if [ "$(id -u)" -ne 0 ]; then
echo "This script must be run using sudo!"
exit 1
fi
case "${1:-}" in
-h|--help) usage ;;
esac
FILTER_HOST="${1:-}"
SCHEMA="telegraf"
so_psql() {
docker exec so-postgres psql -U postgres -d so_telegraf -At -F $'\t' "$@"
}
if ! docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d\| -f1 | grep -qw so_telegraf; then
echo "Database so_telegraf not found. Is global.telegraf_output set to POSTGRES or BOTH?"
exit 2
fi
table_exists() {
local table="$1"
[ -n "$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${SCHEMA}' AND table_name='${table}' LIMIT 1;")" ]
}
# Discover hosts from cpu_tag (every minion reports cpu).
if ! table_exists "cpu_tag"; then
echo "${SCHEMA}.cpu_tag not found. Has Telegraf written any rows yet?"
exit 0
fi
HOSTS=$(so_psql -c "
SELECT DISTINCT tags->>'host'
FROM \"${SCHEMA}\".cpu_tag
WHERE tags ? 'host'
ORDER BY 1;")
if [ -z "$HOSTS" ]; then
echo "No hosts found in ${SCHEMA}. Is Telegraf configured to write to Postgres?"
exit 0
fi
print_metric() {
so_psql -c "$1"
}
for host in $HOSTS; do
if [ -n "$FILTER_HOST" ] && [ "$host" != "$FILTER_HOST" ]; then
continue
fi
echo "===================================================================="
echo " Host: $host"
echo "===================================================================="
if table_exists "cpu"; then
print_metric "
SELECT 'cpu ' AS metric,
to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
round((100 - (c.fields->>'usage_idle')::numeric), 1) || '% used'
FROM \"${SCHEMA}\".cpu c
JOIN \"${SCHEMA}\".cpu_tag t USING (tag_id)
WHERE t.tags->>'host' = '${host}' AND t.tags->>'cpu' = 'cpu-total'
ORDER BY c.time DESC LIMIT 1;"
fi
if table_exists "mem"; then
print_metric "
SELECT 'memory ' AS metric,
to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
round((m.fields->>'used_percent')::numeric, 1) || '% used (' ||
pg_size_pretty((m.fields->>'used')::bigint) || ' of ' ||
pg_size_pretty((m.fields->>'total')::bigint) || ')'
FROM \"${SCHEMA}\".mem m
JOIN \"${SCHEMA}\".mem_tag t USING (tag_id)
WHERE t.tags->>'host' = '${host}'
ORDER BY m.time DESC LIMIT 1;"
fi
if table_exists "disk"; then
print_metric "
SELECT 'disk ' || rpad(t.tags->>'path', 12) AS metric,
to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
round((d.fields->>'used_percent')::numeric, 1) || '% used (' ||
pg_size_pretty((d.fields->>'used')::bigint) || ' of ' ||
pg_size_pretty((d.fields->>'total')::bigint) || ')'
FROM \"${SCHEMA}\".disk d
JOIN \"${SCHEMA}\".disk_tag t USING (tag_id)
WHERE t.tags->>'host' = '${host}'
AND d.time = (SELECT max(d2.time)
FROM \"${SCHEMA}\".disk d2
JOIN \"${SCHEMA}\".disk_tag t2 USING (tag_id)
WHERE t2.tags->>'host' = '${host}')
ORDER BY t.tags->>'path';"
fi
if table_exists "system"; then
print_metric "
SELECT 'load ' AS metric,
to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
(s.fields->>'load1') || ' / ' ||
(s.fields->>'load5') || ' / ' ||
(s.fields->>'load15') || ' (1/5/15m)'
FROM \"${SCHEMA}\".system s
JOIN \"${SCHEMA}\".system_tag t USING (tag_id)
WHERE t.tags->>'host' = '${host}'
ORDER BY s.time DESC LIMIT 1;"
fi
echo ""
done