mirror of
https://github.com/Security-Onion-Solutions/securityonion.git
synced 2026-05-06 11:28:46 +02:00
Comingle Telegraf metrics into shared schema
Per-minion schemas cause the table count to explode (N minions * M metrics), and the per-minion revocation story isn't worth it when retention is short. Move all minions to a shared 'telegraf' schema while keeping per-minion login credentials for audit.
- New so_telegraf NOLOGIN group role owns the telegraf schema; each per-minion role is a member and inherits insert/select via role inheritance
- Telegraf connection string uses options='-c role=so_telegraf' so tables auto-created on first write belong to the group role
- so-telegraf-trim walks the flat telegraf.* table set instead of per-minion schemas
- so-stats-show filters by host tag; CLI arg is now the hostname as tagged by Telegraf rather than a sanitized schema suffix
- Also renames so-show-stats -> so-stats-show
This commit is contained in:
@@ -10,6 +10,28 @@
|
||||
{% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %}
|
||||
{% if TG_OUT in ['POSTGRES', 'BOTH'] %}
|
||||
|
||||
# Provision the shared group role and schema once. Every per-minion role is a
|
||||
# member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf
|
||||
# (via options='-c role=so_telegraf' in the connection string) so tables created
|
||||
# on first write are owned by the group role and every member can INSERT/SELECT.
|
||||
# Runs one psql session (as the postgres user, against the so_telegraf
# database) inside the so-postgres container. The SQL below:
#   1. creates the NOLOGIN role so_telegraf unless pg_roles already lists it,
#   2. grants that role CONNECT on the so_telegraf database,
#   3. creates the telegraf schema, owned by so_telegraf, if it is missing,
#   4. grants USAGE and CREATE on that schema to so_telegraf.
# ON_ERROR_STOP=1 makes psql stop at the first failing statement.
postgres_telegraf_group_role:
|
||||
cmd.run:
|
||||
- name: |
|
||||
docker exec -i so-postgres psql -v ON_ERROR_STOP=1 -U postgres -d so_telegraf <<'EOSQL'
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'so_telegraf') THEN
|
||||
CREATE ROLE so_telegraf NOLOGIN;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
GRANT CONNECT ON DATABASE so_telegraf TO so_telegraf;
|
||||
CREATE SCHEMA IF NOT EXISTS telegraf AUTHORIZATION so_telegraf;
|
||||
GRANT USAGE, CREATE ON SCHEMA telegraf TO so_telegraf;
|
||||
EOSQL
|
||||
# Only run once the so-postgres container state has been applied.
- require:
|
||||
- docker_container: so-postgres
|
||||
|
||||
{% set users = salt['pillar.get']('postgres:auth:users', {}) %}
|
||||
{% for key, entry in users.items() %}
|
||||
{% if key.startswith('telegraf_') and entry.get('user') and entry.get('pass') %}
|
||||
@@ -30,10 +52,10 @@ postgres_telegraf_role_{{ u }}:
|
||||
END
|
||||
$$;
|
||||
GRANT CONNECT ON DATABASE so_telegraf TO "{{ u }}";
|
||||
CREATE SCHEMA IF NOT EXISTS "{{ u }}" AUTHORIZATION "{{ u }}";
|
||||
GRANT so_telegraf TO "{{ u }}";
|
||||
EOSQL
|
||||
- require:
|
||||
- docker_container: so-postgres
|
||||
- cmd: postgres_telegraf_group_role
|
||||
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
@@ -1,145 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
|
||||
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
|
||||
# https://securityonion.net/license; you may not use this file except in compliance with the
|
||||
# Elastic License 2.0.
|
||||
|
||||
# Point-in-time host metrics from the Telegraf Postgres backend.
|
||||
# Sanity-check tool for verifying metrics are landing before the grid
|
||||
# dashboards consume them.
|
||||
|
||||
. /usr/sbin/so-common
|
||||
|
||||
usage() {
|
||||
cat <<EOF
|
||||
Usage: $0 [minion_id]
|
||||
|
||||
Shows the most recent CPU, memory, disk, and load metrics for each minion
|
||||
from the so_telegraf Postgres database. Without an argument, reports on
|
||||
every minion that has data. With a minion_id, limits output to that one.
|
||||
|
||||
Requires: sudo, so-postgres running, global.telegraf_output set to
|
||||
POSTGRES or BOTH.
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
echo "This script must be run using sudo!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "${1:-}" in
|
||||
-h|--help) usage ;;
|
||||
esac
|
||||
|
||||
FILTER_MINION="${1:-}"
|
||||
|
||||
so_psql() {
|
||||
docker exec so-postgres psql -U postgres -d so_telegraf -At -F $'\t' "$@"
|
||||
}
|
||||
|
||||
if ! docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d\| -f1 | grep -qw so_telegraf; then
|
||||
echo "Database so_telegraf not found. Is global.telegraf_output set to POSTGRES or BOTH?"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# List telegraf schemas (role-per-minion naming convention: so_telegraf_<sanitized_minion_id>)
|
||||
SCHEMAS=$(so_psql -c "SELECT schema_name FROM information_schema.schemata WHERE schema_name LIKE 'so_telegraf_%' ORDER BY schema_name;")
|
||||
|
||||
if [ -z "$SCHEMAS" ]; then
|
||||
echo "No minion schemas found in so_telegraf."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
print_metric() {
|
||||
local schema="$1" table="$2" query="$3"
|
||||
# Confirm table exists in this schema before querying
|
||||
local exists
|
||||
exists=$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${schema}' AND table_name='${table}' LIMIT 1;")
|
||||
[ -z "$exists" ] && return 0
|
||||
so_psql -c "$query"
|
||||
}
|
||||
|
||||
# Telegraf's postgresql output stores tag values either as individual columns
|
||||
# on the <metric>_tag table or as a single JSONB "tags" column, depending on
|
||||
# plugin version. Returns a SQL expression that extracts the named tag
|
||||
# regardless of layout. Empty string if the tag table doesn't exist.
|
||||
tag_expr() {
|
||||
local schema="$1" table="$2" tag="$3" alias="$4"
|
||||
local has_col
|
||||
has_col=$(so_psql -c "
|
||||
SELECT 1 FROM information_schema.columns
|
||||
WHERE table_schema='${schema}' AND table_name='${table}_tag' AND column_name='${tag}'
|
||||
LIMIT 1;")
|
||||
if [ -n "$has_col" ]; then
|
||||
echo "${alias}.${tag}"
|
||||
return
|
||||
fi
|
||||
local has_tags
|
||||
has_tags=$(so_psql -c "
|
||||
SELECT 1 FROM information_schema.columns
|
||||
WHERE table_schema='${schema}' AND table_name='${table}_tag' AND column_name='tags'
|
||||
LIMIT 1;")
|
||||
if [ -n "$has_tags" ]; then
|
||||
echo "(${alias}.tags->>'${tag}')"
|
||||
return
|
||||
fi
|
||||
echo ""
|
||||
}
|
||||
|
||||
for schema in $SCHEMAS; do
|
||||
minion="${schema#so_telegraf_}"
|
||||
if [ -n "$FILTER_MINION" ]; then
|
||||
# Compare against the sanitized form used in schema names
|
||||
want=$(echo "$FILTER_MINION" | tr '.-' '_' | tr '[:upper:]' '[:lower:]')
|
||||
[ "$minion" != "$want" ] && continue
|
||||
fi
|
||||
|
||||
echo "===================================================================="
|
||||
echo " Minion: $minion"
|
||||
echo "===================================================================="
|
||||
|
||||
cpu_tag=$(tag_expr "$schema" "cpu" "cpu" "t")
|
||||
if [ -n "$cpu_tag" ]; then
|
||||
print_metric "$schema" "cpu" "
|
||||
SELECT 'cpu ' AS metric,
|
||||
to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
||||
round((100 - c.usage_idle)::numeric, 1) || '% used'
|
||||
FROM \"${schema}\".cpu c
|
||||
JOIN \"${schema}\".cpu_tag t USING (tag_id)
|
||||
WHERE ${cpu_tag} = 'cpu-total'
|
||||
ORDER BY c.time DESC LIMIT 1;"
|
||||
fi
|
||||
|
||||
print_metric "$schema" "mem" "
|
||||
SELECT 'memory ' AS metric,
|
||||
to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
||||
round(m.used_percent::numeric, 1) || '% used (' ||
|
||||
pg_size_pretty(m.used) || ' of ' || pg_size_pretty(m.total) || ')'
|
||||
FROM \"${schema}\".mem m
|
||||
ORDER BY m.time DESC LIMIT 1;"
|
||||
|
||||
disk_path=$(tag_expr "$schema" "disk" "path" "t")
|
||||
if [ -n "$disk_path" ]; then
|
||||
print_metric "$schema" "disk" "
|
||||
SELECT 'disk ' || rpad(${disk_path}, 12) AS metric,
|
||||
to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
||||
round(d.used_percent::numeric, 1) || '% used (' ||
|
||||
pg_size_pretty(d.used) || ' of ' || pg_size_pretty(d.total) || ')'
|
||||
FROM \"${schema}\".disk d
|
||||
JOIN \"${schema}\".disk_tag t USING (tag_id)
|
||||
WHERE d.time = (SELECT max(time) FROM \"${schema}\".disk)
|
||||
ORDER BY ${disk_path};"
|
||||
fi
|
||||
|
||||
print_metric "$schema" "system" "
|
||||
SELECT 'load ' AS metric,
|
||||
to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
|
||||
s.load1 || ' / ' || s.load5 || ' / ' || s.load15 || ' (1/5/15m)'
|
||||
FROM \"${schema}\".system s
|
||||
ORDER BY s.time DESC LIMIT 1;"
|
||||
|
||||
echo ""
|
||||
done
|
||||
@@ -0,0 +1,170 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
|
||||
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
|
||||
# https://securityonion.net/license; you may not use this file except in compliance with the
|
||||
# Elastic License 2.0.
|
||||
|
||||
# Point-in-time host metrics from the Telegraf Postgres backend.
|
||||
# Sanity-check tool for verifying metrics are landing before the grid
|
||||
# dashboards consume them.
|
||||
|
||||
. /usr/sbin/so-common
|
||||
|
||||
# Print command-line help and terminate the script.
# Outputs: usage text on stdout.
# Exits:   always with status 1, including when invoked for -h/--help.
usage() {
  # The optional argument is matched against Telegraf's "host" tag, so the
  # help text talks about hostnames rather than minion ids.
  cat <<EOF
Usage: $0 [hostname]

Shows the most recent CPU, memory, disk, and load metrics for each host
from the so_telegraf Postgres database. Without an argument, reports on
every host that has data. With a hostname (the value of Telegraf's "host"
tag), limits output to that one.

Requires: sudo, so-postgres running, global.telegraf_output set to
          POSTGRES or BOTH.
EOF
  exit 1
}
|
||||
|
||||
# Refuse to run unless the effective user id is 0.
if (( $(id -u) != 0 )); then
  echo "This script must be run using sudo!"
  exit 1
fi

# -h / --help as the first argument prints the help text; usage() itself
# terminates the script.
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
  usage
fi

# SCHEMA is the Postgres schema the queries in this script are issued
# against; FILTER_HOST is the optional first argument (empty = all hosts).
SCHEMA="telegraf"
FILTER_HOST="${1:-}"
|
||||
|
||||
# Run psql inside the so-postgres container against the so_telegraf database.
# Arguments: any extra psql arguments (e.g. -c "<sql>"), passed through as-is.
# Outputs:   unaligned, tuples-only rows (-At) with TAB as field separator.
so_psql() {
  local -a psql_cmd=(
    docker exec so-postgres
    psql -U postgres -d so_telegraf -At -F $'\t'
  )
  "${psql_cmd[@]}" "$@"
}
|
||||
|
||||
# Stop early when the so_telegraf database is absent: list the databases
# (psql -lqt), keep the first '|'-separated column, and look for the name
# as a whole word. psql errors are discarded, so an unreachable container
# is reported the same way as a missing database.
db_names=$(docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d'|' -f1)
if ! grep -qw so_telegraf <<<"$db_names"; then
  echo "Database so_telegraf not found. Is global.telegraf_output set to POSTGRES or BOTH?"
  exit 2
fi
|
||||
|
||||
# Telegraf's postgresql output stores tag values either as individual columns
# on the <metric>_tag table or as a single JSONB "tags" column, depending on
# plugin version. tag_expr prints a SQL expression that extracts the named
# tag regardless of layout.
#
# Globals:   SCHEMA (read) - schema holding the <metric>_tag tables
# Arguments: $1 - metric table name without the _tag suffix (e.g. cpu)
#            $2 - tag name (e.g. host)
#            $3 - SQL alias the caller gives the <metric>_tag table
# Outputs:   "<alias>.<tag>" when the tag has its own column,
#            "(<alias>.tags->>'<tag>')" when only a "tags" column exists,
#            an empty line when neither column (or the table) is present.
# Returns:   0 in every case; callers test the output for emptiness.
tag_expr() {
  local table="$1" tag="$2" alias="$3"
  local found

  # A single catalog lookup covers both layouts. The ORDER BY puts the
  # dedicated tag column first, so it wins when both columns exist.
  found=$(so_psql -c "
    SELECT column_name FROM information_schema.columns
    WHERE table_schema='${SCHEMA}' AND table_name='${table}_tag'
      AND column_name IN ('${tag}', 'tags')
    ORDER BY (column_name = '${tag}') DESC
    LIMIT 1;")

  if [ -n "$found" ] && [ "$found" = "$tag" ]; then
    echo "${alias}.${tag}"
  elif [ "$found" = "tags" ]; then
    echo "(${alias}.tags->>'${tag}')"
  else
    echo ""
  fi
}
|
||||
|
||||
# Report whether a table is present in the shared schema.
# Globals:   SCHEMA (read)
# Arguments: $1 - table name
# Returns:   0 when information_schema.tables lists the table, non-zero otherwise.
table_exists() {
  local table="$1"
  local hit
  hit="$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${SCHEMA}' AND table_name='${table}' LIMIT 1;")"
  [ -n "$hit" ]
}
|
||||
|
||||
# Discover hosts from cpu_tag (every minion reports cpu).
host_expr="$(tag_expr "cpu" "host" "t")"
[ -n "$host_expr" ] || {
  echo "Unable to determine host tag column on ${SCHEMA}.cpu_tag. Has Telegraf written any rows yet?"
  exit 0
}

# Distinct, non-NULL host tag values in sorted order, one per output line.
hosts_sql="
  SELECT DISTINCT ${host_expr}
  FROM \"${SCHEMA}\".cpu_tag t
  WHERE ${host_expr} IS NOT NULL
  ORDER BY 1;"
HOSTS="$(so_psql -c "$hosts_sql")"

[ -n "$HOSTS" ] || {
  echo "No hosts found in ${SCHEMA}. Is Telegraf configured to write to Postgres?"
  exit 0
}
|
||||
|
||||
# Execute one reporting query and print whatever rows it returns.
# Arguments: $1 - complete SQL statement
# Outputs:   the query result as formatted by so_psql
print_metric() {
  so_psql -c "$1"
}
|
||||
|
||||
# Print the latest cpu / memory / disk / load sample for every discovered
# host, or only for FILTER_HOST when one was given on the command line.

# Tag layout and table presence describe the shared schema, not a single
# host, so resolve them once up front instead of on every iteration.
cpu_host=$(tag_expr "cpu" "host" "t")
cpu_tag=$(tag_expr "cpu" "cpu" "t")
mem_host=$(tag_expr "mem" "host" "t")
disk_host=$(tag_expr "disk" "host" "t")
disk_path=$(tag_expr "disk" "path" "t")
sys_host=$(tag_expr "system" "host" "t")
# Same host expression, re-aliased for the disk sub-select where the tag
# table is joined as t2.
disk_host_inner=${disk_host/t./t2.}

show_mem=0
show_disk=0
show_load=0
if [ -n "$mem_host" ] && table_exists "mem"; then show_mem=1; fi
if [ -n "$disk_host" ] && [ -n "$disk_path" ] && table_exists "disk"; then show_disk=1; fi
if [ -n "$sys_host" ] && table_exists "system"; then show_load=1; fi

# HOSTS holds one host per line; split on newlines only so a value is never
# word-split or glob-expanded.
mapfile -t host_list <<<"$HOSTS"

sq="'"
matched=0
for host in "${host_list[@]}"; do
  [ -n "$host" ] || continue
  if [ -n "$FILTER_HOST" ] && [ "$host" != "$FILTER_HOST" ]; then
    continue
  fi
  matched=$((matched + 1))

  # Double any single quote so the value is a well-formed SQL string literal.
  host_sql=${host//$sq/$sq$sq}

  echo "===================================================================="
  echo " Host: $host"
  echo "===================================================================="

  if [ -n "$cpu_host" ] && [ -n "$cpu_tag" ]; then
    print_metric "
      SELECT 'cpu ' AS metric,
             to_char(c.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
             round((100 - c.usage_idle)::numeric, 1) || '% used'
      FROM \"${SCHEMA}\".cpu c
      JOIN \"${SCHEMA}\".cpu_tag t USING (tag_id)
      WHERE ${cpu_host} = '${host_sql}' AND ${cpu_tag} = 'cpu-total'
      ORDER BY c.time DESC LIMIT 1;"
  fi

  if [ "$show_mem" -eq 1 ]; then
    print_metric "
      SELECT 'memory ' AS metric,
             to_char(m.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
             round(m.used_percent::numeric, 1) || '% used (' ||
             pg_size_pretty(m.used) || ' of ' || pg_size_pretty(m.total) || ')'
      FROM \"${SCHEMA}\".mem m
      JOIN \"${SCHEMA}\".mem_tag t USING (tag_id)
      WHERE ${mem_host} = '${host_sql}'
      ORDER BY m.time DESC LIMIT 1;"
  fi

  if [ "$show_disk" -eq 1 ]; then
    # Every path reported at this host's most recent disk timestamp.
    print_metric "
      SELECT 'disk ' || rpad(${disk_path}, 12) AS metric,
             to_char(d.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
             round(d.used_percent::numeric, 1) || '% used (' ||
             pg_size_pretty(d.used) || ' of ' || pg_size_pretty(d.total) || ')'
      FROM \"${SCHEMA}\".disk d
      JOIN \"${SCHEMA}\".disk_tag t USING (tag_id)
      WHERE ${disk_host} = '${host_sql}'
        AND d.time = (SELECT max(d2.time)
                      FROM \"${SCHEMA}\".disk d2
                      JOIN \"${SCHEMA}\".disk_tag t2 USING (tag_id)
                      WHERE ${disk_host_inner} = '${host_sql}')
      ORDER BY ${disk_path};"
  fi

  if [ "$show_load" -eq 1 ]; then
    print_metric "
      SELECT 'load ' AS metric,
             to_char(s.time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
             s.load1 || ' / ' || s.load5 || ' / ' || s.load15 || ' (1/5/15m)'
      FROM \"${SCHEMA}\".system s
      JOIN \"${SCHEMA}\".system_tag t USING (tag_id)
      WHERE ${sys_host} = '${host_sql}'
      ORDER BY s.time DESC LIMIT 1;"
  fi

  echo ""
done

# A filter that matches nothing would otherwise produce no output at all;
# say so and list the host values that do have data.
if [ -n "$FILTER_HOST" ] && [ "$matched" -eq 0 ]; then
  echo "No metrics found for host '${FILTER_HOST}'. Hosts with data:"
  printf '  %s\n' "${host_list[@]}"
fi
|
||||
@@ -63,15 +63,15 @@ log "Trimming rows older than ${DAYS} days (dry_run=${DRY_RUN})."
|
||||
|
||||
TOTAL_DELETED=0
|
||||
|
||||
# One row per (schema, table) we might want to trim.
|
||||
# Column name is 'time' for all telegraf output plugin tables; skip metadata
|
||||
# tables (tag_* used for tags_as_foreign_keys).
|
||||
# Every metric table in the shared telegraf schema has a 'time' column.
|
||||
# Tag tables (<metric>_tag) don't, so filtering on the column presence is
|
||||
# enough to scope the trim to metric tables only.
|
||||
ROWS=$(so_psql -c "
|
||||
SELECT table_schema || '.' || table_name
|
||||
FROM information_schema.columns
|
||||
WHERE column_name = 'time'
|
||||
AND data_type IN ('timestamp with time zone', 'timestamp without time zone')
|
||||
AND table_schema LIKE 'so_telegraf_%'
|
||||
AND table_schema = 'telegraf'
|
||||
ORDER BY 1;")
|
||||
|
||||
if [ -z "$ROWS" ]; then
|
||||
|
||||
Reference in New Issue
Block a user