mirror of
https://github.com/Security-Onion-Solutions/securityonion.git
synced 2026-05-11 05:40:32 +02:00
Add telegraf_output selector for InfluxDB/Postgres dual-write
Introduces global.telegraf_output (INFLUXDB|POSTGRES|BOTH, default BOTH) so Telegraf can write metrics to Postgres alongside or instead of InfluxDB. Each minion authenticates with its own so_telegraf_<minion> role and writes to a matching schema inside a shared so_telegraf database, which limits the blast radius of any one credential to that minion's data.

- Per-minion credentials are auto-generated and persisted in postgres/auth.sls
- postgres/telegraf_users.sls reconciles roles/schemas on every apply
- Firewall opens 5432 only to minion hostgroups when Postgres output is active
- Reactor on salt/auth + orch/telegraf_postgres_sync.sls provisions new minions automatically on key accept
- soup post_to_3.1.0 backfills users for existing minions on upgrade
- so-show-stats prints the latest CPU/mem/disk/load per minion for sanity checks
- so-telegraf-trim + nightly cron prune rows older than postgres.telegraf.retention_days (default 14)
This commit is contained in:
@@ -0,0 +1,110 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
|
||||
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
|
||||
# https://securityonion.net/license; you may not use this file except in compliance with the
|
||||
# Elastic License 2.0.
|
||||
|
||||
# Point-in-time host metrics from the Telegraf Postgres backend.
|
||||
# Sanity-check tool for verifying metrics are landing before the grid
|
||||
# dashboards consume them.
|
||||
|
||||
. /usr/sbin/so-common
|
||||
|
||||
#######################################
# Print command-line help and terminate the script.
# Arguments: $1 - optional exit status; defaults to 1 when omitted
# Outputs:   usage text on stdout
#######################################
usage() {
  cat <<EOF
Usage: $0 [minion_id]

Shows the most recent CPU, memory, disk, and load metrics for each minion
from the so_telegraf Postgres database. Without an argument, reports on
every minion that has data. With a minion_id, limits output to that one.

Requires: sudo, so-postgres running, global.telegraf_output set to
POSTGRES or BOTH.
EOF
  exit "${1:-1}"
}
|
||||
|
||||
# Refuse to run unprivileged; the rest of the script drives `docker exec`.
# The diagnostic goes to stderr so it never mixes with report output.
if [ "$(id -u)" -ne 0 ]; then
  echo "This script must be run using sudo!" >&2
  exit 1
fi
|
||||
|
||||
# Handle help, and reject anything else that looks like an option, before
# treating $1 as a minion id. Without the second arm an unknown flag would be
# used as a filter that matches no minion.
case "${1:-}" in
  -h|--help) usage 0 ;;  # request exit status 0 for explicit help
  -*) usage ;;
esac

# Optional minion id to restrict the report to; empty means every minion.
FILTER_MINION="${1:-}"
|
||||
|
||||
#######################################
# Run psql against the so_telegraf database inside the so-postgres container.
# Output is unaligned, tuples-only and tab-separated (-At -F TAB) so callers
# can capture it directly.
# Arguments: passed through to psql unchanged (e.g. -c "<sql>")
# Returns:   exit status of `docker exec`
#######################################
so_psql() {
  docker exec so-postgres psql -U postgres -d so_telegraf -At -F $'\t' "$@"
}
|
||||
|
||||
# Bail out early when the so_telegraf database cannot be seen. `psql -lqt`
# prints one database per line with the name in the first '|'-separated column.
if ! docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d\| -f1 | grep -qw so_telegraf; then
  echo "Database so_telegraf not found. Is global.telegraf_output set to POSTGRES or BOTH?"
  exit 2
fi

# List telegraf schemas (role-per-minion naming convention: so_telegraf_<sanitized_minion_id>)
# '_' is a single-character wildcard in LIKE, so it is escaped to match the
# literal prefix only.
if ! SCHEMAS=$(so_psql -c "SELECT schema_name FROM information_schema.schemata WHERE schema_name LIKE 'so#_telegraf#_%' ESCAPE '#' ORDER BY schema_name;"); then
  # A failed query is not the same as "no schemas"; report it as an error.
  echo "Failed to list schemas in so_telegraf." >&2
  exit 2
fi

if [ -z "$SCHEMAS" ]; then
  echo "No minion schemas found in so_telegraf."
  exit 0
fi
|
||||
|
||||
#######################################
# Run one metric query for a minion schema, but only when the table it reads
# from exists in that schema.
# Arguments: $1 - schema name
#            $2 - table the query depends on
#            $3 - SQL to execute when the table is present
# Outputs:   whatever so_psql prints for the query
# Returns:   0 when the table is absent (query skipped); otherwise the
#            status of the so_psql call that ran the query
#######################################
print_metric() {
  local schema="$1" table="$2" query="$3"
  local sq="'"
  local exists

  # Confirm table exists in this schema before querying. Embedded single
  # quotes are doubled so the names stay inside their SQL string literals.
  exists=$(so_psql -c "SELECT 1 FROM information_schema.tables WHERE table_schema='${schema//$sq/$sq$sq}' AND table_name='${table//$sq/$sq$sq}' LIMIT 1;")
  [ -z "$exists" ] && return 0

  so_psql -c "$query"
}
|
||||
|
||||
#######################################
# Print the latest CPU, memory, disk and load rows for one minion schema.
# Arguments: $1 - schema name (so_telegraf_<sanitized_minion_id>)
# Outputs:   a banner followed by the rows print_metric emits
#######################################
report_minion() {
  local schema="$1"
  local minion="${schema#so_telegraf_}"

  echo "===================================================================="
  echo " Minion: $minion"
  echo "===================================================================="

  print_metric "$schema" "cpu" "
    SELECT 'cpu ' AS metric,
           to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
           round((100 - usage_idle)::numeric, 1) || '% used'
      FROM \"${schema}\".cpu
     WHERE cpu = 'cpu-total'
     ORDER BY time DESC LIMIT 1;"

  print_metric "$schema" "mem" "
    SELECT 'memory ' AS metric,
           to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
           round(used_percent::numeric, 1) || '% used (' ||
           pg_size_pretty(used) || ' of ' || pg_size_pretty(total) || ')'
      FROM \"${schema}\".mem
     ORDER BY time DESC LIMIT 1;"

  # One row per path, all taken from the newest sample.
  print_metric "$schema" "disk" "
    SELECT 'disk ' || rpad(path, 8) AS metric,
           to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
           round(used_percent::numeric, 1) || '% used (' ||
           pg_size_pretty(used) || ' of ' || pg_size_pretty(total) || ')'
      FROM \"${schema}\".disk
     WHERE time = (SELECT max(time) FROM \"${schema}\".disk)
     ORDER BY path;"

  print_metric "$schema" "system" "
    SELECT 'load ' AS metric,
           to_char(time, 'YYYY-MM-DD HH24:MI:SS') AS ts,
           load1 || ' / ' || load5 || ' / ' || load15 || ' (1/5/15m)'
      FROM \"${schema}\".system
     ORDER BY time DESC LIMIT 1;"
}

#######################################
# Report on every schema in a list, optionally restricted to one minion.
# Arguments: $1 - newline-separated schema names
#            $2 - minion id to filter on; empty reports on all schemas
# Outputs:   one report per matching schema, or a notice when a filter
#            matched nothing
# Returns:   0
#######################################
show_stats() {
  local schemas="$1" filter="$2"
  local schema want="" matched=0

  if [ -n "$filter" ]; then
    # Compare against the sanitized form used in schema names: '.' and '-'
    # become '_', and the result is lower-cased. Computed once, not per schema.
    want=${filter//[-.]/_}
    want=${want,,}
  fi

  # Read from fd 3 so nothing run inside the loop can consume the list.
  while IFS= read -r schema <&3; do
    [ -n "$schema" ] || continue
    if [ -n "$want" ] && [ "${schema#so_telegraf_}" != "$want" ]; then
      continue
    fi
    matched=$((matched + 1))
    report_minion "$schema"
    echo ""
  done 3<<< "$schemas"

  # A filter that matches nothing would otherwise produce no output at all.
  if [ -n "$filter" ] && [ "$matched" -eq 0 ]; then
    echo "No data found for minion: $filter"
  fi
  return 0
}

show_stats "$SCHEMAS" "$FILTER_MINION"
|
||||
@@ -0,0 +1,103 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
|
||||
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
|
||||
# https://securityonion.net/license; you may not use this file except in compliance with the
|
||||
# Elastic License 2.0.
|
||||
|
||||
# Deletes Telegraf metric rows older than the configured retention window from
|
||||
# every minion schema in the so_telegraf database. Intended to run daily from
|
||||
# cron. Retention comes from pillar (postgres.telegraf.retention_days),
|
||||
# defaulting to 14 days. An explicit --days argument overrides the pillar.
|
||||
|
||||
. /usr/sbin/so-common
|
||||
|
||||
#######################################
# Print command-line help and terminate the script.
# Arguments: $1 - optional exit status; defaults to 1 when omitted
# Outputs:   usage text on stdout
#######################################
usage() {
  cat <<EOF
Usage: $0 [--days N] [--dry-run]

  --days N     Override retention in days (default: pillar
               postgres.telegraf.retention_days, fallback 14)
  --dry-run    Report what would be deleted without modifying anything
EOF
  exit "${1:-1}"
}
|
||||
|
||||
# Refuse to run unprivileged; the rest of the script drives `docker exec`.
# The diagnostic goes to stderr so it stays out of the trim log on stdout.
if [ "$(id -u)" -ne 0 ]; then
  echo "This script must be run using sudo!" >&2
  exit 1
fi
|
||||
|
||||
DAYS=""
DRY_RUN=0

#######################################
# Parse command-line options.
# Globals:   DAYS (written), DRY_RUN (written)
# Arguments: the script's positional parameters
# Exits via usage on --help, on an unknown argument, or when --days is not
# followed by a positive integer.
#######################################
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --days)
        # With --days as the last word, `shift 2` fails without shifting and
        # the loop would never end, so check for the value first. An explicit
        # value must also be valid rather than silently replaced by a default.
        if [ $# -lt 2 ] || ! [[ "$2" =~ ^0*[1-9][0-9]*$ ]]; then
          echo "--days requires a positive integer" >&2
          usage
        fi
        DAYS="$2"
        shift 2
        ;;
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      -h|--help)
        usage 0  # request exit status 0 for explicit help
        ;;
      *)
        usage
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
#######################################
# Turn a retention value into a canonical positive day count.
# Arguments: $1 - candidate value (may be empty or garbage)
# Outputs:   the value in plain decimal when it is 1..99999 (leading zeros
#            stripped); otherwise the fallback of 14
#######################################
normalize_days() {
  local value="$1"
  # The digit cap keeps the comparison within integer range; 10# forces
  # base 10 so a leading zero is not read as octal.
  if [[ "$value" =~ ^0*([0-9]{1,5})$ ]] && (( 10#${BASH_REMATCH[1]} >= 1 )); then
    echo "$(( 10#${BASH_REMATCH[1]} ))"
  else
    echo 14
  fi
}

# No explicit value was given: fall back to the pillar setting.
if [ -z "$DAYS" ]; then
  DAYS=$(salt-call --local --out=newline_values_only pillar.get postgres:telegraf:retention_days 2>/dev/null)
fi
DAYS=$(normalize_days "$DAYS")
|
||||
|
||||
#######################################
# Write one timestamped, script-tagged line to stdout.
# Arguments: message words; joined with single spaces
#######################################
log() {
  printf '%s so-telegraf-trim: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
|
||||
|
||||
#######################################
# Run psql against the so_telegraf database inside the so-postgres container.
# Output is unaligned, tuples-only and tab-separated (-At -F TAB) so callers
# can capture it directly.
# Arguments: passed through to psql unchanged (e.g. -c "<sql>")
# Returns:   exit status of `docker exec`
#######################################
so_psql() {
  docker exec so-postgres psql -U postgres -d so_telegraf -At -F $'\t' "$@"
}
|
||||
|
||||
# Nothing to do when the so_telegraf database cannot be seen. `psql -lqt`
# prints one database per line with the name in the first '|'-separated
# column. This exits 0 so a scheduled run does not report a failure.
if ! docker exec so-postgres psql -U postgres -lqt 2>/dev/null | cut -d\| -f1 | grep -qw so_telegraf; then
  log "Database so_telegraf not present; nothing to trim."
  exit 0
fi
|
||||
|
||||
#######################################
# Delete (or, in dry-run mode, count) rows older than the retention window
# from every telegraf metric table.
# Globals:   DAYS (read)    - retention in days, digits only
#            DRY_RUN (read) - 1 to report without deleting
# Outputs:   progress lines via log
# Returns:   0 on success; 1 when DAYS is invalid or any table failed
#######################################
trim_telegraf_tables() {
  local dq='"'
  local tables schema table qschema qtable target result
  local total_deleted=0 failures=0
  local dry_run="${DRY_RUN:-0}"

  # DAYS is spliced into SQL below, so insist on plain digits.
  if ! [[ "$DAYS" =~ ^[0-9]+$ ]]; then
    log "Invalid retention '${DAYS}'; refusing to trim."
    return 1
  fi

  log "Trimming rows older than ${DAYS} days (dry_run=${dry_run})."

  # One row per (schema, table) we might want to trim, as two tab-separated
  # columns so a dot inside a table name cannot be mistaken for the separator.
  # Column name is 'time' for all telegraf output plugin tables; skip metadata
  # tables (tag_* used for tags_as_foreign_keys).
  # '_' is a LIKE wildcard, so it is escaped to match the literal prefix.
  tables=$(so_psql -c "
    SELECT table_schema, table_name
      FROM information_schema.columns
     WHERE column_name = 'time'
       AND data_type IN ('timestamp with time zone', 'timestamp without time zone')
       AND table_schema LIKE 'so#_telegraf#_%' ESCAPE '#'
     ORDER BY 1, 2;")

  if [ -z "$tables" ]; then
    log "No telegraf metric tables found."
    return 0
  fi

  # Read from fd 3 so nothing run inside the loop can consume the list.
  while IFS=$'\t' read -r schema table <&3; do
    if [ -z "$schema" ] || [ -z "$table" ]; then
      continue
    fi

    # Double embedded double quotes so each name stays one quoted identifier.
    qschema=${schema//$dq/$dq$dq}
    qtable=${table//$dq/$dq$dq}
    target="\"${qschema}\".\"${qtable}\""

    if [ "$dry_run" -eq 1 ]; then
      result=$(so_psql -c "SELECT count(*) FROM ${target} WHERE time < now() - interval '${DAYS} days';")
      log "would delete ${result:-0} rows from ${schema}.${table}"
      continue
    fi

    # RETURNING count via a CTE so we can log how much was trimmed per table
    result=$(so_psql -c "
      WITH d AS (
        DELETE FROM ${target}
         WHERE time < now() - interval '${DAYS} days'
        RETURNING 1
      )
      SELECT count(*) FROM d;")

    # Anything but a number means psql failed; do not feed it to arithmetic.
    if ! [[ "$result" =~ ^[0-9]+$ ]]; then
      log "failed to trim ${schema}.${table}"
      failures=$((failures + 1))
      continue
    fi

    total_deleted=$((total_deleted + result))
    if [ "$result" -gt 0 ]; then
      log "deleted ${result} rows from ${schema}.${table}"
    fi
  done 3<<< "$tables"

  if [ "$dry_run" -eq 0 ]; then
    log "Trim complete. Total rows deleted: ${total_deleted}."
  fi

  [ "$failures" -eq 0 ]
}

trim_telegraf_tables
|
||||
Reference in New Issue
Block a user