mirror of
https://github.com/Security-Onion-Solutions/securityonion.git
synced 2026-05-09 21:02:36 +02:00
614f32c5e0
The old flow had two writers for each per-minion Telegraf password
(so-minion wrote the minion pillar; postgres.auth regenerated any
missing aggregate entries). They drifted on first-boot and there was
no trigger to create DB roles when a new minion joined.
Split responsibilities:
- pillar/postgres/auth.sls (manager-scoped) keeps only the so_postgres
admin cred.
- pillar/telegraf/creds.sls (grid-wide) holds a {minion_id: {user,
pass}} map, shadowed per-install by the local-pillar copy.
- salt/manager/tools/sbin/so-telegraf-cred is the single writer:
flock, atomic YAML write, PyYAML safe_dump so passwords never
round-trip through so-yaml.py's type coercion. Idempotent add, quiet
remove.
- so-minion's add/remove hooks now shell out to so-telegraf-cred
instead of editing pillar files directly.
- postgres.telegraf_users iterates the new pillar key and CREATE/ALTERs
roles from it; telegraf.conf reads its own entry via grains.id.
- orch.deploy_newnode runs postgres.telegraf_users on the manager and
refreshes the new minion's pillar before the new node highstates,
so the DB role is in place the first time telegraf tries to connect.
- soup's post_to_3.1.0 backfills the creds pillar from accepted salt
keys (idempotent) and runs postgres.telegraf_users once to reconcile
the DB.
383 lines
15 KiB
Plaintext
383 lines
15 KiB
Plaintext
# Telegraf Configuration
{%- set INFLUXDBHOST = GLOBALS.influxdb_host %}
{%- set ES_USER = salt['pillar.get']('elasticsearch:auth:users:so_elastic_user:user', '') %}
{%- set ES_PASS = salt['pillar.get']('elasticsearch:auth:users:so_elastic_user:pass', '') %}
{%- set TOKEN = salt['pillar.get']('influxdb:token', '') %}
{%- set NODEIP = GLOBALS.node_ip %}
{%- set UNIQUEID = salt['pillar.get']('sensor:uniqueid', '') %}
{%- set ZEEK_ENABLED = salt['pillar.get']('zeek:enabled', True) %}
{%- set MDENGINE = GLOBALS.md_engine %}
{%- set LOGSTASH_ENABLED = LOGSTASH_MERGED.enabled %}
{%- set TG_OUT = TELEGRAFMERGED.output | upper %}
{%- set PG_HOST = GLOBALS.manager_ip %}
{#- Per-minion telegraf creds live in the grid-wide telegraf/creds.sls pillar,
    written by /usr/sbin/so-telegraf-cred on the manager. Each minion looks up
    its own entry by grains.id. #}
{%- set PG_ENTRY = salt['pillar.get']('telegraf:postgres_creds:' ~ grains.id, {}) %}
{%- set PG_USER = PG_ENTRY.get('user', '') %}
{%- set PG_PASS = PG_ENTRY.get('pass', '') %}

# Global tags can be specified here in key="value" format.
[global_tags]
role = "{{ GLOBALS.role.split('-') | last }}"
|
# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "{{ TELEGRAFMERGED.config.interval }}"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will send metrics to outputs in batches of at most
## metric_batch_size metrics.
## This controls the size of writes that Telegraf sends to output plugins.
metric_batch_size = {{ TELEGRAFMERGED.config.metric_batch_size }}

## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
## This buffer only fills when writes fail to output plugin(s).
metric_buffer_limit = {{ TELEGRAFMERGED.config.metric_buffer_limit }}

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "{{ TELEGRAFMERGED.config.collection_jitter }}"

## Default flushing interval for all outputs. Maximum flush_interval will be
## flush_interval + flush_jitter
flush_interval = "{{ TELEGRAFMERGED.config.flush_interval }}"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "{{ TELEGRAFMERGED.config.flush_jitter }}"

## By default or when set to "0s", precision will be set to the same
## timestamp order as the collection interval, with the maximum being 1s.
## ie, when interval = "10s", precision will be "1s"
##     when interval = "250ms", precision will be "1ms"
## Precision will NOT be used for service inputs. It is up to each individual
## service input to set the timestamp at the appropriate precision.
## Valid time units are "ns", "us" (or "µs"), "ms", "s".
precision = ""

## Logging configuration:
## Run telegraf with debug log messages.
debug = {{ 'true' if TELEGRAFMERGED.config.debug else 'false' }}
## Run telegraf in quiet mode (error log messages only).
quiet = {{ 'true' if TELEGRAFMERGED.config.quiet else 'false' }}
## Specify the log file name. The empty string means to log to stderr.
logfile = "/var/log/telegraf/telegraf.log"

## Override default hostname, if empty use os.Hostname()
hostname = "{{ GLOBALS.hostname | lower }}"
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = false
###############################################################################
#                               OUTPUT PLUGINS                                #
###############################################################################

{%- if TG_OUT in ['INFLUXDB', 'BOTH'] %}
# Configuration for sending metrics to InfluxDB
[[outputs.influxdb_v2]]
urls = ["https://{{ INFLUXDBHOST }}:8086"]
token = "{{ TOKEN }}"
organization = "Security Onion"
bucket = "telegraf/so_short_term"

## Optional TLS Config for use on HTTP connections.
tls_ca = "/etc/telegraf/ca.crt"
tls_cert = "/etc/telegraf/telegraf.crt"
tls_key = "/etc/telegraf/telegraf.key"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
{%- endif %}
|
{%- if TG_OUT in ['POSTGRES', 'BOTH'] and PG_USER and PG_PASS %}
# Configuration for sending metrics to PostgreSQL.
# options='-c role=so_telegraf' makes every connection SET ROLE to the shared
# group role so tables created on first write are owned by so_telegraf, and
# all per-minion members can INSERT/SELECT them via role inheritance.
# fields_as_jsonb/tags_as_jsonb keep metric tables at a fixed column count so
# high-cardinality inputs (docker, procstat, kafka) don't blow past the
# Postgres 1600-column-per-table limit.
[[outputs.postgresql]]
connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt options='-c role=so_telegraf'"
schema = "telegraf"
tags_as_foreign_keys = true
tags_as_jsonb = true
fields_as_jsonb = true
# Every metric table is a daily time-range partitioned parent managed by
# pg_partman. Retention drops old partitions instead of row-by-row DELETEs.
{% raw %}
# pg_partman 5.x requires the control column (time) to be NOT NULL, so
# ALTER it before create_parent(). And create_parent() splits
# p_parent_table on '.' to look up raw identifiers, so the literal must
# be 'schema.name' (not '"schema"."name"' as .table|quoteLiteral emits).
# IF NOT EXISTS keeps the three templates idempotent so a Telegraf
# restart after any DB-side surgery re-runs them safely.
create_templates = [
    '''CREATE TABLE IF NOT EXISTS {{ .table }} ({{ .columns }}) PARTITION BY RANGE ("time")''',
    '''ALTER TABLE {{ .table }} ALTER COLUMN "time" SET NOT NULL''',
    '''SELECT partman.create_parent(p_parent_table := {{ printf "%s.%s" .table.Schema .table.Name | quoteLiteral }}, p_control := 'time', p_type := 'range', p_interval := '1 day', p_premake := 3) WHERE NOT EXISTS (SELECT 1 FROM partman.part_config WHERE parent_table = {{ printf "%s.%s" .table.Schema .table.Name | quoteLiteral }})'''
]
tag_table_create_templates = [
    '''CREATE TABLE IF NOT EXISTS {{ .table }} ({{ .columns }}, PRIMARY KEY (tag_id))'''
]
{% endraw %}
{%- endif %}
|
###############################################################################
#                             PROCESSOR PLUGINS                               #
###############################################################################

###############################################################################
#                             AGGREGATOR PLUGINS                              #
###############################################################################

###############################################################################
#                               INPUT PLUGINS                                 #
###############################################################################
|
# Read metrics about cpu usage
[[inputs.cpu]]
## Whether to report per-cpu stats or not
percpu = true
## Whether to report total system cpu stats or not
totalcpu = true
## If true, collect raw CPU time metrics.
collect_cpu_time = false
## If true, compute and report the sum of all non-idle CPU states.
report_active = false


# Read metrics about disk usage by mount point
[[inputs.disk]]
## By default stats will be gathered for all mount points.
## Set mount_points will restrict the stats to only the specified mount points.
mount_points = ["/host",
    "/host/nsm",
    "/host/var",
    "/host/var/log",
    "/host/var/log/audit",
    "/host/var/tmp"
]

## Ignore mount points by filesystem type.
#ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"]


# Read metrics about disk IO by device
[[inputs.diskio]]
## By default, telegraf will gather stats for all devices including
## disk partitions.
## Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb", "vd*"]
## Uncomment the following line if you need disk serial numbers.
# skip_serial_number = false
#
## On systems which support it, device metadata can be added in the form of
## tags.
## Currently only Linux is supported via udev properties. You can view
## available properties for a device by running:
## 'udevadm info -q property -n /dev/sda'
# device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"]
#
## Using the same metadata source as device_tags, you can also customize the
## name of the device via templates.
## The 'name_templates' parameter is a list of templates to try and apply to
## the device. The template may contain variables in the form of '$PROPERTY' or
## '${PROPERTY}'. The first template which does not contain any variables not
## present for the device is used as the device name tag.
## The typical use case is for LVM volumes, to get the VG/LV name instead of
## the near-meaningless DM-0 name.
# name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"]


# Get kernel statistics from /proc/stat
[[inputs.kernel]]
# no configuration

# Read metrics about memory usage
[[inputs.mem]]
# no configuration

# Get the number of processes and group them by status
[[inputs.processes]]
# no configuration

# Read metrics about swap memory usage
[[inputs.swap]]
# no configuration

# Read metrics about system load & uptime
[[inputs.system]]
# no configuration
|
# # Collect bond interface status, slaves statuses and failures count
[[inputs.bond]]
# ## Sets 'proc' directory path
# ## If not specified, then default is /proc
# # host_proc = "/proc"
#
# ## By default, telegraf gather stats for all bond interfaces
# ## Setting interfaces will restrict the stats to the specified
# ## bond interfaces.
# # bond_interfaces = ["bond0"]

# # Read metrics about docker containers
[[inputs.docker]]
# ## Docker Endpoint
# ##   To use TCP, set endpoint = "tcp://[ip]:[port]"
# ##   To use environment variables (ie, docker-machine), set endpoint = "ENV"
endpoint = "unix:///var/run/docker.sock"
#
|
# # Read stats from one or more Elasticsearch servers or clusters
{%- if GLOBALS.is_manager or GLOBALS.role == 'so-heavynode' %}
[[inputs.elasticsearch]]
servers = ["https://{{ NODEIP }}:9200"]
cluster_stats = true
username = "{{ ES_USER }}"
password = "{{ ES_PASS }}"
insecure_skip_verify = true
{%- elif grains['role'] in ['so-searchnode'] %}
[[inputs.elasticsearch]]
servers = ["https://{{ NODEIP }}:9200"]
cluster_stats = false
username = "{{ ES_USER }}"
password = "{{ ES_PASS }}"
insecure_skip_verify = true
{%- endif %}

# ## Timeout for HTTP requests to the elastic search server(s)
# http_timeout = "5s"
#
# ## When local is true (the default), the node will read only its own stats.
# ## Set local to false when you want to read the node stats from all nodes
# ## of the cluster.
# local = true
#
# ## Set cluster_health to true when you want to also obtain cluster health stats
# cluster_health = false
#
# ## Adjust cluster_health_level when you want to also obtain detailed health stats
# ## The options are
# ##   - indices (default)
# ##   - cluster
# # cluster_health_level = "indices"
#
# ## Set cluster_stats to true when you want to also obtain cluster stats from the
# ## Master node.
# cluster_stats = false
#
# ## node_stats is a list of sub-stats that you want to have gathered. Valid options
# ## are "indices", "os", "process", "jvm", "thread_pool", "fs", "transport", "http",
# ## "breaker". Per default, all stats are gathered.
# # node_stats = ["jvm", "http"]
#
# ## Optional TLS Config
# # tls_ca = "/etc/telegraf/ca.pem"
# # tls_cert = "/etc/telegraf/cert.pem"
# # tls_key = "/etc/telegraf/key.pem"
# ## Use TLS but skip chain & host verification
# # insecure_skip_verify = false
|
{#- Fleet nodes do not have pillar access to logstash credentials #}
{%- if LOGSTASH_ENABLED and grains.role != 'so-fleet' %}
[[inputs.logstash]]
url = "http://localhost:9600"
collect = ["pipelines"]
username = "{{ salt['pillar.get']('elasticsearch:auth:users:so_logstash_user:user') }}"
password = "{{ salt['pillar.get']('elasticsearch:auth:users:so_logstash_user:pass') }}"
{%- endif %}
|
{% if grains.role in ['so-manager','so-managersearch','so-standalone','so-receiver'] and GLOBALS.pipeline == "KAFKA" -%}
[[inputs.jolokia2_agent]]
name_prefix = "kafka_"
urls = ["http://{{ NODEIP }}:8778/jolokia"]

[[inputs.jolokia2_agent.metric]]
name = "topics"
mbean = "kafka.server:name=*,type=BrokerTopicMetrics"
field_prefix = "$1."

[[inputs.jolokia2_agent.metric]]
name = "topic"
mbean = "kafka.server:name=*,topic=*,type=BrokerTopicMetrics"
field_prefix = "$1."
tag_keys = ["topic"]

[[inputs.jolokia2_agent.metric]]
name = "controller"
mbean = "kafka.controller:name=*,type=*"
field_prefix = "$1."

[[inputs.jolokia2_agent.metric]]
name = "partition"
mbean = "kafka.log:name=*,partition=*,topic=*,type=Log"
field_name = "$1"
tag_keys = ["topic", "partition"]

[[inputs.jolokia2_agent.metric]]
name = "partition"
mbean = "kafka.cluster:name=UnderReplicated,partition=*,topic=*,type=Partition"
field_name = "UnderReplicatedPartitions"
tag_keys = ["topic", "partition"]

{%- endif %}
|
# # Read metrics from one or more commands that can output to stdout
{%- if 'sostatus.sh' in TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]] %}
{%- do TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]].remove('sostatus.sh') %}
[[inputs.exec]]
commands = [
    "/scripts/sostatus.sh"
]
data_format = "influx"
timeout = "15s"
interval = "60s"
{%- endif %}

{%- if TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]] | length > 0 %}
[[inputs.exec]]
commands = [
{%- for script in TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]] %}
    "/scripts/{{script}}"{% if not loop.last %},{% endif %}
{%- endfor %}
]
data_format = "influx"
## Timeout for each command to complete.
timeout = "15s"
{%- endif %}
|
{%- if salt['pillar.get']('healthcheck:enabled', False) %}
[[inputs.file]]
files = ["/host/nsm/zeek/logs/zeek_restart.log"]
data_format = "influx"
{%- endif %}

[[inputs.file]]
files = ["/etc/telegraf/node_config.json"]
name_override = "node_config"
data_format = "json"
interval = "5m"
json_string_fields = ['manint', 'monint']
tag_keys = ['role']
|
# # Read metrics about network interface usage
[[inputs.net]]

# Scripts run every 30s||TELEGRAFMERGED.config.interval - ES index script doesn't need to run as frequently
{%- if GLOBALS.is_manager or GLOBALS.role == 'so-heavynode' %}
[[inputs.exec]]
commands = [
    "/scripts/esindexsize.sh"
]
data_format = "influx"
interval = "1h"
timeout = "120s"
{%- endif %}