# Telegraf Configuration
#
# Jinja2 template rendered by Salt. Values come from the GLOBALS map,
# TELEGRAFMERGED/LOGSTASH_MERGED structures, and pillar data. The
# {% raw %}...{% endraw %} guard further down protects the Go-template
# syntax used by the PostgreSQL output plugin from Jinja rendering.
{%- set INFLUXDBHOST = GLOBALS.influxdb_host %}
{%- set ES_USER = salt['pillar.get']('elasticsearch:auth:users:so_elastic_user:user', '') %}
{%- set ES_PASS = salt['pillar.get']('elasticsearch:auth:users:so_elastic_user:pass', '') %}
{%- set TOKEN = salt['pillar.get']('influxdb:token', '') %}
{%- set NODEIP = GLOBALS.node_ip %}
{#- NOTE(review): UNIQUEID, ZEEK_ENABLED and MDENGINE are assigned but never
    referenced anywhere in this template — confirm they are still needed
    before removing. #}
{%- set UNIQUEID = salt['pillar.get']('sensor:uniqueid', '') %}
{%- set ZEEK_ENABLED = salt['pillar.get']('zeek:enabled', True) %}
{%- set MDENGINE = GLOBALS.md_engine %}
{%- set LOGSTASH_ENABLED = LOGSTASH_MERGED.enabled %}
{%- set TG_OUT = TELEGRAFMERGED.output | upper %}
{%- set PG_HOST = GLOBALS.manager_ip %}
{#- Per-minion telegraf creds live in the grid-wide telegraf/creds.sls pillar,
    written by /usr/sbin/so-telegraf-cred on the manager. Each minion looks up
    its own entry by grains.id. #}
{%- set PG_ENTRY = salt['pillar.get']('telegraf:postgres_creds:' ~ grains.id, {}) %}
{%- set PG_USER = PG_ENTRY.get('user', '') %}
{%- set PG_PASS = PG_ENTRY.get('pass', '') %}

# Global tags can be specified here in key="value" format.
[global_tags]
role = "{{ GLOBALS.role.split('-') | last }}"

# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "{{ TELEGRAFMERGED.config.interval }}"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will send metrics to outputs in batches of at most
## metric_batch_size metrics.
## This controls the size of writes that Telegraf sends to output plugins.
metric_batch_size = {{ TELEGRAFMERGED.config.metric_batch_size }}

## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
## This buffer only fills when writes fail to output plugin(s).
metric_buffer_limit = {{ TELEGRAFMERGED.config.metric_buffer_limit }}

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "{{ TELEGRAFMERGED.config.collection_jitter }}"

## Default flushing interval for all outputs. Maximum flush_interval will be
## flush_interval + flush_jitter
flush_interval = "{{ TELEGRAFMERGED.config.flush_interval }}"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "{{ TELEGRAFMERGED.config.flush_jitter }}"

## By default or when set to "0s", precision will be set to the same
## timestamp order as the collection interval, with the maximum being 1s.
## ie, when interval = "10s", precision will be "1s"
##     when interval = "250ms", precision will be "1ms"
## Precision will NOT be used for service inputs. It is up to each individual
## service input to set the timestamp at the appropriate precision.
## Valid time units are "ns", "us" (or "µs"), "ms", "s".
precision = ""

## Logging configuration:
## Run telegraf with debug log messages.
debug = {{ 'true' if TELEGRAFMERGED.config.debug else 'false' }}
## Run telegraf in quiet mode (error log messages only).
quiet = {{ 'true' if TELEGRAFMERGED.config.quiet else 'false' }}
## Specify the log file name. The empty string means to log to stderr.
logfile = "/var/log/telegraf/telegraf.log"

## Override default hostname, if empty use os.Hostname()
hostname = "{{ GLOBALS.hostname | lower }}"
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = false


###############################################################################
#                            OUTPUT PLUGINS                                   #
###############################################################################
{%- if TG_OUT in ['INFLUXDB', 'BOTH'] %}

# Configuration for sending metrics to InfluxDB
[[outputs.influxdb_v2]]
urls = ["https://{{ INFLUXDBHOST }}:8086"]
token = "{{ TOKEN }}"
organization = "Security Onion"
bucket = "telegraf/so_short_term"
## Optional TLS Config for use on HTTP connections.
tls_ca = "/etc/telegraf/ca.crt"
tls_cert = "/etc/telegraf/telegraf.crt"
tls_key = "/etc/telegraf/telegraf.key"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
{%- endif %}
{%- if TG_OUT in ['POSTGRES', 'BOTH'] and PG_USER and PG_PASS %}

# Configuration for sending metrics to PostgreSQL.
# options='-c role=so_telegraf' makes every connection SET ROLE to the shared
# group role so tables created on first write are owned by so_telegraf, and
# all per-minion members can INSERT/SELECT them via role inheritance.
# fields_as_jsonb/tags_as_jsonb keep metric tables at a fixed column count so
# high-cardinality inputs (docker, procstat, kafka) don't blow past the
# Postgres 1600-column-per-table limit.
[[outputs.postgresql]]
# NOTE(review): PG_PASS is interpolated into a libpq keyword/value DSN without
# quoting; a password containing spaces or single quotes would break the
# connection string — confirm so-telegraf-cred only emits DSN-safe passwords.
connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt options='-c role=so_telegraf'"
schema = "telegraf"
tags_as_foreign_keys = true
tags_as_jsonb = true
fields_as_jsonb = true
# Every metric table is a daily time-range partitioned parent managed by
# pg_partman. Retention drops old partitions instead of row-by-row DELETEs.
{% raw %}
# pg_partman 5.x requires the control column (time) to be NOT NULL, so
# ALTER it before create_parent(). And create_parent() splits
# p_parent_table on '.' to look up raw identifiers, so the literal must
# be 'schema.name' (not '"schema"."name"' as .table|quoteLiteral emits).
# IF NOT EXISTS keeps the three templates idempotent so a Telegraf
# restart after any DB-side surgery re-runs them safely.
create_templates = [
    '''CREATE TABLE IF NOT EXISTS {{ .table }} ({{ .columns }}) PARTITION BY RANGE ("time")''',
    '''ALTER TABLE {{ .table }} ALTER COLUMN "time" SET NOT NULL''',
    '''SELECT partman.create_parent(p_parent_table := {{ printf "%s.%s" .table.Schema .table.Name | quoteLiteral }}, p_control := 'time', p_type := 'range', p_interval := '1 day', p_premake := 3) WHERE NOT EXISTS (SELECT 1 FROM partman.part_config WHERE parent_table = {{ printf "%s.%s" .table.Schema .table.Name | quoteLiteral }})'''
]
tag_table_create_templates = [
    '''CREATE TABLE IF NOT EXISTS {{ .table }} ({{ .columns }}, PRIMARY KEY (tag_id))'''
]
{% endraw %}
{%- endif %}


###############################################################################
#                            PROCESSOR PLUGINS                                #
###############################################################################


###############################################################################
#                            AGGREGATOR PLUGINS                               #
###############################################################################


###############################################################################
#                            INPUT PLUGINS                                    #
###############################################################################

# Read metrics about cpu usage
[[inputs.cpu]]
## Whether to report per-cpu stats or not
percpu = true
## Whether to report total system cpu stats or not
totalcpu = true
## If true, collect raw CPU time metrics.
collect_cpu_time = false
## If true, compute and report the sum of all non-idle CPU states.
report_active = false

# Read metrics about disk usage by mount point
[[inputs.disk]]
## By default stats will be gathered for all mount points.
## Set mount_points will restrict the stats to only the specified mount points.
mount_points = ["/host", "/host/nsm", "/host/var", "/host/var/log", "/host/var/log/audit", "/host/var/tmp"]
## Ignore mount points by filesystem type.
#ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"]

# Read metrics about disk IO by device
[[inputs.diskio]]
## By default, telegraf will gather stats for all devices including
## disk partitions.
## Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb", "vd*"]
## Uncomment the following line if you need disk serial numbers.
# skip_serial_number = false
#
## On systems which support it, device metadata can be added in the form of
## tags.
## Currently only Linux is supported via udev properties. You can view
## available properties for a device by running:
## 'udevadm info -q property -n /dev/sda'
# device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"]
#
## Using the same metadata source as device_tags, you can also customize the
## name of the device via templates.
## The 'name_templates' parameter is a list of templates to try and apply to
## the device. The template may contain variables in the form of '$PROPERTY' or
## '${PROPERTY}'. The first template which does not contain any variables not
## present for the device is used as the device name tag.
## The typical use case is for LVM volumes, to get the VG/LV name instead of
## the near-meaningless DM-0 name.
# name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"]

# Get kernel statistics from /proc/stat
[[inputs.kernel]]
# no configuration

# Read metrics about memory usage
[[inputs.mem]]
# no configuration

# Get the number of processes and group them by status
[[inputs.processes]]
# no configuration

# Read metrics about swap memory usage
[[inputs.swap]]
# no configuration

# Read metrics about system load & uptime
[[inputs.system]]
# no configuration

# # Collect bond interface status, slaves statuses and failures count
[[inputs.bond]]
# ## Sets 'proc' directory path
# ## If not specified, then default is /proc
# # host_proc = "/proc"
# #
# ## By default, telegraf gather stats for all bond interfaces
# ## Setting interfaces will restrict the stats to the specified
# ## bond interfaces.
# # bond_interfaces = ["bond0"]
#

# # Read metrics about docker containers
[[inputs.docker]]
# ## Docker Endpoint
# ##   To use TCP, set endpoint = "tcp://[ip]:[port]"
# ##   To use environment variables (ie, docker-machine), set endpoint = "ENV"
endpoint = "unix:///var/run/docker.sock"
#

# # Read stats from one or more Elasticsearch servers or clusters
{%- if GLOBALS.is_manager or GLOBALS.role == 'so-heavynode' %}
[[inputs.elasticsearch]]
servers = ["https://{{ NODEIP }}:9200"]
cluster_stats = true
username = "{{ ES_USER }}"
password = "{{ ES_PASS }}"
insecure_skip_verify = true
{%- elif grains.role == 'so-searchnode' %}
[[inputs.elasticsearch]]
servers = ["https://{{ NODEIP }}:9200"]
cluster_stats = false
username = "{{ ES_USER }}"
password = "{{ ES_PASS }}"
insecure_skip_verify = true
{%- endif %}
#   ## Timeout for HTTP requests to the elastic search server(s)
#   http_timeout = "5s"
#
#   ## When local is true (the default), the node will read only its own stats.
#   ## Set local to false when you want to read the node stats from all nodes
#   ## of the cluster.
#   local = true
#
#   ## Set cluster_health to true when you want to also obtain cluster health stats
#   cluster_health = false
#
#   ## Adjust cluster_health_level when you want to also obtain detailed health stats
#   ## The options are
#   ##  - indices (default)
#   ##  - cluster
#   # cluster_health_level = "indices"
#
#   ## Set cluster_stats to true when you want to also obtain cluster stats from the
#   ## Master node.
#   cluster_stats = false
#
#   ## node_stats is a list of sub-stats that you want to have gathered. Valid options
#   ## are "indices", "os", "process", "jvm", "thread_pool", "fs", "transport", "http",
#   ## "breaker". Per default, all stats are gathered.
#   # node_stats = ["jvm", "http"]
#
#   ## Optional TLS Config
#   # tls_ca = "/etc/telegraf/ca.pem"
#   # tls_cert = "/etc/telegraf/cert.pem"
#   # tls_key = "/etc/telegraf/key.pem"
#   ## Use TLS but skip chain & host verification
#   # insecure_skip_verify = false

{#- Fleet nodes do not have pillar access to logstash credentials #}
{%- if LOGSTASH_ENABLED and grains.role != 'so-fleet' %}
[[inputs.logstash]]
url = "http://localhost:9600"
collect = ["pipelines"]
username = "{{ salt['pillar.get']('elasticsearch:auth:users:so_logstash_user:user') }}"
password = "{{ salt['pillar.get']('elasticsearch:auth:users:so_logstash_user:pass') }}"
{%- endif %}

{% if grains.role in ['so-manager','so-managersearch','so-standalone','so-receiver'] and GLOBALS.pipeline == "KAFKA" -%}
# Kafka broker metrics via the Jolokia JMX-over-HTTP agent.
[[inputs.jolokia2_agent]]
name_prefix = "kafka_"
urls = ["http://{{ NODEIP }}:8778/jolokia"]

[[inputs.jolokia2_agent.metric]]
name = "topics"
mbean = "kafka.server:name=*,type=BrokerTopicMetrics"
field_prefix = "$1."

[[inputs.jolokia2_agent.metric]]
name = "topic"
mbean = "kafka.server:name=*,topic=*,type=BrokerTopicMetrics"
field_prefix = "$1."
tag_keys = ["topic"]

[[inputs.jolokia2_agent.metric]]
name = "controller"
mbean = "kafka.controller:name=*,type=*"
field_prefix = "$1."

[[inputs.jolokia2_agent.metric]]
name = "partition"
mbean = "kafka.log:name=*,partition=*,topic=*,type=Log"
field_name = "$1"
tag_keys = ["topic", "partition"]

[[inputs.jolokia2_agent.metric]]
name = "partition"
mbean = "kafka.cluster:name=UnderReplicated,partition=*,topic=*,type=Partition"
field_name = "UnderReplicatedPartitions"
tag_keys = ["topic", "partition"]
{%- endif %}

# # Read metrics from one or more commands that can output to stdout
{#- sostatus.sh gets its own inputs.exec so it can run on a fixed 60s interval;
    it is removed from the role's script list so the generic exec block below
    does not run it a second time. #}
{%- if 'sostatus.sh' in TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]] %}
{%- do TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]].remove('sostatus.sh') %}
[[inputs.exec]]
commands = [ "/scripts/sostatus.sh" ]
data_format = "influx"
timeout = "15s"
interval = "60s"
{%- endif %}
{%- if TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]] | length > 0 %}
[[inputs.exec]]
commands = [
{%- for script in TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]] %}
  "/scripts/{{script}}"{% if not loop.last %},{% endif %}
{%- endfor %}
]
data_format = "influx"
## Timeout for each command to complete.
timeout = "15s"
{%- endif %}

{%- if salt['pillar.get']('healthcheck:enabled', False) %}
[[inputs.file]]
files = ["/host/nsm/zeek/logs/zeek_restart.log"]
data_format = "influx"
{%- endif %}

[[inputs.file]]
files = ["/etc/telegraf/node_config.json"]
name_override = "node_config"
data_format = "json"
interval = "5m"
json_string_fields = ['manint', 'monint']
tag_keys = ['role']

# # Read metrics about network interface usage
[[inputs.net]]

# Most scripts run every TELEGRAFMERGED.config.interval (30s by default); the
# ES index size script is expensive and doesn't need to run that frequently,
# so it gets its own exec input with a 1h interval.
{%- if GLOBALS.is_manager or GLOBALS.role == 'so-heavynode' %}
[[inputs.exec]]
commands = [ "/scripts/esindexsize.sh" ]
data_format = "influx"
interval = "1h"
timeout = "120s"
{%- endif %}