mirror of
https://github.com/Security-Onion-Solutions/securityonion.git
synced 2026-05-09 21:02:36 +02:00
614f32c5e0
The old flow had two writers for each per-minion Telegraf password
(so-minion wrote the minion pillar; postgres.auth regenerated any
missing aggregate entries). They drifted on first-boot and there was
no trigger to create DB roles when a new minion joined.
Split responsibilities:
- pillar/postgres/auth.sls (manager-scoped) keeps only the so_postgres
admin cred.
- pillar/telegraf/creds.sls (grid-wide) holds a {minion_id: {user,
pass}} map, shadowed per-install by the local-pillar copy.
- salt/manager/tools/sbin/so-telegraf-cred is the single writer:
flock, atomic YAML write, PyYAML safe_dump so passwords never
round-trip through so-yaml.py's type coercion. Idempotent add, quiet
remove.
- so-minion's add/remove hooks now shell out to so-telegraf-cred
instead of editing pillar files directly.
- postgres.telegraf_users iterates the new pillar key and CREATE/ALTERs
roles from it; telegraf.conf reads its own entry via grains.id.
- orch.deploy_newnode runs postgres.telegraf_users on the manager and
refreshes the new minion's pillar before the new node highstates,
so the DB role is in place the first time telegraf tries to connect.
- soup's post_to_3.1.0 backfills the creds pillar from accepted salt
keys (idempotent) and runs postgres.telegraf_users once to reconcile
the DB.
383 lines
15 KiB
Plaintext
383 lines
15 KiB
Plaintext
# Telegraf Configuration
{%- set INFLUXDBHOST = GLOBALS.influxdb_host %}
{%- set ES_USER = salt['pillar.get']('elasticsearch:auth:users:so_elastic_user:user', '') %}
{%- set ES_PASS = salt['pillar.get']('elasticsearch:auth:users:so_elastic_user:pass', '') %}
{%- set TOKEN = salt['pillar.get']('influxdb:token', '') %}
{%- set NODEIP = GLOBALS.node_ip %}
{%- set UNIQUEID = salt['pillar.get']('sensor:uniqueid', '') %}
{%- set ZEEK_ENABLED = salt['pillar.get']('zeek:enabled', True) %}
{%- set MDENGINE = GLOBALS.md_engine %}
{%- set LOGSTASH_ENABLED = LOGSTASH_MERGED.enabled %}
{%- set TG_OUT = TELEGRAFMERGED.output | upper %}
{%- set PG_HOST = GLOBALS.manager_ip %}
{#- Per-minion telegraf creds live in the grid-wide telegraf/creds.sls pillar,
    written by /usr/sbin/so-telegraf-cred on the manager. Each minion looks up
    its own entry by grains.id. #}
{%- set PG_ENTRY = salt['pillar.get']('telegraf:postgres_creds:' ~ grains.id, {}) %}
{%- set PG_USER = PG_ENTRY.get('user', '') %}
{%- set PG_PASS = PG_ENTRY.get('pass', '') %}

# Global tags can be specified here in key="value" format.
[global_tags]
role = "{{ GLOBALS.role.split('-') | last }}"
|
# Configuration for telegraf agent
[agent]
## Default data collection interval for all inputs
interval = "{{ TELEGRAFMERGED.config.interval }}"
## Rounds collection interval to 'interval'
## ie, if interval="10s" then always collect on :00, :10, :20, etc.
round_interval = true

## Telegraf will send metrics to outputs in batches of at most
## metric_batch_size metrics.
## This controls the size of writes that Telegraf sends to output plugins.
metric_batch_size = {{ TELEGRAFMERGED.config.metric_batch_size }}

## For failed writes, telegraf will cache metric_buffer_limit metrics for each
## output, and will flush this buffer on a successful write. Oldest metrics
## are dropped first when this buffer fills.
## This buffer only fills when writes fail to output plugin(s).
metric_buffer_limit = {{ TELEGRAFMERGED.config.metric_buffer_limit }}

## Collection jitter is used to jitter the collection by a random amount.
## Each plugin will sleep for a random time within jitter before collecting.
## This can be used to avoid many plugins querying things like sysfs at the
## same time, which can have a measurable effect on the system.
collection_jitter = "{{ TELEGRAFMERGED.config.collection_jitter }}"

## Default flushing interval for all outputs. Maximum flush_interval will be
## flush_interval + flush_jitter
flush_interval = "{{ TELEGRAFMERGED.config.flush_interval }}"
## Jitter the flush interval by a random amount. This is primarily to avoid
## large write spikes for users running a large number of telegraf instances.
## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
flush_jitter = "{{ TELEGRAFMERGED.config.flush_jitter }}"

## By default or when set to "0s", precision will be set to the same
## timestamp order as the collection interval, with the maximum being 1s.
## ie, when interval = "10s", precision will be "1s"
##     when interval = "250ms", precision will be "1ms"
## Precision will NOT be used for service inputs. It is up to each individual
## service input to set the timestamp at the appropriate precision.
## Valid time units are "ns", "us" (or "µs"), "ms", "s".
precision = ""

## Logging configuration:
## Run telegraf with debug log messages.
debug = {{ 'true' if TELEGRAFMERGED.config.debug else 'false' }}
## Run telegraf in quiet mode (error log messages only).
quiet = {{ 'true' if TELEGRAFMERGED.config.quiet else 'false' }}
## Specify the log file name. The empty string means to log to stderr.
logfile = "/var/log/telegraf/telegraf.log"

## Override default hostname, if empty use os.Hostname()
hostname = "{{ GLOBALS.hostname | lower }}"
## If set to true, do not set the "host" tag in the telegraf agent.
omit_hostname = false
###############################################################################
#                               OUTPUT PLUGINS                                #
###############################################################################

{%- if TG_OUT in ['INFLUXDB', 'BOTH'] %}
# Configuration for sending metrics to InfluxDB
[[outputs.influxdb_v2]]
urls = ["https://{{ INFLUXDBHOST }}:8086"]
token = "{{ TOKEN }}"
organization = "Security Onion"
bucket = "telegraf/so_short_term"

## Optional TLS Config for use on HTTP connections.
tls_ca = "/etc/telegraf/ca.crt"
tls_cert = "/etc/telegraf/telegraf.crt"
tls_key = "/etc/telegraf/telegraf.key"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
{%- endif %}
|
{%- if TG_OUT in ['POSTGRES', 'BOTH'] and PG_USER and PG_PASS %}
# Configuration for sending metrics to PostgreSQL.
# options='-c role=so_telegraf' makes every connection SET ROLE to the shared
# group role so tables created on first write are owned by so_telegraf, and
# all per-minion members can INSERT/SELECT them via role inheritance.
# fields_as_jsonb/tags_as_jsonb keep metric tables at a fixed column count so
# high-cardinality inputs (docker, procstat, kafka) don't blow past the
# Postgres 1600-column-per-table limit.
[[outputs.postgresql]]
connection = "host={{ PG_HOST }} port=5432 user={{ PG_USER }} password={{ PG_PASS }} dbname=so_telegraf sslmode=verify-full sslrootcert=/etc/telegraf/ca.crt options='-c role=so_telegraf'"
schema = "telegraf"
tags_as_foreign_keys = true
tags_as_jsonb = true
fields_as_jsonb = true
# Every metric table is a daily time-range partitioned parent managed by
# pg_partman. Retention drops old partitions instead of row-by-row DELETEs.
{% raw %}
# pg_partman 5.x requires the control column (time) to be NOT NULL, so
# ALTER it before create_parent(). And create_parent() splits
# p_parent_table on '.' to look up raw identifiers, so the literal must
# be 'schema.name' (not '"schema"."name"' as .table|quoteLiteral emits).
# IF NOT EXISTS keeps the three templates idempotent so a Telegraf
# restart after any DB-side surgery re-runs them safely.
create_templates = [
    '''CREATE TABLE IF NOT EXISTS {{ .table }} ({{ .columns }}) PARTITION BY RANGE ("time")''',
    '''ALTER TABLE {{ .table }} ALTER COLUMN "time" SET NOT NULL''',
    '''SELECT partman.create_parent(p_parent_table := {{ printf "%s.%s" .table.Schema .table.Name | quoteLiteral }}, p_control := 'time', p_type := 'range', p_interval := '1 day', p_premake := 3) WHERE NOT EXISTS (SELECT 1 FROM partman.part_config WHERE parent_table = {{ printf "%s.%s" .table.Schema .table.Name | quoteLiteral }})'''
]
tag_table_create_templates = [
    '''CREATE TABLE IF NOT EXISTS {{ .table }} ({{ .columns }}, PRIMARY KEY (tag_id))'''
]
{% endraw %}
{%- endif %}
|
###############################################################################
#                             PROCESSOR PLUGINS                               #
###############################################################################

###############################################################################
#                             AGGREGATOR PLUGINS                              #
###############################################################################

###############################################################################
#                               INPUT PLUGINS                                 #
###############################################################################
|
# Read metrics about cpu usage
[[inputs.cpu]]
## Whether to report per-cpu stats or not
percpu = true
## Whether to report total system cpu stats or not
totalcpu = true
## If true, collect raw CPU time metrics.
collect_cpu_time = false
## If true, compute and report the sum of all non-idle CPU states.
report_active = false


# Read metrics about disk usage by mount point
[[inputs.disk]]
## By default stats will be gathered for all mount points.
## Set mount_points will restrict the stats to only the specified mount points.
mount_points = ["/host",
    "/host/nsm",
    "/host/var",
    "/host/var/log",
    "/host/var/log/audit",
    "/host/var/tmp"
]

## Ignore mount points by filesystem type.
#ignore_fs = ["tmpfs", "devtmpfs", "devfs", "overlay", "aufs", "squashfs"]


# Read metrics about disk IO by device
[[inputs.diskio]]
## By default, telegraf will gather stats for all devices including
## disk partitions.
## Setting devices will restrict the stats to the specified devices.
# devices = ["sda", "sdb", "vd*"]
## Uncomment the following line if you need disk serial numbers.
# skip_serial_number = false
#
## On systems which support it, device metadata can be added in the form of
## tags.
## Currently only Linux is supported via udev properties. You can view
## available properties for a device by running:
## 'udevadm info -q property -n /dev/sda'
# device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"]
#
## Using the same metadata source as device_tags, you can also customize the
## name of the device via templates.
## The 'name_templates' parameter is a list of templates to try and apply to
## the device. The template may contain variables in the form of '$PROPERTY' or
## '${PROPERTY}'. The first template which does not contain any variables not
## present for the device is used as the device name tag.
## The typical use case is for LVM volumes, to get the VG/LV name instead of
## the near-meaningless DM-0 name.
# name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"]


# Get kernel statistics from /proc/stat
[[inputs.kernel]]
# no configuration

# Read metrics about memory usage
[[inputs.mem]]
# no configuration

# Get the number of processes and group them by status
[[inputs.processes]]
# no configuration

# Read metrics about swap memory usage
[[inputs.swap]]
# no configuration

# Read metrics about system load & uptime
[[inputs.system]]
# no configuration
|
# # Collect bond interface status, slaves statuses and failures count
[[inputs.bond]]
# ## Sets 'proc' directory path
# ## If not specified, then default is /proc
# # host_proc = "/proc"
#
# ## By default, telegraf gather stats for all bond interfaces
# ## Setting interfaces will restrict the stats to the specified
# ## bond interfaces.
# # bond_interfaces = ["bond0"]

# # Read metrics about docker containers
[[inputs.docker]]
# ## Docker Endpoint
# ##   To use TCP, set endpoint = "tcp://[ip]:[port]"
# ##   To use environment variables (ie, docker-machine), set endpoint = "ENV"
endpoint = "unix:///var/run/docker.sock"
#
|
# # Read stats from one or more Elasticsearch servers or clusters
{%- if GLOBALS.is_manager or GLOBALS.role == 'so-heavynode' %}
[[inputs.elasticsearch]]
servers = ["https://{{ NODEIP }}:9200"]
cluster_stats = true
username = "{{ ES_USER }}"
password = "{{ ES_PASS }}"
insecure_skip_verify = true
{%- elif grains['role'] in ['so-searchnode'] %}
[[inputs.elasticsearch]]
servers = ["https://{{ NODEIP }}:9200"]
cluster_stats = false
username = "{{ ES_USER }}"
password = "{{ ES_PASS }}"
insecure_skip_verify = true
{%- endif %}

# ## Timeout for HTTP requests to the elastic search server(s)
# http_timeout = "5s"
#
# ## When local is true (the default), the node will read only its own stats.
# ## Set local to false when you want to read the node stats from all nodes
# ## of the cluster.
# local = true
#
# ## Set cluster_health to true when you want to also obtain cluster health stats
# cluster_health = false
#
# ## Adjust cluster_health_level when you want to also obtain detailed health stats
# ## The options are
# ##   - indices (default)
# ##   - cluster
# # cluster_health_level = "indices"
#
# ## Set cluster_stats to true when you want to also obtain cluster stats from the
# ## Master node.
# cluster_stats = false
#
# ## node_stats is a list of sub-stats that you want to have gathered. Valid options
# ## are "indices", "os", "process", "jvm", "thread_pool", "fs", "transport", "http",
# ## "breaker". Per default, all stats are gathered.
# # node_stats = ["jvm", "http"]
#
# ## Optional TLS Config
# # tls_ca = "/etc/telegraf/ca.pem"
# # tls_cert = "/etc/telegraf/cert.pem"
# # tls_key = "/etc/telegraf/key.pem"
# ## Use TLS but skip chain & host verification
# # insecure_skip_verify = false
|
{#- Fleet nodes do not have pillar access to logstash credentials #}
{%- if LOGSTASH_ENABLED and grains.role != 'so-fleet' %}
[[inputs.logstash]]
url = "http://localhost:9600"
collect = ["pipelines"]
username = "{{ salt['pillar.get']('elasticsearch:auth:users:so_logstash_user:user') }}"
password = "{{ salt['pillar.get']('elasticsearch:auth:users:so_logstash_user:pass') }}"
{%- endif %}
|
{% if grains.role in ['so-manager','so-managersearch','so-standalone','so-receiver'] and GLOBALS.pipeline == "KAFKA" -%}
[[inputs.jolokia2_agent]]
name_prefix = "kafka_"
urls = ["http://{{ NODEIP }}:8778/jolokia"]

[[inputs.jolokia2_agent.metric]]
name = "topics"
mbean = "kafka.server:name=*,type=BrokerTopicMetrics"
field_prefix = "$1."

[[inputs.jolokia2_agent.metric]]
name = "topic"
mbean = "kafka.server:name=*,topic=*,type=BrokerTopicMetrics"
field_prefix = "$1."
tag_keys = ["topic"]

[[inputs.jolokia2_agent.metric]]
name = "controller"
mbean = "kafka.controller:name=*,type=*"
field_prefix = "$1."

[[inputs.jolokia2_agent.metric]]
name = "partition"
mbean = "kafka.log:name=*,partition=*,topic=*,type=Log"
field_name = "$1"
tag_keys = ["topic", "partition"]

[[inputs.jolokia2_agent.metric]]
name = "partition"
mbean = "kafka.cluster:name=UnderReplicated,partition=*,topic=*,type=Partition"
field_name = "UnderReplicatedPartitions"
tag_keys = ["topic", "partition"]

{%- endif %}
|
# # Read metrics from one or more commands that can output to stdout
{%- if 'sostatus.sh' in TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]] %}
{%- do TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]].remove('sostatus.sh') %}
[[inputs.exec]]
commands = [
    "/scripts/sostatus.sh"
]
data_format = "influx"
timeout = "15s"
interval = "60s"
{%- endif %}

{%- if TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]] | length > 0 %}
[[inputs.exec]]
commands = [
{%- for script in TELEGRAFMERGED.scripts[GLOBALS.role.split('-')[1]] %}
    "/scripts/{{script}}"{% if not loop.last %},{% endif %}
{%- endfor %}
]
data_format = "influx"
## Timeout for each command to complete.
timeout = "15s"
{%- endif %}
|
{%- if salt['pillar.get']('healthcheck:enabled', False) %}
[[inputs.file]]
files = ["/host/nsm/zeek/logs/zeek_restart.log"]
data_format = "influx"
{%- endif %}

[[inputs.file]]
files = ["/etc/telegraf/node_config.json"]
name_override = "node_config"
data_format = "json"
interval = "5m"
json_string_fields = ['manint', 'monint']
tag_keys = ['role']
|
# # Read metrics about network interface usage
[[inputs.net]]

# Scripts run every 30s||TELEGRAFMERGED.config.interval - ES index script doesn't need to run as frequently
{%- if GLOBALS.is_manager or GLOBALS.role == 'so-heavynode' %}
[[inputs.exec]]
commands = [
    "/scripts/esindexsize.sh"
]
data_format = "influx"
interval = "1h"
timeout = "120s"
{%- endif %}