diff --git a/salt/influxdb/templates/alarm_high_redis_memory_usage.json b/salt/influxdb/templates/alarm_high_redis_memory_usage.json
new file mode 100644
index 000000000..98f4d206c
--- /dev/null
+++ b/salt/influxdb/templates/alarm_high_redis_memory_usage.json
@@ -0,0 +1,27 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "high-redis-memory"
+  },
+  "spec": {
+    "every": "1m",
+    "name": "High Redis Memory Usage",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"redisqueue\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"mem_used\")\n |\u003e aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |\u003e yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "Redis memory usage on the ${r.host} node has reached the ${r._level} threshold. The current percentage of used memory is ${r.mem_used}.",
+    "thresholds": [
+      {
+        "level": "WARN",
+        "type": "greater",
+        "value": 80
+      },
+      {
+        "level": "CRIT",
+        "type": "greater",
+        "value": 90
+      }
+    ]
+  }
+}]
+
diff --git a/salt/influxdb/templates/alarm_low_monitor_traffic.json b/salt/influxdb/templates/alarm_low_monitor_traffic.json
new file mode 100644
index 000000000..910b13803
--- /dev/null
+++ b/salt/influxdb/templates/alarm_low_monitor_traffic.json
@@ -0,0 +1,21 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "monitor-interface-traffic"
+  },
+  "spec": {
+    "every": "1m",
+    "name": "Low Traffic Volume on Monitor Interface",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"net\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"bytes_recv\")\n |\u003e filter(fn: (r) =\u003e r[\"interface\"] == \"bond0\")\n |\u003e derivative(unit: 1s, nonNegative: true)\n |\u003e map(fn: (r) =\u003e ({r with \"_value\": r._value * 8.0 / 1000000.0}))\n |\u003e yield(name: \"nonnegative derivative\")",
+    "status": "active",
+    "statusMessageTemplate": "Traffic on interface ${r.interface} of node ${r.host} has reached the ${r._level} threshold. The current traffic volume on interface ${r.interface} is ${r.bytes_recv} Mbps.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "lesser",
+        "value": 5
+      }
+    ]
+  }
+}]
diff --git a/salt/influxdb/templates/alarm_pcap_retention.json b/salt/influxdb/templates/alarm_pcap_retention.json
new file mode 100644
index 000000000..0964906c7
--- /dev/null
+++ b/salt/influxdb/templates/alarm_pcap_retention.json
@@ -0,0 +1,27 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "alarm-pcap-retention"
+  },
+  "spec": {
+    "description": "PCAP retention on at least one node has dropped below the alarm threshold.",
+    "every": "1m",
+    "name": "Low PCAP Retention",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"pcapage\")\n |> filter(fn: (r) => r[\"_field\"] == \"seconds\")\n |> map(fn: (r) => ({ r with _value: r._value / (24.0 * 3600.0)}))\n |> map(fn: (r) => ({r with _value: int(v: r._value)}))\n |> aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |> yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "PCAP retention on node ${r.host} has reached the ${r._level} threshold. Node ${r.host} currently has approximately ${r.seconds} days of PCAP data.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "lesser",
+        "value": 1
+      },
+      {
+        "level": "WARN",
+        "type": "lesser",
+        "value": 3
+      }
+    ]
+  }
+}]
\ No newline at end of file
diff --git a/salt/influxdb/templates/alarm_steno_packet_loss.json b/salt/influxdb/templates/alarm_steno_packet_loss.json
new file mode 100644
index 000000000..967b7ff92
--- /dev/null
+++ b/salt/influxdb/templates/alarm_steno_packet_loss.json
@@ -0,0 +1,26 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "steno-packet-loss"
+  },
+  "spec": {
+    "every": "1m",
+    "name": "Stenographer Packet Loss",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"stenodrop\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"drop\")\n |\u003e aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |\u003e yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "Stenographer packet loss on node ${r.host} has reached the ${r._level} threshold. The current packet loss is ${r.drop}%.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "greater",
+        "value": 5
+      },
+      {
+        "level": "WARN",
+        "type": "greater",
+        "value": 3
+      }
+    ]
+  }
+}]
diff --git a/salt/influxdb/templates/alarm_suricata_packet_loss.json b/salt/influxdb/templates/alarm_suricata_packet_loss.json
new file mode 100644
index 000000000..48bda0ff3
--- /dev/null
+++ b/salt/influxdb/templates/alarm_suricata_packet_loss.json
@@ -0,0 +1,26 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "suricata-packet-loss"
+  },
+  "spec": {
+    "every": "1m",
+    "name": "Suricata Packet Loss",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"suridrop\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"drop\")\n |\u003e map(fn: (r) =\u003e ({r with \"_value\": r._value * 100.0}))\n |\u003e map(fn: (r) =\u003e ({ r with _value: int(v: r._value) }))\n |\u003e aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |\u003e yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "Suricata packet loss on node ${r.host} has reached the ${r._level} threshold. The current packet loss is ${r.drop}%.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "greater",
+        "value": 5
+      },
+      {
+        "level": "WARN",
+        "type": "greater",
+        "value": 3
+      }
+    ]
+  }
+}]
diff --git a/salt/influxdb/templates/alarm_zeek_packet_loss.json b/salt/influxdb/templates/alarm_zeek_packet_loss.json
new file mode 100644
index 000000000..33e19ea5b
--- /dev/null
+++ b/salt/influxdb/templates/alarm_zeek_packet_loss.json
@@ -0,0 +1,26 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "zeek-packet-loss"
+  },
+  "spec": {
+    "every": "1m",
+    "name": "Zeek Packet Loss",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"zeekdrop\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"drop\")\n |\u003e map(fn: (r) =\u003e ({r with \"_value\": r._value * 100.0}))\n |\u003e map(fn: (r) =\u003e ({ r with _value: int(v: r._value) }))\n |\u003e aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |\u003e yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "Zeek packet loss on node ${r.host} has reached the ${r._level} threshold. The current packet loss is ${r.drop}%.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "greater",
+        "value": 5
+      },
+      {
+        "level": "WARN",
+        "type": "greater",
+        "value": 3
+      }
+    ]
+  }
+}]
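
Note: the "query" strings above store Flux with JSON-escaped operators (\u003e is >), which is hard to scan in a diff. Decoded, the alarm_low_monitor_traffic.json query reads as follows; the final map converts the derived byte rate to megabits per second, which is why its status message reports the value in Mbps. The comments are annotation only and are not part of the template string.

from(bucket: "telegraf/so_short_term")
  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)
  |> filter(fn: (r) => r["_measurement"] == "net")        // Telegraf host network counters
  |> filter(fn: (r) => r["_field"] == "bytes_recv")
  |> filter(fn: (r) => r["interface"] == "bond0")         // the monitor (sniffing) interface
  |> derivative(unit: 1s, nonNegative: true)              // cumulative counter -> bytes per second
  |> map(fn: (r) => ({r with "_value": r._value * 8.0 / 1000000.0}))  // bytes/s -> megabits/s
  |> yield(name: "nonnegative derivative")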