diff --git a/salt/influxdb/templates/alarm_high_redis_memory_usage.json b/salt/influxdb/templates/alarm_high_redis_memory_usage.json
new file mode 100644
index 000000000..fe99ad430
--- /dev/null
+++ b/salt/influxdb/templates/alarm_high_redis_memory_usage.json
@@ -0,0 +1,28 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "high-redis-memory"
+  },
+  "spec": {
+    "description": "Triggers when the average percent of used memory for Redis reaches a defined threshold. To tune this alert, modify the value for the appropriate alert level.",
+    "every": "1m",
+    "name": "High Redis Memory Usage",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"redisqueue\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"mem_used\")\n |\u003e aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |\u003e yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "The amount of available memory for Redis on the ${r.host} node has reached the ${r._level} threshold. The current percent of used memory is ${r.mem_used}.",
+    "thresholds": [
+      {
+        "level": "WARN",
+        "type": "greater",
+        "value": 80
+      },
+      {
+        "level": "CRIT",
+        "type": "greater",
+        "value": 90
+      }
+    ]
+  }
+}]
+
diff --git a/salt/influxdb/templates/alarm_low_monitor_traffic.json b/salt/influxdb/templates/alarm_low_monitor_traffic.json
new file mode 100644
index 000000000..167ae1b5a
--- /dev/null
+++ b/salt/influxdb/templates/alarm_low_monitor_traffic.json
@@ -0,0 +1,22 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "monitor-interface-traffic"
+  },
+  "spec": {
+    "description": "Triggers when the volume of network traffic (in MBs) received on the monitor interface, per sensor, falls below a defined threshold. To tune this alert, modify the value in MBs for the appropriate alert level.",
+    "every": "1m",
+    "name": "Low Traffic Volume on Monitor Interface",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"net\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"bytes_recv\")\n |\u003e filter(fn: (r) =\u003e r[\"interface\"] == \"bond0\")\n |\u003e derivative(unit: 1s, nonNegative: true)\n |\u003e map(fn: (r) =\u003e ({r with \"_value\": r._value * 8.0 / 1000000.0}))\n |\u003e yield(name: \"nonnegative derivative\")",
+    "status": "active",
+    "statusMessageTemplate": "Interface ${r.interface} on node ${r.host} has reached the ${r._level} threshold. The current volume of traffic on interface ${r.interface} is ${r.bytes_recv}MB/s.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "lesser",
+        "value": 5
+      }
+    ]
+  }
+}]
diff --git a/salt/influxdb/templates/alarm_pcap_retention.json b/salt/influxdb/templates/alarm_pcap_retention.json
new file mode 100644
index 000000000..969d462c9
--- /dev/null
+++ b/salt/influxdb/templates/alarm_pcap_retention.json
@@ -0,0 +1,27 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "alarm-pcap-retention"
+  },
+  "spec": {
+    "description": "Triggers when the PCAP retention (in days) falls below the defined threshold. To tune this alert, modify the value for the appropriate alert level.",
+    "every": "1m0s",
+    "name": "Low PCAP Retention",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"pcapage\")\n |> filter(fn: (r) => r[\"_field\"] == \"seconds\")\n |> map(fn: (r) => ({ r with _value: r._value / (24.0 * 3600.0)})) |\u003e map(fn: (r) =\u003e ({r with _value: int(v: r._value)}))\n |> aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |> yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "PCAP retention on node ${r.host} has reached the ${r._level} threshold. Node ${r.host} currently has approximately ${r.seconds} days of PCAP data.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "lesser",
+        "value": 1
+      },
+      {
+        "level": "WARN",
+        "type": "lesser",
+        "value": 3
+      }
+    ]
+  }
+}]
diff --git a/salt/influxdb/templates/alarm_steno_packet_loss.json b/salt/influxdb/templates/alarm_steno_packet_loss.json
new file mode 100644
index 000000000..c5cfb4297
--- /dev/null
+++ b/salt/influxdb/templates/alarm_steno_packet_loss.json
@@ -0,0 +1,27 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "steno-packet-loss"
+  },
+  "spec": {
+    "description": "Triggers when the average percent of packet loss is above the defined threshold. To tune this alert, modify the value for the appropriate alert level.",
+    "every": "1m",
+    "name": "Stenographer Packet Loss",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"stenodrop\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"drop\")\n |\u003e aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |\u003e yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "Stenographer Packet Loss on node ${r.host} has reached the ${ r._level } threshold. The current packet loss is ${ r.drop }%.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "greater",
+        "value": 5
+      },
+      {
+        "level": "WARN",
+        "type": "greater",
+        "value": 3
+      }
+    ]
+  }
+}]
diff --git a/salt/influxdb/templates/alarm_suricata_packet_loss.json b/salt/influxdb/templates/alarm_suricata_packet_loss.json
new file mode 100644
index 000000000..8a4c3f5cf
--- /dev/null
+++ b/salt/influxdb/templates/alarm_suricata_packet_loss.json
@@ -0,0 +1,27 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "suricata-packet-loss"
+  },
+  "spec": {
+    "description": "Triggers when the average percent of packet loss is above the defined threshold. To tune this alert, modify the value for the appropriate alert level.",
+    "every": "1m",
+    "name": "Suricata Packet Loss",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"suridrop\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"drop\")\n |\u003e map(fn: (r) =\u003e ({r with \"_value\": r._value * 100.0}))\n |\u003e map(fn: (r) =\u003e ({ r with _value: int(v: r._value) }))\n |\u003e aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |\u003e yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "Suricata packet loss on node ${r.host} has reached the ${ r._level } threshold. The current packet loss is ${ r.drop }%.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "greater",
+        "value": 5
+      },
+      {
+        "level": "WARN",
+        "type": "greater",
+        "value": 3
+      }
+    ]
+  }
+}]
diff --git a/salt/influxdb/templates/alarm_zeek_packet_loss.json b/salt/influxdb/templates/alarm_zeek_packet_loss.json
new file mode 100644
index 000000000..cebd1dc50
--- /dev/null
+++ b/salt/influxdb/templates/alarm_zeek_packet_loss.json
@@ -0,0 +1,27 @@
+[{
+  "apiVersion": "influxdata.com/v2alpha1",
+  "kind": "CheckThreshold",
+  "metadata": {
+    "name": "zeek-packet-loss"
+  },
+  "spec": {
+    "description": "Triggers when the average percent of packet loss is above the defined threshold. To tune this alert, modify the value for the appropriate alert level.",
+    "every": "1m",
+    "name": "Zeek Packet Loss",
+    "query": "from(bucket: \"telegraf/so_short_term\")\n |\u003e range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |\u003e filter(fn: (r) =\u003e r[\"_measurement\"] == \"zeekdrop\")\n |\u003e filter(fn: (r) =\u003e r[\"_field\"] == \"drop\")\n |\u003e map(fn: (r) =\u003e ({r with \"_value\": r._value * 100.0}))\n |\u003e map(fn: (r) =\u003e ({ r with _value: int(v: r._value) }))\n |\u003e aggregateWindow(every: 1m, fn: mean, createEmpty: false)\n |\u003e yield(name: \"mean\")",
+    "status": "active",
+    "statusMessageTemplate": "Zeek Packet Loss on node ${r.host} has reached the ${ r._level } threshold. The current packet loss is ${ r.drop }%.",
+    "thresholds": [
+      {
+        "level": "CRIT",
+        "type": "greater",
+        "value": 5
+      },
+      {
+        "level": "WARN",
+        "type": "greater",
+        "value": 3
+      }
+    ]
+  }
+}]
diff --git a/salt/telegraf/scripts/redis.sh b/salt/telegraf/scripts/redis.sh
index c730885d4..dba893c87 100644
--- a/salt/telegraf/scripts/redis.sh
+++ b/salt/telegraf/scripts/redis.sh
@@ -11,8 +11,9 @@
 if [[ ! "`pidof -x $(basename $0) -o %PPID`" ]]; then
   UNPARSED=$(redis-cli llen logstash:unparsed | awk '{print $1}')
   PARSED=$(redis-cli llen logstash:parsed | awk '{print $1}')
-
-  echo "redisqueue unparsed=$UNPARSED,parsed=$PARSED"
+  MEM_USED=$(redis-cli info memory | grep used_memory_peak_perc | cut -d ":" -f2 | sed "s/%//")
+
+  echo "redisqueue unparsed=$UNPARSED,parsed=$PARSED,mem_used=$MEM_USED"
 fi
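
Note (not part of the patch): the new High Redis Memory Usage check queries the mem_used field of the redisqueue measurement, which is exactly what the redis.sh change above starts emitting. As a minimal sanity-check sketch, assuming redis-cli is available on the node and that Telegraf continues to collect this script's line-protocol output as it does today, the new field can be exercised by hand; the numeric values shown are illustrative only:

    # Reproduce the mem_used extraction added in redis.sh; it should print a bare percentage.
    redis-cli info memory | grep used_memory_peak_perc | cut -d ":" -f2 | sed "s/%//"
    # Illustrative output: 84.59
    #
    # The script then emits line protocol of the form (values illustrative):
    #   redisqueue unparsed=0,parsed=0,mem_used=84.59
    # which matches the check's filters on _measurement == "redisqueue" and _field == "mem_used".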