Merge pull request #10024 from Security-Onion-Solutions/esspace

Manage disk-based index deletion via so-curator-cluster-delete
This commit is contained in:
weslambert
2023-03-28 12:25:19 -04:00
committed by GitHub
12 changed files with 205 additions and 161 deletions

@@ -0,0 +1,57 @@
#!/bin/bash
#
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
. /usr/sbin/so-common
{% from 'vars/globals.map.jinja' import GLOBALS %}
TOTAL_AVAILABLE_SPACE=0
# Wait for Elasticsearch to initialize
COUNT=0
ELASTICSEARCH_CONNECTED="no"
while [[ "$COUNT" -le 240 ]]; do
/usr/sbin/so-elasticsearch-query / -k --output /dev/null --silent --head --fail
if [ $? -eq 0 ]; then
ELASTICSEARCH_CONNECTED="yes"
break
else
((COUNT+=1))
sleep 1
fi
done
if [ "$ELASTICSEARCH_CONNECTED" == "no" ]; then
echo
echo -e "Connection attempt timed out. Unable to connect to ElasticSearch. \nPlease try: \n -checking log(s) in /var/log/elasticsearch/\n -running 'sudo docker ps' \n -running 'sudo so-elastic-restart'"
echo
exit 1
fi
# Use the percentage passed as the first argument; otherwise default to 80 percent
if [[ "$1" != "" ]]; then
PERCENTAGE=$1
else
PERCENTAGE=80
fi
# Iterate through the output of _cat/allocation for each node in the cluster to determine the total disk space
{% if GLOBALS.role == 'so-manager' %}
for i in $(/usr/sbin/so-elasticsearch-query _cat/allocation | grep -v {{ GLOBALS.manager }} | awk '{print $5}'); do
{% else %}
for i in $(/usr/sbin/so-elasticsearch-query _cat/allocation | awk '{print $5}'); do
{% endif %}
size=$(echo $i | grep -oE '[0-9].*' | awk '{print int($1+0.5)}')
unit=$(echo $i | grep -oE '[A-Za-z]+')
if [ "$unit" = "tb" ]; then
size=$(( size * 1024 ))
fi
TOTAL_AVAILABLE_SPACE=$(( TOTAL_AVAILABLE_SPACE + size ))
done
# Calculate the percentage of available space based on our previously defined value
PERCENTAGE_AVAILABLE_SPACE=$(( TOTAL_AVAILABLE_SPACE*PERCENTAGE/100 ))
echo "$PERCENTAGE_AVAILABLE_SPACE"

@@ -0,0 +1,28 @@
#!/bin/bash
#
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
. /usr/sbin/so-common
{% from 'vars/globals.map.jinja' import GLOBALS %}
TOTAL_USED_SPACE=0
# Iterate through the output of _cat/allocation for each node in the cluster to determine the total used space
{% if GLOBALS.role == 'so-manager' %}
for i in $(/usr/sbin/so-elasticsearch-query _cat/allocation | grep -v {{ GLOBALS.manager }} | awk '{print $3}'); do
{% else %}
for i in $(/usr/sbin/so-elasticsearch-query _cat/allocation | awk '{print $3}'); do
{% endif %}
size=$(echo $i | grep -oE '[0-9].*' | awk '{print int($1+0.5)}')
unit=$(echo $i | grep -oE '[A-Za-z]+')
if [ "$unit" = "tb" ]; then
size=$(( size * 1024 ))
fi
TOTAL_USED_SPACE=$(( TOTAL_USED_SPACE + size ))
done
# Output the total used space in gigabytes
echo "$TOTAL_USED_SPACE"

@@ -3,6 +3,11 @@
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% import_yaml 'elasticsearch/defaults.yaml' as ELASTICDEFAULTS %}
{% set ELASTICMERGED = salt['pillar.get']('elasticsearch:retention', ELASTICDEFAULTS.elasticsearch.retention, merge=true) %}
{{ ELASTICMERGED.retention_pct }}
{%- set log_size_limit = salt['pillar.get']('elasticsearch:log_size_limit') %}
actions:
  1:

@@ -1,36 +0,0 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
#. /usr/sbin/so-elastic-common
#. /etc/nsm/securityonion.conf
# If logrotate script doesn't already exist, create it
#FILE="/etc/logrotate.d/so-curator-closed-delete"
#if ! [ -f ${FILE} ]; then
# cat << EOF > ${FILE}
#/var/log/nsm/so-curator-closed-delete.log {
# daily
# rotate 7
# copytruncate
# compress
# missingok
# notifempty
#}
#EOF
#fi
# Avoid starting multiple instances
APP=closeddelete
lf=/tmp/$APP-pidLockFile
# create empty lock file if none exists
cat /dev/null >> $lf
read lastPID < $lf
# if lastPID is not null and a process with that pid exists , exit
[ ! -z "$lastPID" -a -d /proc/$lastPID ] && exit
echo $$ > $lf
/usr/sbin/so-curator-closed-delete-delete

@@ -1,61 +0,0 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'vars/globals.map.jinja' import GLOBALS %}
{%- if grains['role'] in ['so-searchnode', 'so-heavynode'] %}
{%- set ELASTICSEARCH_HOST = GLOBALS.node_ip -%}
{%- set ELASTICSEARCH_PORT = salt['pillar.get']('elasticsearch:es_port') -%}
{%- elif grains['role'] in ['so-eval', 'so-managersearch', 'so-standalone', 'so-manager'] %}
{%- set ELASTICSEARCH_HOST = GLOBALS.manager_ip -%}
{%- set ELASTICSEARCH_PORT = salt['pillar.get']('manager:es_port') -%}
{%- endif -%}
{%- set LOG_SIZE_LIMIT = salt['pillar.get']('elasticsearch:log_size_limit') -%}
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
LOG="/opt/so/log/curator/so-curator-closed-delete.log"
overlimit() {
[[ $(du -hs --block-size=1GB /nsm/elasticsearch/nodes | awk '{print $1}' ) -gt "{{LOG_SIZE_LIMIT}}" ]]
}
closedindices() {
# If we can't query Elasticsearch, then immediately return false.
curl -K /opt/so/conf/elasticsearch/curl.config -s -k https://{{ELASTICSEARCH_HOST}}:{{ELASTICSEARCH_PORT}}/_cat/indices?h=index\&expand_wildcards=closed >/dev/null 2>&1
[ $? -eq 1 ] && return false
# First, get the list of closed indices using _cat/indices?h=index\&expand_wildcards=closed.
# Next, filter out any so-case indices.
# Finally, use grep's -q option to return true if there are any remaining logstash- or so- indices.
curl -K /opt/so/conf/elasticsearch/curl.config -s -k https://{{ELASTICSEARCH_HOST}}:{{ELASTICSEARCH_PORT}}/_cat/indices?h=index\&expand_wildcards=closed | grep -v "so-case" | grep -q -E "(logstash-|so-)"
}
# Check for 2 conditions:
# 1. Are Elasticsearch indices using more disk space than LOG_SIZE_LIMIT?
# 2. Are there any closed indices that we can delete?
# If both conditions are true, keep on looping until one of the conditions is false.
while overlimit && closedindices; do
# We need to determine OLDEST_INDEX:
# First, get the list of closed indices using _cat/indices?h=index\&expand_wildcards=closed.
# Next, filter out any so-case indices and only select the remaining logstash- or so- indices.
# Then, sort by date by telling sort to use hyphen as delimiter and sort on the third field.
# Finally, select the first entry in that sorted list.
OLDEST_INDEX=$(curl -K /opt/so/conf/elasticsearch/curl.config -s -k https://{{ELASTICSEARCH_HOST}}:{{ELASTICSEARCH_PORT}}/_cat/indices?h=index\&expand_wildcards=closed | grep -v "so-case" | grep -E "(logstash-|so-)" | sort -t- -k3 | head -1)
# Now that we've determined OLDEST_INDEX, ask Elasticsearch to delete it.
curl -K /opt/so/conf/elasticsearch/curl.config -XDELETE -k https://{{ELASTICSEARCH_HOST}}:{{ELASTICSEARCH_PORT}}/${OLDEST_INDEX}
# Finally, write a log entry that says we deleted it.
echo "$(date) - Used disk space exceeds LOG_SIZE_LIMIT ({{LOG_SIZE_LIMIT}} GB) - Index ${OLDEST_INDEX} deleted ..." >> ${LOG}
done

salt/curator/files/bin/so-curator-cluster-close Normal file → Executable file (0 changed lines)

salt/curator/files/bin/so-curator-cluster-delete Normal file → Executable file (19 changed lines)

@@ -4,7 +4,8 @@
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
APP=delete
# Avoid starting multiple instances
APP=clusterdelete
lf=/tmp/$APP-pidLockFile
# create empty lock file if none exists
cat /dev/null >> $lf
@@ -13,18 +14,4 @@ read lastPID < $lf
[ ! -z "$lastPID" -a -d /proc/$lastPID ] && exit
echo $$ > $lf
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-zeek-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-beats-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-firewall-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-ids-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-import-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-kratos-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-osquery-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-ossec-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-strelka-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/so-syslog-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/logs-import-so-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/logs-strelka-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/logs-suricata-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/logs-syslog-delete.yml > /dev/null 2>&1;
docker exec so-curator curator --config /etc/curator/config/curator.yml /etc/curator/action/logs-zeek-delete.yml > /dev/null 2>&1;
/usr/sbin/so-curator-cluster-delete-delete

@@ -0,0 +1,85 @@
#!/bin/bash
# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one
# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at
# https://securityonion.net/license; you may not use this file except in compliance with the
# Elastic License 2.0.
{% from 'vars/globals.map.jinja' import GLOBALS %}
{% import_yaml 'elasticsearch/defaults.yaml' as ELASTICDEFAULTS %}
{%- set ELASTICSEARCH_HOST = GLOBALS.node_ip -%}
{%- set RETENTION = salt['pillar.get']('elasticsearch:retention', ELASTICDEFAULTS.elasticsearch.retention, merge=true) -%}
LOG="/opt/so/log/curator/so-curator-cluster-delete.log"
LOG_SIZE_LIMIT=$(/usr/sbin/so-elasticsearch-cluster-space-total {{ RETENTION.retention_pct}})
overlimit() {
[[ $(/usr/sbin/so-elasticsearch-cluster-space-used) -gt "${LOG_SIZE_LIMIT}" ]]
}
closedindices() {
# If we can't query Elasticsearch, then immediately return false.
/usr/sbin/so-elasticsearch-query _cat/indices?h=index,status | grep close > /dev/null 2>&1
[ $? -eq 1 ] && return 1
# First, get the list of closed indices using _cat/indices?h=index,status | grep close | awk '{print $1}'.
# Next, filter out any so-case indices.
# Finally, use grep's -q option to return true if there are any remaining logstash-, so-, or .ds-logs- indices.
/usr/sbin/so-elasticsearch-query _cat/indices?h=index,status | grep close | awk '{print $1}' | grep -v "so-case" | grep -q -E "(logstash-|so-|.ds-logs-)"
}
# Check for 2 conditions:
# 1. Are Elasticsearch indices using more disk space than LOG_SIZE_LIMIT?
# 2. Are there any closed indices that we can delete?
# If both conditions are true, keep on looping until one of the conditions is false.
while overlimit && closedindices; do
CLOSED_INDICES=$(/usr/sbin/so-elasticsearch-query _cat/indices?h=index,status | grep close | awk '{print $1}' | grep -v "so-case" | grep -E "(logstash-|so-|.ds-logs-)" | sort -t- -k3)
# We iterate through the closed indices
for CLOSED_INDEX in ${CLOSED_INDICES}; do
# Now that we've sorted the indices from oldest to newest, we need to check each index to see if it is assigned as the current write index for a data stream
# To do so, we need to identify to which data stream this index is associated
# We extract the data stream name using the pattern below
DATASTREAM_PATTERN="logs-[a-zA-Z_.]+-[a-zA-Z_.]+"
DATASTREAM=$(echo "${CLOSED_INDEX}" | grep -oE "$DATASTREAM_PATTERN")
# We look up the data stream, and determine the write index
CURRENT_WRITE_INDEX=$(/usr/sbin/so-elasticsearch-query _data_stream/$DATASTREAM | jq -r .data_streams[0].indices[-1].index_name)
# We make sure we are not trying to delete a write index
if [ "${CLOSED_INDEX}" != "${CURRENT_WRITE_INDEX}" ]; then
# This should not be a write index, so we should be allowed to delete it
/usr/sbin/so-elasticsearch-query ${CLOSED_INDEX} -XDELETE
# Finally, write a log entry that says we deleted it.
echo "$(date) - Used disk space exceeds LOG_SIZE_LIMIT (${LOG_SIZE_LIMIT} GB) - Index ${CLOSED_INDEX} deleted ..." >> ${LOG}
fi
if ! overlimit; then
exit
fi
done
done
while overlimit; do
# We need to determine the oldest open index.
# First, get the list of open indices using _cat/indices?h=index,status | grep open | awk '{print $1}'.
# Next, filter out any so-case indices and only select the remaining logstash-, so-, or .ds-logs- indices.
# Then, sort by date by telling sort to use hyphen as delimiter and sort on the third field.
OPEN_INDICES=$(/usr/sbin/so-elasticsearch-query _cat/indices?h=index,status | grep open | awk '{print $1}' | grep -v "so-case" | grep -E "(logstash-|so-|.ds-logs-)" | sort -t- -k3)
# We iterate through the open indices
for OPEN_INDEX in ${OPEN_INDICES}; do
# Now that we've sorted the indices from oldest to newest, we need to check each index to see if it is assigned as the current write index for a data stream
# To do so, we need to identify to which data stream this index is associated
# We extract the data stream name using the pattern below
DATASTREAM_PATTERN="logs-[a-zA-Z_.]+-[a-zA-Z_.]+"
DATASTREAM=$(echo "${OPEN_INDEX}" | grep -oE "$DATASTREAM_PATTERN")
# We look up the data stream, and determine the write index
CURRENT_WRITE_INDEX=$(/usr/sbin/so-elasticsearch-query _data_stream/$DATASTREAM | jq -r .data_streams[0].indices[-1].index_name)
# We make sure we are not trying to delete a write index
if [ "${OPEN_INDEX}" != "${CURRENT_WRITE_INDEX}" ]; then
# This should not be a write index, so we should be allowed to delete it
/usr/sbin/so-elasticsearch-query ${OPEN_INDEX} -XDELETE
# Finally, write a log entry that says we deleted it.
echo "$(date) - Used disk space exceeds LOG_SIZE_LIMIT (${LOG_SIZE_LIMIT} GB) - Index ${OPEN_INDEX} deleted ..." >> ${LOG}
fi
if ! overlimit; then
exit
fi
done
done
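The write-index guard above relies on the _data_stream API listing a stream's backing indices in generation order, with the current write index last. A minimal sketch of that check in isolation (the data stream name is hypothetical):

# List the backing indices of a data stream; the last one printed is the current write index.
/usr/sbin/so-elasticsearch-query _data_stream/logs-zeek-so | jq -r '.data_streams[0].indices[].index_name'
# .ds-logs-zeek-so-2023.03.01-000001
# .ds-logs-zeek-so-2023.03.28-000004   <- matches .indices[-1].index_name, so it is never deleted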

@@ -27,6 +27,12 @@ curator:
- createhome: False
# Create the log directory
curlogdir:
file.directory:
- name: /opt/so/log/curator
- user: 934
- group: 939
curactiondir:
file.directory:
- name: /opt/so/conf/curator/action
@@ -34,12 +40,6 @@ curactiondir:
- group: 939
- makedirs: True
curlogdir:
file.directory:
- name: /opt/so/log/curator
- user: 934
- group: 939
actionconfs:
file.recurse:
- name: /opt/so/conf/curator/action
@@ -50,7 +50,6 @@ actionconfs:
- defaults:
CURATORMERGED: {{ CURATORMERGED }}
curconf:
file.managed:
- name: /opt/so/conf/curator/curator.yml
@@ -61,40 +60,6 @@ curconf:
- template: jinja
- show_changes: False
curcloseddel:
file.managed:
- name: /usr/sbin/so-curator-closed-delete
- source: salt://curator/files/bin/so-curator-closed-delete
- user: 934
- group: 939
- mode: 755
curcloseddeldel:
file.managed:
- name: /usr/sbin/so-curator-closed-delete-delete
- source: salt://curator/files/bin/so-curator-closed-delete-delete
- user: 934
- group: 939
- mode: 755
- template: jinja
curclose:
file.managed:
- name: /usr/sbin/so-curator-close
- source: salt://curator/files/bin/so-curator-close
- user: 934
- group: 939
- mode: 755
- template: jinja
curdel:
file.managed:
- name: /usr/sbin/so-curator-delete
- source: salt://curator/files/bin/so-curator-delete
- user: 934
- group: 939
- mode: 755
curclusterclose:
file.managed:
- name: /usr/sbin/so-curator-cluster-close
@@ -104,13 +69,21 @@ curclusterclose:
- mode: 755
- template: jinja
curclusterdelete:
curclusterdelete:
file.managed:
- name: /usr/sbin/so-curator-cluster-delete
- source: salt://curator/files/bin/so-curator-cluster-delete
- user: 934
- group: 939
- mode: 755
curclusterdeletedelete:
file.managed:
- name: /usr/sbin/so-curator-cluster-delete-delete
- source: salt://curator/files/bin/so-curator-cluster-delete-delete
- user: 934
- group: 939
- mode: 755
- template: jinja
so-curator:
@@ -163,12 +136,12 @@ so-curatorclusterclose:
- month: '*'
- dayweek: '*'
so-curatorclusterdelete:
so-curatorclusterdeletecron:
cron.present:
- name: /usr/sbin/so-curator-cluster-delete > /opt/so/log/curator/cron-delete.log 2>&1
- name: /usr/sbin/so-curator-cluster-delete > /opt/so/log/curator/cron-cluster-delete.log 2>&1
- user: root
- minute: '2'
- hour: '*/1'
- minute: '*/5'
- hour: '*'
- daymonth: '*'
- month: '*'
- dayweek: '*'
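For reference, the resulting root crontab entry should look roughly like this (Salt also adds its own identifier comment):

*/5 * * * * /usr/sbin/so-curator-cluster-delete > /opt/so/log/curator/cron-cluster-delete.log 2>&1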

@@ -1,4 +1,6 @@
elasticsearch:
  retention:
    retention_pct: 50
  config:
    node: {}
    cluster:
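The 50 percent default can be overridden through the elasticsearch:retention:retention_pct pillar key; a quick way to see whether a node carries an override (standard Salt command; if nothing is returned, the default from defaults.yaml applies):

sudo salt-call pillar.get elasticsearch:retention:retention_pct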

@@ -407,7 +407,6 @@ so-elasticsearch-roles-load:
- docker_container: so-elasticsearch
- file: es_sync_scripts
{% else %}
{{sls}}_state_not_allowed:

@@ -1,7 +1,12 @@
elasticsearch:
  esheap:
    description: Specify the memory heap size in (m)egabytes for Elasticsearch.
    helpLink: elasticsearch.html
    helpLink: elasticsearch.html
  retention:
    retention_pct:
      description: Percentage of the total cluster disk space that Elasticsearch may use before older indices are deleted, for multi-node clusters
      helpLink: elasticsearch.html
      global: True
  config:
    cluster:
      name: