From 5de59a879a34c607ebbaa9fa03d8b998b0d2dc55 Mon Sep 17 00:00:00 2001
From: Wes
Date: Fri, 26 May 2023 13:15:27 +0000
Subject: [PATCH] Break out of index deletion when unable to bring space below
 the disk space threshold

---
 .../so-curator-cluster-delete-delete | 52 ++++++++++---------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/salt/curator/tools/sbin_jinja/so-curator-cluster-delete-delete b/salt/curator/tools/sbin_jinja/so-curator-cluster-delete-delete
index 81d2720c1..388c32b0d 100755
--- a/salt/curator/tools/sbin_jinja/so-curator-cluster-delete-delete
+++ b/salt/curator/tools/sbin_jinja/so-curator-cluster-delete-delete
@@ -10,54 +10,58 @@
 {%- set RETENTION = salt['pillar.get']('elasticsearch:retention', ELASTICDEFAULTS.elasticsearch.retention, merge=true) -%}
 
 LOG="/opt/so/log/curator/so-curator-cluster-delete.log"
-LOG_SIZE_LIMIT=$(/usr/sbin/so-elasticsearch-cluster-space-total {{ RETENTION.retention_pct}})
-TODAY=$(date +'%Y.%m.%d)
-
-eligible_indices() {
-  [[ $(/usr/sbin/so-elasticsearch-query _cat/indices?h=index,status | grep 'open$' | awk '{print $1}' | grep -vE "playbook|so-case|$TODAY" | grep -E "(logstash-|so-|.ds-logs-)" | wc -l) -ge 1 ]]
-}
+ALERT_LOG="/opt/so/log/curator/alert.log"
+LOG_SIZE_LIMIT_GB=$(/usr/sbin/so-elasticsearch-cluster-space-total {{ RETENTION.retention_pct}})
+LOG_SIZE_LIMIT=$(( "$LOG_SIZE_LIMIT_GB" * 1024 * 1024 * 1024 ))
+ITERATION=0
+MAX_ITERATIONS=10
 
 overlimit() {
-  [[ $(/usr/sbin/so-elasticsearch-cluster-space-used) -gt "${LOG_SIZE_LIMIT}" ]]
+  [[ $(/usr/sbin/so-elasticsearch-cluster-space-used) -gt ${LOG_SIZE_LIMIT} ]]
 }
 
 ###########################
 # Check for 2 conditions: #
 ###########################
 # 1. Check if Elasticsearch indices are using more disk space than LOG_SIZE_LIMIT
-# 2. Check if Elasticsearch indices are eligible for deletion -- they cannot be Playbook, SOC, today's, or other important indices
-# Closed indices will be deleted first. If we are able to bring disk space under LOG_SIZE_LIMIT, we will break out of the loop.
+# 2. Check if the maximum number of iterations - MAX_ITERATIONS - has been exceeded. If so, exit.
+# Closed indices will be deleted first. If we are able to bring disk space under LOG_SIZE_LIMIT, or the number of iterations has exceeded the maximum allowed number of iterations, we will break out of the loop.
+
+while overlimit && [[ $ITERATION -lt $MAX_ITERATIONS ]]; do
 
-while overlimit && eligible_indices; do
   # If we can't query Elasticsearch, then immediately return false.
   /usr/sbin/so-elasticsearch-query _cat/indices?h=index,status > /dev/null 2>&1
   [ $? -eq 1 ] && echo "$(date) - Could not query Elasticsearch." >> ${LOG} && exit
+
   # We iterate through the closed and open indices
-  CLOSED_INDICES=$(/usr/sbin/so-elasticsearch-query _cat/indices?h=index,status | grep 'close$' | awk '{print $1}' | grep -v "so-case" | grep -E "(logstash-|so-|.ds-logs-)" | sort -t- -k3)
-  OPEN_INDICES=$(/usr/sbin/so-elasticsearch-query _cat/indices?h=index,status | grep 'open$' | awk '{print $1}' | grep -v "so-case" | grep -E "(logstash-|so-|.ds-logs-)" | sort -t- -k3)
-  for INDEX in ${CLOSED_INDICES} ${OPEN_INDICES}; do
-    # Now that we've sorted the indices from oldest to newest, we need to check each index to see if it is assigned as the current write index for a data stream
+  CLOSED_INDICES=$(/usr/sbin/so-elasticsearch-query _cat/indices?h=index,status | grep 'close$' | awk '{print $1}' | grep -vE "playbook|so-case" | grep -E "(logstash-|so-|.ds-logs-)" | sort -t- -k3)
+  OPEN_INDICES=$(/usr/sbin/so-elasticsearch-query _cat/indices?h=index,status | grep 'open$' | awk '{print $1}' | grep -vE "playbook|so-case" | grep -E "(logstash-|so-|.ds-logs-)" | sort -t- -k3)
+
+  for INDEX in ${CLOSED_INDICES} ${OPEN_INDICES}; do
+    # Now that we've sorted the indices from oldest to newest, we need to check each index to see if it is assigned as the current write index for a data stream
     # To do so, we need to identify to which data stream this index is associated
     # We extract the data stream name using the pattern below
    DATASTREAM_PATTERN="logs-[a-zA-Z_.]+-[a-zA-Z_.]+"
    DATASTREAM=$(echo "${INDEX}" | grep -oE "$DATASTREAM_PATTERN")
    # We look up the data stream, and determine the write index. If there is only one backing index, we delete the entire data stream
-    BACKING_INDICES=$(/usr/sbin/so-elasticsearch-query _data_stream/${DATASTREAM} | jq -r '.data_streams[0].indices | length')
-    if [ "$BACKING_INDICES" -gt 1 ]; then
+    BACKING_INDICES=$(/usr/sbin/so-elasticsearch-query _data_stream/${DATASTREAM} | jq -r '.data_streams[0].indices | length')
+    if [ "$BACKING_INDICES" -gt 1 ]; then
      CURRENT_WRITE_INDEX=$(/usr/sbin/so-elasticsearch-query _data_stream/$DATASTREAM | jq -r .data_streams[0].indices[-1].index_name)
-      # We make sure we are not trying to delete a write index
+      # We make sure we are not trying to delete a write index
      if [ "${INDEX}" != "${CURRENT_WRITE_INDEX}" ]; then
        # This should not be a write index, so we should be allowed to delete it
-        printf "\n$(date) - Used disk space exceeds LOG_SIZE_LIMIT (${LOG_SIZE_LIMIT} GB) - Deleting ${INDEX} index...\n" >> ${LOG}
+        printf "\n$(date) - Used disk space exceeds LOG_SIZE_LIMIT (${LOG_SIZE_LIMIT_GB} GB) - Deleting ${INDEX} index...\n" >> ${LOG}
        /usr/sbin/so-elasticsearch-query ${INDEX} -XDELETE >> ${LOG} 2>&1
      fi
-    else
-      # We delete the entire data stream, since there is only one backing index
-      printf "\n$(date) - Used disk space exceeds LOG_SIZE_LIMIT (${LOG_SIZE_LIMIT} GB) - Deleting ${DATASTREAM} data stream...\n" >> ${LOG}
-      /usr/sbin/so-elasticsearch-query _data_stream/${DATASTREAM} -XDELETE >> ${LOG} 2>&1
-    fi
-    if ! overlimit; then
+    fi
+    if ! overlimit ; then
      exit
    fi
+    ((ITERATION++))
+    if [[ $ITERATION -ge $MAX_ITERATIONS ]]; then
+      alert_id=$(uuidgen)
+      printf "\n$(date) -> Maximum iteration limit reached ($MAX_ITERATIONS). Unable to bring disk below threshold. Writing alert ($alert_id) to ${ALERT_LOG}\n" >> ${LOG}
+      printf "\n$(date),$alert_id,Maximum iteration limit reached ($MAX_ITERATIONS). Unable to bring disk below threshold.\n" >> ${ALERT_LOG}
+    fi
  done
done
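
Note (not part of the patch): the change above boils down to a bounded retry loop: keep deleting the oldest eligible indices until disk usage drops below the limit or a fixed number of passes has been made, and write an alert with a unique id if the cap is hit. The bash sketch below is a minimal, standalone illustration of that pattern only; space_used_bytes and delete_oldest_index are hypothetical stand-ins for the so-elasticsearch-* helpers, and the 500 GB threshold is an assumed value rather than anything taken from this commit.

#!/bin/bash
# Minimal sketch of a bounded deletion loop (illustrative only, not part of the patch).

LIMIT_GB=500                                  # assumed threshold in GB
LIMIT=$(( LIMIT_GB * 1024 * 1024 * 1024 ))    # convert GB to bytes for the comparison
MAX_ITERATIONS=10                             # cap on the number of deletion passes
ITERATION=0

# Hypothetical stand-ins for the space-used query and the index deletion call.
space_used_bytes() { du -sb /var/lib/elasticsearch 2>/dev/null | awk '{print $1}'; }
delete_oldest_index() { echo "$(date) - would delete the oldest eligible index here"; }

# Loop while usage is over the limit and the iteration cap has not been reached.
while [[ $(space_used_bytes) -gt $LIMIT && $ITERATION -lt $MAX_ITERATIONS ]]; do
  delete_oldest_index
  ((ITERATION++))
done

# If the cap was hit without getting under the limit, record an alert with a unique id.
if [[ $ITERATION -ge $MAX_ITERATIONS && $(space_used_bytes) -gt $LIMIT ]]; then
  echo "$(date),$(uuidgen),unable to bring disk usage below threshold after $MAX_ITERATIONS passes" >> alert.log
fi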