From 2fb41c8d6529ba6a161a09301f60b999590c25eb Mon Sep 17 00:00:00 2001 From: reyesj2 <94730068+reyesj2@users.noreply.github.com> Date: Wed, 29 Oct 2025 14:24:43 -0500 Subject: [PATCH] elasticsearch retention estimate --- .../sbin/so-elasticsearch-retention-estimate | 1159 +++++++++++++++++ 1 file changed, 1159 insertions(+) create mode 100755 salt/elasticsearch/tools/sbin/so-elasticsearch-retention-estimate diff --git a/salt/elasticsearch/tools/sbin/so-elasticsearch-retention-estimate b/salt/elasticsearch/tools/sbin/so-elasticsearch-retention-estimate new file mode 100755 index 000000000..4c34d3a02 --- /dev/null +++ b/salt/elasticsearch/tools/sbin/so-elasticsearch-retention-estimate @@ -0,0 +1,1159 @@ +#!/bin/bash + +# Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one +# or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at +# https://securityonion.net/license; you may not use this file except in compliance with the +# Elastic License 2.0. + +INFLUX_URL="https://localhost:8086/api/v2" +JSON_OUTPUT=false +VERBOSE=false +TEMP_FILES=() + +. 
/usr/sbin/so-common

# ANSI color codes for terminal output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
BOLD='\033[1;37m'
NC='\033[0m'
REDBOLD='\033[1;31m'
YELLOWBOLD='\033[1;33m'

# Populated later when retention recommendations are generated.
declare -a recommendation_lines
declare -a recommendation_records

# Remove every temp file registered in TEMP_FILES. Invoked by the trap below
# so temp files are cleaned up on any exit path.
cleanup_temp_files() {
    local file
    for file in "${TEMP_FILES[@]}"; do
        [ -f "$file" ] && rm -f "$file" 2>/dev/null
    done
}

trap cleanup_temp_files EXIT INT TERM

# Create a temp file, register it for cleanup, and print its path on stdout.
create_temp_file() {
    local tmpfile
    tmpfile=$(mktemp)
    TEMP_FILES+=("$tmpfile")
    echo "$tmpfile"
}

# Print a color-coded message.
#   $1 - severity label: LOG, OK, WARN, or ERROR (anything else prints nothing)
#   $2 - message text
# Note: label is matched via case (quoted) so an empty or whitespace $1 cannot
# break the comparison the way the previous unquoted [ $1 == ... ] tests could.
log_title() {
    case "$1" in
        LOG)
            echo -e "\n${BOLD}================ $2 ================${NC}\n"
            ;;
        OK)
            echo -e "${GREEN} $2 ${NC}"
            ;;
        WARN)
            echo -e "${YELLOW} $2 ${NC}"
            ;;
        ERROR)
            echo -e "${RED} $2 ${NC}"
            ;;
    esac
}

usage() {
    cat << EOF
Usage: $(basename "$0") [OPTIONS]

Estimate remaining days until Elasticsearch cluster reaches low watermark threshold.

OPTIONS:
    --json          Output results in JSON format
    -v, --verbose   Show additional output
    -h, --help      Show this help message

EOF
    exit 0
}

while [[ $# -gt 0 ]]; do
    case $1 in
        --json)
            JSON_OUTPUT=true
            shift
            ;;
        -v|--verbose)
            VERBOSE=true
            shift
            ;;
        -h|--help)
            usage
            ;;
        *)
            echo "Unknown option: $1" >&2
            usage
            ;;
    esac
done

# Issue an authenticated request against the local InfluxDB API.
# "$INFLUX_URL/$@" expands to "$INFLUX_URL/$1" followed by the remaining
# arguments as separate words, so callers can pass extra curl options after
# the endpoint path.
request() {
    curl -skK /opt/so/conf/influxdb/curl.config "$INFLUX_URL/$@"
}

# Resolve the InfluxDB org id for the "Security Onion" organization.
lookup_org_id() {
    request "orgs?org=Security+Onion" | jq -r '.orgs[] | select(.name == "Security Onion").id'
}

# Run a Flux query ($1) against InfluxDB; results come back as annotated CSV.
run_flux_query() {
    local query=$1
    request "query?org=$ORG_ID" \
        -H 'Accept:application/csv' \
        -H 'Content-type:application/vnd.flux' \
        -d "$query" -XPOST 2>/dev/null
}

# Extract the first result value (last CSV column) from annotated Flux CSV
# output, skipping '#' annotation rows and trimming whitespace/CR/TAB.
read_csv_value() {
    local input="$1"

    printf '%s\n' "$input" | awk -F',' '
        $0 ~ /^#/ { next }
        NF < 1 { next }
        {
            gsub(/\r|\t/, "")
            for (i = 1; i <= NF; i++) {
                sub(/^[[:space:]]+/, "", $i)
                sub(/[[:space:]]+$/, "", $i)
            }
            if (($2 == "_result" || $2 == "result") && $3 != "table" && $NF != "") {
                print $NF
                exit
            }
        }
    '
}

# Coerce a value to a whole-number string; empty, "null", or non-numeric
# input becomes "0".
normalize_number() {
    local value="${1:-0}"
    awk -v val="$value" 'BEGIN {
        if (val == "" || val == "null") { printf "0"; exit }
        if (val == val + 0) { printf "%.0f", val + 0; exit }
        printf "0"
    }'
}

# Convert a byte count to gigabytes with two decimal places.
bytes_to_gb() {
    local bytes="${1:-0}"
    awk -v b="$bytes" 'BEGIN {
        if (b == "" || b == "null") { printf "0.00"; exit }
        printf "%.2f", b / 1024 / 1024 / 1024
    }'
}

# Translate a _cat/nodes role string into a comma-separated list of data roles.
expand_node_roles() {
    local role_string="$1"
    local -a roles=()

    # Only show data-related roles: d=data, h=data_hot, w=data_warm, c=data_cold, s=data_content f=data_frozen
    [[ "$role_string" =~ h ]] && roles+=("data_hot")
    [[ "$role_string" =~ w ]] && roles+=("data_warm")
    [[ "$role_string" =~ c ]] && roles+=("data_cold")
    [[ "$role_string" =~ s ]] && roles+=("data_content")
    [[ "$role_string" =~ f ]] && roles+=("data_frozen")
    [[ "$role_string" =~ d ]] && roles+=("data")

    local IFS=','
    echo "${roles[*]}"
}

# Run so-elasticsearch-indices-growth, trying passwordless sudo first when not
# already root. Returns non-zero if the helper is unavailable.
run_indices_growth() {
    if ! command -v so-elasticsearch-indices-growth >/dev/null 2>&1; then
        return 1
    fi

    if [ "$EUID" -ne 0 ] && command -v sudo >/dev/null 2>&1; then
        sudo -n so-elasticsearch-indices-growth 2>/dev/null || so-elasticsearch-indices-growth 2>/dev/null
    else
        so-elasticsearch-indices-growth 2>/dev/null
    fi
}

# Sum the most recent elasticsearch_index_size samples in the given Flux time
# range ($1=start, optional $2=stop) and print the total as a whole number of
# bytes (0 when no data is available).
fetch_total_bytes() {
    local start="$1"
    local stop="$2"
    local range_line

    if [ -n "$stop" ]; then
        range_line=" |> range(start: ${start}, stop: ${stop})"
    else
        range_line=" |> range(start: ${start})"
    fi

    local query
    query=$(cat <<-EOF
from(bucket: "telegraf/so_long_term")
${range_line}
  |> filter(fn: (r) => r._measurement == "elasticsearch_index_size")
  |> last()
  |> group()
  |> sum()
  |> keep(columns: ["_value"])
EOF
    )

    local result value
    result=$(run_flux_query "$query")
    value=$(read_csv_value "$result")
    normalize_number "$value"
}

# Report a fatal error (JSON or plain text depending on --json) and exit 1.
fail() {
    if [ "$JSON_OUTPUT" = true ]; then
        jq -n --arg error "$1" '{error: $error}'
    else
        echo "ERROR: $1" >&2
    fi
    exit 1
}

echo -e "\nDISCLAIMER: Script output is based on current data patterns, but the results are approximations solely intended to assist with getting a general ILM policy configured."

ORG_ID=$(lookup_org_id)
[ -n "$ORG_ID" ] || fail "Unable to resolve InfluxDB org id"

# Current indexed storage from the Elasticsearch cluster stats API; falls back
# to 0 if the query fails or returns a non-numeric value.
cluster_storage_size=0
indexed_storage_source="elasticsearch"
cluster_storage_size_output=$(so-elasticsearch-query '_cluster/stats?filter_path=indices.store.size_in_bytes' --fail 2>/dev/null || true)
if [ -n "$cluster_storage_size_output" ]; then
    cluster_storage_size=$(echo "$cluster_storage_size_output" | jq -r '.indices.store.size_in_bytes // 0' 2>/dev/null)
    if ! printf '%s' "$cluster_storage_size" | grep -Eq '^[0-9]+$'; then
        cluster_storage_size=0
    fi
fi

# historical data from influxdb for growth calculation
one_day_total=$(fetch_total_bytes "-25h" "-23h")
seven_day_total=$(fetch_total_bytes "-7d8h" "-7d")
thirty_day_total=$(fetch_total_bytes "-30d8h" "-30d")

# available historical windows (prefer 30d/7d when available, to avoid using recent 24h traffic spike as true daily ingest rate)
history_days=0
historical_total=0

if [ "$thirty_day_total" -gt 0 ]; then
    history_days=30
    history_label="30-day"
    historical_total=$thirty_day_total
elif [ "$seven_day_total" -gt 0 ]; then
    history_days=7
    history_label="7-day"
    historical_total=$seven_day_total
elif [ "$one_day_total" -gt 0 ]; then
    history_days=1
    history_label="24-hour"
    historical_total=$one_day_total
fi

[ "$history_days" -gt 0 ] || fail "Historical InfluxDB data unavailable for growth calculation. If this is a newer grid try re-running this script in a few days. Otherwise review /opt/so/log/telegraf/telegraf.log for errors with collecting required ES metrics."
# Daily growth rate (bytes/day) over the chosen history window. Can be
# negative if the cluster shrank; later projections guard for that.
growth_bytes=$(( cluster_storage_size - historical_total ))
daily_growth_bytes=$(awk -v diff="$growth_bytes" -v days="$history_days" 'BEGIN {
    if (days <= 0) { print 0; exit }
    printf "%.0f", diff / days
}')

# Daily shard creation rate using same time window (30d / 7d / 24h)
daily_shard_creation=0
now_ms=$(date +%s)000
history_ago_ms=$(awk -v now="$now_ms" -v days="$history_days" 'BEGIN { printf "%.0f", now - (days * 86400 * 1000) }')
shard_creation_output=$(so-elasticsearch-query "_cat/indices/.ds-*?format=json&h=index,pri,rep,creation.date" --fail 2>/dev/null || true)
if [ -n "$shard_creation_output" ]; then
    # Total shards per index = primaries + (primaries * replicas), counting
    # only indices created inside the history window.
    recent_shards=$(echo "$shard_creation_output" | jq --argjson cutoff "$history_ago_ms" '
        [.[] |
            select(.["creation.date"] != null and (.["creation.date"] | tonumber) >= $cutoff) |
            (.pri | tonumber) + ((.pri | tonumber) * (.rep | tonumber))
        ] | add // 0
    ' 2>/dev/null)
    if [ -n "$recent_shards" ] && [[ "$recent_shards" =~ ^[0-9]+$ ]]; then
        daily_shard_creation=$(awk -v total="$recent_shards" -v days="$history_days" 'BEGIN {
            if (days <= 0) { print 0; exit }
            printf "%.1f", total / days
        }')
    fi
fi

# Find expected ILM deletions
ilm_delete_7d=0
ilm_delete_30d=0
ilm_indices_7d=0
ilm_indices_30d=0
ilm_delete_immediate=0
ilm_indices_immediate=0
ilm_delete_scheduled_7d=0
ilm_indices_scheduled_7d=0
ilm_delete_scheduled_30d=0
ilm_indices_scheduled_30d=0
ilm_shards_7d=0
ilm_shards_30d=0
ilm_shards_immediate=0
ilm_shards_scheduled_7d=0
ilm_shards_scheduled_30d=0

# For verbose output
declare -a scheduled_indices_names
declare -a scheduled_indices_sizes
declare -a scheduled_indices_days
declare -a immediate_indices_names
declare -a immediate_indices_sizes

# Get ilm policy delete ages per policy
# example output 'so-logs-1password.audit_events-logs|365'
# NOTE: the capture() regex below requires named groups <num>/<unit> -- the
# $m.num / $m.unit references depend on them; without the names jq rejects
# the program and this whole section silently yields zeros.
tmpfile_policies=$(create_temp_file)
so-elasticsearch-query '_ilm/policy' --fail 2>/dev/null | jq -r '
    def age_to_days:
        if type == "number" then .
        elif type == "string" then
            (ascii_downcase) as $s |
            (try ($s | capture("^(?<num>-?[0-9.]+)(?<unit>[smhd]?)$")) catch {num:"0", unit:""}) as $m |
            (($m.num | tonumber? // 0)) as $val |
            (if $m.unit == "d" or $m.unit == "" then $val
             elif $m.unit == "h" then $val / 24
             elif $m.unit == "m" then $val / 1440
             elif $m.unit == "s" then $val / 86400
             else $val end)
        else 0 end;
    to_entries[] |
    select(.value.policy.phases.delete.min_age?) |
    "\(.key)|\((.value.policy.phases.delete.min_age | age_to_days))"
' > "$tmpfile_policies" 2>/dev/null || true

declare -A policy_ages

if [ -s "$tmpfile_policies" ]; then
    # create associative array of policy -> delete_age
    while IFS='|' read -r policy age; do
        policy_ages["$policy"]=$age
    done < "$tmpfile_policies"

    # Get ILM managed indices with their age and policy, figure days until deletion
    tmpfile_indices=$(create_temp_file)
    so-elasticsearch-query '_all/_ilm/explain' --fail 2>/dev/null | jq -r '
        def age_to_days:
            if type == "number" then .
            elif type == "string" then
                (ascii_downcase) as $s |
                (try ($s | capture("^(?<num>-?[0-9.]+)(?<unit>[smhd]?)$")) catch {num:"0", unit:""}) as $m |
                (($m.num | tonumber? // 0)) as $val |
                (if $m.unit == "d" or $m.unit == "" then $val
                 elif $m.unit == "h" then $val / 24
                 elif $m.unit == "m" then $val / 1440
                 elif $m.unit == "s" then $val / 86400
                 else $val end)
            else 0 end;
        .indices | to_entries[] |
        select(.value.managed == true and .value.policy) |
        "\(.key)|\(.value.policy)|\(((.value.age? // "0") | age_to_days))|\(.value.phase? // "")"
    ' > "$tmpfile_indices" 2>/dev/null || true

    # Process each index and calculate totals
    tmpfile_all=$(create_temp_file)
    while IFS='|' read -r index policy age phase; do
        if [ -n "${policy_ages[$policy]:-}" ]; then
            delete_age=${policy_ages[$policy]}
            delete_age=${delete_age:-0}
            age=${age:-0}
            # Ceiling of (delete_age - current_age): whole days until ILM
            # deletes this index; clamped to 0 when already past due.
            days_until_ceiling=$(awk -v del="$delete_age" -v aged="$age" 'BEGIN {
                diff = del - aged;
                if (diff <= 0) {
                    print 0;
                    exit
                }
                base = int(diff);
                if (diff > base) { base = base + 1 }
                print base;
            }')
            if [ -z "$days_until_ceiling" ]; then
                days_until_ceiling=0
            fi
            if [ "$days_until_ceiling" -lt 0 ]; then
                days_until_ceiling=0
            fi
            bucket="scheduled"
            if [ "$phase" = "delete" ]; then
                days_until_ceiling=0
                bucket="immediate"
            fi
            if [ "$days_until_ceiling" -le 30 ] 2>/dev/null; then
                echo "$index|$days_until_ceiling|$bucket" >> "$tmpfile_all"
            fi
        fi
    done < "$tmpfile_indices"

    # Get size and shard counts for indices
    if [ -s "$tmpfile_all" ]; then
        candidate_indices=$(cut -d'|' -f1 "$tmpfile_all" | tr '\n' ',' | sed 's/,$//')
        if [ -n "$candidate_indices" ]; then
            tmpfile_sizes=$(create_temp_file)
            so-elasticsearch-query "_cat/indices/${candidate_indices}?format=json&h=index,pri.store.size,pri,rep&bytes=b" --fail 2>/dev/null | \
                jq -r '.[] | "\(.index)|\(.["pri.store.size"])|\(.pri)|\(.rep)"' > "$tmpfile_sizes" 2>/dev/null || true

            # Build size and shard lookup
            declare -A index_sizes
            declare -A index_shards
            while IFS='|' read -r idx size pri rep; do
                index_sizes["$idx"]=$size
                # Total shards = pri + (pri * rep)
                total_shards=$(awk -v p="$pri" -v r="$rep" 'BEGIN { printf "%.0f", p + (p * r) }')
                index_shards["$idx"]=$total_shards
            done < "$tmpfile_sizes"

            # Calculate totals for ilm deletes
            while IFS='|' read -r index days_until bucket; do
                size=${index_sizes[$index]:-0}
                shards=${index_shards[$index]:-0}
                if [ "$bucket" = "immediate" ]; then
                    ilm_delete_immediate=$((ilm_delete_immediate + size))
                    ilm_indices_immediate=$((ilm_indices_immediate + 1))
                    ilm_shards_immediate=$((ilm_shards_immediate + shards))
                    if [ "$VERBOSE" = true ]; then
                        immediate_indices_names+=("$index")
                        immediate_indices_sizes+=("$size")
                    fi
                else
                    if [ "$days_until" -le 7 ] 2>/dev/null; then
                        ilm_delete_scheduled_7d=$((ilm_delete_scheduled_7d + size))
                        ilm_indices_scheduled_7d=$((ilm_indices_scheduled_7d + 1))
                        ilm_shards_scheduled_7d=$((ilm_shards_scheduled_7d + shards))
                        if [ "$VERBOSE" = true ]; then
                            scheduled_indices_names+=("$index")
                            scheduled_indices_sizes+=("$size")
                            scheduled_indices_days+=("$days_until")
                        fi
                    fi
                    ilm_delete_scheduled_30d=$((ilm_delete_scheduled_30d + size))
                    ilm_indices_scheduled_30d=$((ilm_indices_scheduled_30d + 1))
                    ilm_shards_scheduled_30d=$((ilm_shards_scheduled_30d + shards))
                fi

                if [ "$days_until" -le 7 ] 2>/dev/null; then
                    ilm_delete_7d=$((ilm_delete_7d + size))
                    ilm_indices_7d=$((ilm_indices_7d + 1))
                    ilm_shards_7d=$((ilm_shards_7d + shards))
                fi
                ilm_delete_30d=$((ilm_delete_30d + size))
                ilm_indices_30d=$((ilm_indices_30d + 1))
                ilm_shards_30d=$((ilm_shards_30d + shards))
            done < "$tmpfile_all"
        fi
    fi
fi

# Get the average daily ILM deletion rate (smooth out over 30d / 7d for consistency)
daily_ilm_delete_bytes=0
if [ "$ilm_delete_scheduled_30d" -gt 0 ] && [ "$ilm_indices_scheduled_30d" -gt 0 ]; then
    daily_ilm_delete_bytes=$(awk -v total="$ilm_delete_scheduled_30d" 'BEGIN { printf "%.0f", total / 30 }')
elif [ "$ilm_delete_scheduled_7d" -gt 0 ] && [ "$ilm_indices_scheduled_7d" -gt 0 ]; then
    daily_ilm_delete_bytes=$(awk -v total="$ilm_delete_scheduled_7d" 'BEGIN { printf "%.0f", total / 7 }')
fi

# Net storage growth (growth - deletions)
net_growth_bytes=$(awk -v growth="$daily_growth_bytes" -v deletions="$daily_ilm_delete_bytes" 'BEGIN {
    printf "%.0f", growth - deletions
}')

ilm_delete_7d_gb=$(bytes_to_gb "$ilm_delete_7d")
ilm_delete_30d_gb=$(bytes_to_gb "$ilm_delete_30d")
# GB conversions of the ILM deletion totals for display and JSON output.
ilm_delete_immediate_gb=$(bytes_to_gb "$ilm_delete_immediate")
ilm_delete_scheduled_7d_gb=$(bytes_to_gb "$ilm_delete_scheduled_7d")
ilm_delete_scheduled_30d_gb=$(bytes_to_gb "$ilm_delete_scheduled_30d")
daily_ilm_delete_gb=$(bytes_to_gb "$daily_ilm_delete_bytes")

# Percentage of the currently indexed storage that ILM is expected to delete
# within the next 7 days.
ilm_impact_pct="0.0"
if [ "$cluster_storage_size" -gt 0 ] && [ "$ilm_delete_7d" -gt 0 ]; then
    ilm_impact_pct=$(awk -v ilm="$ilm_delete_7d" -v total="$cluster_storage_size" 'BEGIN {
        if (total <= 0) { printf "0.0"; exit }
        printf "%.1f", (ilm / total) * 100
    }')
fi

# Average daily deletion implied by the 7-day window alone (bytes and GB).
ilm_window_daily_bytes=0
ilm_window_daily_gb="0.00"
if [ "$ilm_delete_7d" -gt 0 ]; then
    ilm_window_daily_bytes=$(awk -v total="$ilm_delete_7d" 'BEGIN { printf "%.0f", total / 7 }')
    ilm_window_daily_gb=$(awk -v total="$ilm_delete_7d" 'BEGIN { printf "%.2f", total / 7 / 1024 / 1024 / 1024 }')
fi

# Compare the 7-day-window daily rate to the smoothed daily rate; a variance
# above 30% sets a warning flag indicating an uneven deletion schedule.
ilm_rate_variance_pct=""
ilm_rate_variance_warning=false
if [ "$daily_ilm_delete_bytes" -gt 0 ] && [ "$ilm_window_daily_bytes" -gt 0 ]; then
    ilm_rate_variance_pct=$(awk -v window="$ilm_window_daily_bytes" -v daily="$daily_ilm_delete_bytes" 'BEGIN {
        if (daily == 0) { print ""; exit }
        diff = window - daily;
        if (diff < 0) diff = -diff;
        pct = diff / daily * 100;
        if (pct < 0) pct = -pct;
        printf "%.0f", pct
    }')
    if [ -n "$ilm_rate_variance_pct" ]; then
        ilm_rate_flag=$(awk -v v="$ilm_rate_variance_pct" 'BEGIN { if (v + 0 > 30) print 1; else print 0 }')
        if [ "$ilm_rate_flag" -eq 1 ] 2>/dev/null; then
            ilm_rate_variance_warning=true
        fi
    fi
fi

# String form of the warning flag for JSON consumers.
ilm_rate_variance_warning_json="false"
if [ "$ilm_rate_variance_warning" = true ]; then
    ilm_rate_variance_warning_json="true"
fi

# Elasticsearch cluster disk watermark settings (fallback to 85/90/95 defaults)
watermark_output=$(so-elasticsearch-query '_cluster/settings?include_defaults=true&filter_path=*.cluster.routing.allocation.disk.*' --fail 2>/dev/null) || fail "Failed to query Elasticsearch cluster settings"

# Precedence: transient over persistent over defaults.
low=$(echo "$watermark_output" | jq -r '.transient.cluster.routing.allocation.disk.watermark.low // .persistent.cluster.routing.allocation.disk.watermark.low // .defaults.cluster.routing.allocation.disk.watermark.low // empty')
high=$(echo "$watermark_output" | jq -r '.transient.cluster.routing.allocation.disk.watermark.high // .persistent.cluster.routing.allocation.disk.watermark.high // .defaults.cluster.routing.allocation.disk.watermark.high // empty')
flood=$(echo "$watermark_output" | jq -r '.transient.cluster.routing.allocation.disk.watermark.flood_stage // .persistent.cluster.routing.allocation.disk.watermark.flood_stage // .defaults.cluster.routing.allocation.disk.watermark.flood_stage // empty')

low=${low:-"85%"}
high=${high:-"90%"}
flood=${flood:-"95%"}

# NOTE(review): this parsing assumes percentage-style watermarks ("85%").
# Absolute byte-valued watermarks (e.g. "50gb") would not convert correctly
# here -- confirm grids always use percentage watermarks.
low_percent=${low%\%}
low_fraction=$(awk -v p="$low_percent" 'BEGIN {
    if (p == "" || p + 0 <= 0) { printf "%.6f", 0.85; exit }
    printf "%.6f", p / 100
}')

high_percent=${high%\%}
high_fraction=$(awk -v p="$high_percent" 'BEGIN {
    if (p == "" || p + 0 <= 0) { printf "%.6f", 0.90; exit }
    printf "%.6f", p / 100
}')

# Cluster shard total
cluster_shards_output=$(so-elasticsearch-query '_cluster/stats?filter_path=indices.shards.total' --fail 2>/dev/null) || fail "Failed to query cluster shard stats"
total_shards=$(echo "$cluster_shards_output" | jq -r '.indices.shards.total // 0' 2>/dev/null)

# Get max shards per node setting (with default 1000)
max_shards_per_node_output=$(so-elasticsearch-query '_cluster/settings?include_defaults=true&filter_path=*.cluster.max_shards_per_node' --fail 2>/dev/null) || fail "Failed to query cluster shard settings"
max_shards_per_node=$(echo "$max_shards_per_node_output" | jq -r '.transient.cluster.max_shards_per_node // .persistent.cluster.max_shards_per_node // .defaults.cluster.max_shards_per_node // "1000"' 2>/dev/null)
max_shards_per_node=${max_shards_per_node:-1000}

# Get same disk usage metric ES uses for watermark (not only ES used storage, but OS level storage usage)
nodes_output=$(so-elasticsearch-query '_cat/nodes?format=json&h=name,ip,node.role,disk.total,disk.used,disk.avail&bytes=b' --fail 2>/dev/null) || fail "Failed to query Elasticsearch node disk usage"

# Parse nodes with data roles and calculate cluster totals
# Only include nodes with data roles: d=data, h=data_hot, w=data_warm, c=data_cold, s=data_content, f=data_frozen
# Per node: low_threshold = total * low_fraction; remaining = low_threshold - used.
cluster_stats=$(echo "$nodes_output" | jq --argjson low "$low_fraction" '
    [ .[]
        | select(.["node.role"] | test("[dhwcsf]"))
        | .total = (.["disk.total"] | tostring | gsub("[^0-9.]"; "") | tonumber)
        | .used = (.["disk.used"] | tostring | gsub("[^0-9.]"; "") | tonumber)
        | .avail = (.["disk.avail"] | tostring | gsub("[^0-9.]"; "") | tonumber)
        | select(.total? and .used?)
        | .low_threshold = (.total * $low)
        | .remaining = (.low_threshold - .used)
    ]
    | {
        total: ([.[].total] | add // 0),
        used: ([.[].used] | add // 0),
        low_threshold: ([.[].low_threshold] | add // 0),
        remaining: ([.[].remaining] | add // 0)
    }
')

cluster_total=$(echo "$cluster_stats" | jq -r '.total')
cluster_used=$(echo "$cluster_stats" | jq -r '.used')
cluster_low_threshold=$(echo "$cluster_stats" | jq -r '.low_threshold')
cluster_remaining=$(echo "$cluster_stats" | jq -r '.remaining')

# High-watermark byte threshold and how many bytes the cluster is past each
# watermark (clamped at 0 when below the mark).
cluster_high_threshold=$(awk -v total="$cluster_total" -v frac="$high_fraction" 'BEGIN {
    if (total == "" || frac == "" || total + 0 <= 0 || frac + 0 <= 0) { printf "0"; exit }
    printf "%.0f", total * frac
}')
cluster_over_low_bytes=$(awk -v used="$cluster_used" -v threshold="$cluster_low_threshold" 'BEGIN {
    if (used == "" || threshold == "") { printf "0"; exit }
    diff = used - threshold;
    if (diff < 0) diff = 0;
    printf "%.0f", diff
}')
cluster_over_high_bytes=$(awk -v used="$cluster_used" -v threshold="$cluster_high_threshold" 'BEGIN {
    if (used == "" || threshold == "") { printf "0"; exit }
    diff = used - threshold;
    if (diff < 0) diff = 0;
    printf "%.0f", diff
}')

# Count data nodes and calculate shard capacity
# Only count nodes with data roles: d=data, h=data_hot, w=data_warm, c=data_cold, s=data_content f=data_frozen
data_node_count=$(echo "$nodes_output" | jq '[.[] | select(.["node.role"] | test("[dhwcsf]"))] | length')
max_shard_capacity=$((data_node_count * max_shards_per_node))

declare -a data_node_names
declare -a data_node_roles
if [ "$data_node_count" -gt 0 ]; then
    while IFS='|' read -r node_name node_role; do
        data_node_names+=("$node_name")
        data_node_roles+=("$node_role")
    done < <(echo "$nodes_output" | jq -r '.[] | select(.["node.role"] | test("[dhwcsf]")) | "\(.name)|\(.["node.role"])"')
fi
shard_usage_percent="0.0"
if [ "$max_shard_capacity" -gt 0 ]; then
    shard_usage_percent=$(awk -v current="$total_shards" -v max="$max_shard_capacity" 'BEGIN {
        if (max <= 0) { printf "0.0"; exit }
        printf "%.1f", (current / max) * 100
    }')
fi

# Defaults for the recommendation state computed below.
recommendations_triggered=false
recommendations_ready=false
recommendations_message=""
recommendations_json='[]'
recommendations_triggered_json=false
recommendation_lines=()
recommendation_records=()
should_trigger_recommendations=false
recommendations_reason=""

days_to_low_numeric=""
days_to_low_gross_numeric=""

[ "$cluster_total" -gt 0 ] || fail "No Elasticsearch data nodes retrieved from _cat/nodes"

# Calculate current retention period (age of oldest .ds-logs-* index)
oldest_index_days=""
oldest_index_name=""
oldest_index_output=$(so-elasticsearch-query '_cat/indices/.ds-logs-*?format=json&h=index,creation.date&s=creation.date:asc' --fail 2>/dev/null | jq -r '.[0] // empty' 2>/dev/null || true)
if [ -n "$oldest_index_output" ]; then
    oldest_index_name=$(echo "$oldest_index_output" | jq -r '.index // empty' 2>/dev/null)
    oldest_creation_ms=$(echo "$oldest_index_output" | jq -r '.["creation.date"] // empty' 2>/dev/null)
    if [ -n "$oldest_creation_ms" ] && [[ "$oldest_creation_ms" =~ ^[0-9]+$ ]]; then
        oldest_creation_sec=$((oldest_creation_ms / 1000))
        if [ "$oldest_creation_sec" -gt 0 ]; then
            now_sec=$(date +%s)
            if [ "$now_sec" -ge "$oldest_creation_sec" ]; then
                age_sec=$((now_sec - oldest_creation_sec))
                oldest_index_days=$(awk -v age="$age_sec" 'BEGIN { printf "%.1f", age / 86400 }')
            fi
        fi
    fi
fi

# Calculate days until low watermark using net growth
days_to_low=""
days_to_low_gross=""
target_date=""

# Calculate with gross growth
# (the float comparison uses bc when present, awk as fallback)
if [ "$daily_growth_bytes" -gt 0 ] && [ "$(echo "$cluster_remaining > 0" | bc -l 2>/dev/null || awk -v r="$cluster_remaining" 'BEGIN { if (r > 0) print 1; else print 0 }')" -eq 1 ]; then
    days_to_low_gross=$(awk -v rem="$cluster_remaining" -v perday="$daily_growth_bytes" 'BEGIN {
        printf "%.2f", rem / perday
    }')
fi

# Calculate with net growth (minus ILM deletions)
if [ "$net_growth_bytes" -gt 0 ] && [ "$(echo "$cluster_remaining > 0" | bc -l 2>/dev/null || awk -v r="$cluster_remaining" 'BEGIN { if (r > 0) print 1; else print 0 }')" -eq 1 ]; then
    days_to_low=$(awk -v rem="$cluster_remaining" -v perday="$net_growth_bytes" 'BEGIN {
        printf "%.2f", rem / perday
    }')
    # Ceiling of the day count for the projected calendar date.
    ceil_days=$(awk -v d="$days_to_low" 'BEGIN {
        base = int(d);
        if (d > base) { base = base + 1 }
        if (base < 0) { base = 0 }
        printf "%d", base
    }')
    target_date=$(date -d "+${ceil_days} days" +%F 2>/dev/null)
elif [ "$(echo "$cluster_remaining > 0" | bc -l 2>/dev/null || awk -v r="$cluster_remaining" 'BEGIN { if (r > 0) print 1; else print 0 }')" -eq 1 ]; then
    # Net growth is zero or negative, cluster is in equilibrium or shrinking
    days_to_low="stable"
fi

# Keep numeric-only copies for later arithmetic and JSON output.
if [ -n "$days_to_low" ] && [ "$days_to_low" != "stable" ] && [[ "$days_to_low" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    days_to_low_numeric="$days_to_low"
fi

if [ -n "$days_to_low_gross" ] && [[ "$days_to_low_gross" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
    days_to_low_gross_numeric="$days_to_low_gross"
fi

# Calculate estimated retention (oldest index age + days until low watermark)
estimated_retention_days=""
if [ -n "$oldest_index_days" ] && [ -n "$days_to_low_numeric" ]; then
    estimated_retention_days=$(awk -v oldest="$oldest_index_days" -v remaining="$days_to_low_numeric" 'BEGIN {
        printf "%.1f", oldest + remaining
    }')
fi

cluster_at_or_below_low=$(echo "$cluster_remaining <= 0" | bc -l 2>/dev/null || awk -v r="$cluster_remaining" 'BEGIN { if (r <= 0) print 1; else print 0 }')

# Decide whether to surface retention-reduction recommendations, in priority
# order: already at/past the low watermark > projected (net) breach within
# 7 days > projected (gross) breach within 7 days.
if [ "$cluster_at_or_below_low" -eq 1 ]; then
    should_trigger_recommendations=true
    if [ "$cluster_over_high_bytes" -gt 0 ] 2>/dev/null; then
        recommendations_reason="Cluster is beyond the high watermark threshold. Reduce retention on the fastest-growing indices immediately."
    else
        recommendations_reason="Cluster is at or beyond the low watermark threshold. Reduce retention on the fastest-growing indices immediately."
    fi
elif [ -n "$days_to_low_numeric" ]; then
    within_seven=$(awk -v d="$days_to_low_numeric" 'BEGIN { if (d <= 7) print 1; else print 0 }')
    if [ "$within_seven" -eq 1 ]; then
        should_trigger_recommendations=true
        recommendations_reason="Projected low watermark breach in ~${days_to_low_numeric} days (${target_date:-N/A}). Reduce retention on the fastest-growing indices."
    fi
elif [ -n "$days_to_low_gross_numeric" ]; then
    within_seven_gross=$(awk -v d="$days_to_low_gross_numeric" 'BEGIN { if (d <= 7) print 1; else print 0 }')
    if [ "$within_seven_gross" -eq 1 ]; then
        should_trigger_recommendations=true
        recommendations_reason="Gross growth trend indicates a low watermark breach in ~${days_to_low_gross_numeric} days (${target_date:-N/A}). Reduce retention on the fastest-growing indices before ILM deletions."
    fi
fi

# Numeric over-watermark flags for output consumers.
cluster_over_high_flag=0
if [ "$cluster_over_high_bytes" -gt 0 ] 2>/dev/null; then
    cluster_over_high_flag=1
fi

cluster_over_low_flag=0
if [ "$cluster_over_low_bytes" -gt 0 ] 2>/dev/null; then
    cluster_over_low_flag=1
fi

cluster_high_threshold_gb=$(bytes_to_gb "$cluster_high_threshold")
cluster_over_low_gb=$(bytes_to_gb "$cluster_over_low_bytes")
cluster_over_high_gb=$(bytes_to_gb "$cluster_over_high_bytes")

# Build per-index retention recommendations for the top three fastest-growing
# indices reported by so-elasticsearch-indices-growth.
if [ "$should_trigger_recommendations" = true ]; then
    recommendations_triggered=true
    recommendations_triggered_json=true
    if [ -n "$recommendations_reason" ]; then
        recommendations_message="$recommendations_reason"
    else
        recommendations_message="Cluster is nearing the low watermark threshold. Reduce retention on the fastest-growing indices."
    fi

    growth_output=$(run_indices_growth || true)
    if [ -n "${growth_output//[[:space:]]/}" ]; then
        # Skip the first two lines (header), drop blanks, keep the top three rows.
        mapfile -t recommendation_source_lines < <(printf '%s\n' "$growth_output" | tail -n +3 | awk 'NF' | head -n 3)
        for line in "${recommendation_source_lines[@]}"; do
            index=$(echo "$line" | awk '{print $1}')
            [ -n "$index" ] || continue

            # Third-from-last column; assumed to be the 24h growth in GB from
            # so-elasticsearch-indices-growth -- TODO confirm column layout.
            growth_24h_gb=$(echo "$line" | awk '{print $(NF-2)}')

            creation_date_display=""
            retention_days=""
            policy=""
            delete_min_age=""

            # Index creation time -> observed retention in days.
            index_info=$(so-elasticsearch-query "_cat/indices/${index}?format=json&h=index,creation.date,creation.date.string" --fail 2>/dev/null) || true
            if [ -n "$index_info" ]; then
                creation_epoch=$(echo "$index_info" | jq -r '.[0]."creation.date" // empty' 2>/dev/null)
                creation_readable=$(echo "$index_info" | jq -r '.[0]."creation.date.string" // empty' 2>/dev/null)
                if [ -n "$creation_epoch" ] && [[ "$creation_epoch" =~ ^[0-9]+$ ]]; then
                    creation_seconds=$((creation_epoch / 1000))
                    if [ "$creation_seconds" -gt 0 ]; then
                        creation_date_display=$(date -u -d "@$creation_seconds" +%FT%TZ 2>/dev/null)
                        now_seconds=$(date +%s)
                        if [ "$now_seconds" -ge "$creation_seconds" ]; then
                            retention_days=$(awk -v now="$now_seconds" -v created="$creation_seconds" 'BEGIN { diff = now - created; if (diff < 0) diff = 0; printf "%.1f", diff / 86400 }')
                        fi
                    fi
                fi
                if [ -z "$creation_date_display" ] && [ -n "$creation_readable" ] && [ "$creation_readable" != "null" ]; then
                    creation_date_display="$creation_readable"
                fi
            fi

            # ILM policy attached to this index, then its delete min_age (days)
            # from the policy_ages map built earlier.
            ilm_output=$(so-elasticsearch-query "${index}/_ilm/explain" --fail 2>/dev/null) || true
            if [ -n "$ilm_output" ]; then
                policy=$(echo "$ilm_output" | jq -r ".indices.\"$index\".policy // empty" 2>/dev/null)
            fi
            if [ -n "$policy" ] && [ -n "${policy_ages[$policy]:-}" ]; then
                delete_min_age=${policy_ages[$policy]}
            fi

            # Floor the observed retention to whole days, minimum 1.
            retention_days_display=${retention_days:-unknown}
            retention_days_floor=""
            if [ -n "$retention_days" ]; then
                retention_days_floor=$(awk -v v="$retention_days" 'BEGIN { if (v == "" || v == "null") { print ""; exit } val = v + 0; if (val < 1) val = 1; printf "%d", int(val) }')
                if [ -n "$retention_days_floor" ] && [ "$retention_days_floor" -lt 1 ]; then
                    retention_days_floor=1
                fi
            fi

            delete_min_age_numeric=""
            if [ -n "$delete_min_age" ]; then
                delete_min_age_numeric=$(awk -v v="$delete_min_age" 'BEGIN { if (v == "" || v == "null") { print ""; exit } val = v + 0; if (val < 1) val = 1; printf "%d", int(val) }')
            fi

            # Suggested delete.min_age = min(observed retention, current policy age).
            recommended_delete_min_age=""
            if [ -n "$retention_days_floor" ]; then
                recommended_delete_min_age="$retention_days_floor"
            fi
            if [ -n "$delete_min_age_numeric" ]; then
                if [ -n "$recommended_delete_min_age" ]; then
                    recommended_delete_min_age=$(awk -v rec="$recommended_delete_min_age" -v cur="$delete_min_age_numeric" 'BEGIN { rec += 0; cur += 0; if (cur < rec) printf "%d", cur; else printf "%d", rec }')
                else
                    recommended_delete_min_age="$delete_min_age_numeric"
                fi
            fi
            if [ -z "$recommended_delete_min_age" ] && [ -n "$retention_days_floor" ]; then
                recommended_delete_min_age="$retention_days_floor"
            fi

            # Human-readable action text for the recommendation.
            action_phrase=""
            if [ -n "$recommended_delete_min_age" ]; then
                if [ -n "$delete_min_age_numeric" ] && [ "$recommended_delete_min_age" -lt "$delete_min_age_numeric" ]; then
                    action_phrase="Lower delete.min_age to ~${recommended_delete_min_age}d"
                else
                    action_phrase="Cap delete.min_age at ~${recommended_delete_min_age}d"
                fi
                if [ -n "$retention_days_floor" ]; then
                    action_phrase="${action_phrase} (observed retention ~${retention_days_floor}d)"
                fi
                action_phrase="${action_phrase}; consider whether a tighter cap (e.g., 30d) fits requirements."
            else
                action_phrase="Review ILM delete.min_age for this index; consider more aggressive retention if throughput stays high."
            fi

            policy_clause=""
            if [ -n "$policy" ]; then
                policy_clause=", policy ${policy}"
            fi
            if [ -n "$delete_min_age" ]; then
                policy_clause="${policy_clause} (current delete.min_age ${delete_min_age}d)"
            fi

            # Text line for terminal output plus a structured JSON record.
            recommendation_lines+=(" - ${BOLD}${index}${NC}: ~${growth_24h_gb} GB growth in last 24h, retention ~${retention_days_display} days (created ${creation_date_display:-unknown})${policy_clause}. ${action_phrase}")
            record=$(jq -nc \
                --arg index "$index" \
                --arg growth "$growth_24h_gb" \
                --arg retention "${retention_days:-}" \
                --arg created "${creation_date_display:-}" \
                --arg policy "$policy" \
                --arg delete_age "${delete_min_age:-}" \
                --arg suggested "${recommended_delete_min_age:-}" \
                --arg action "$action_phrase" \
                '{
                    index: $index,
                    growth_gb_last_24h: (if ($growth | length) > 0 then ($growth | tonumber) else null end),
                    retention_days: (if ($retention | length) > 0 then ($retention | tonumber) else null end),
                    creation_date: (if ($created | length) > 0 then $created else null end),
                    ilm_policy: (if ($policy | length) > 0 then $policy else null end),
                    delete_min_age_days: (if ($delete_age | length) > 0 then ($delete_age | tonumber) else null end),
                    suggested_delete_min_age_days: (if ($suggested | length) > 0 then ($suggested | tonumber) else null end),
                    recommendation: (if ($action | length) > 0 then $action else null end)
                }')
            recommendation_records+=("$record")
        done
    fi

    if [ ${#recommendation_records[@]} -gt 0 ]; then
        recommendations_ready=true
        recommendations_json=$(printf '%s\n' "${recommendation_records[@]}" | jq -s '.')
    else
        if [ -n "$recommendations_reason" ]; then
            recommendations_message="$recommendations_reason Unable to retrieve detailed growth data from so-elasticsearch-indices-growth."
        else
            recommendations_message="Unable to retrieve growth data from so-elasticsearch-indices-growth while near the low watermark threshold."
+ fi + fi +fi + +if [ "$JSON_OUTPUT" = true ]; then + jq -n \ + --arg indexed_storage_source "$indexed_storage_source" \ + --arg current_gb "$(bytes_to_gb "$cluster_storage_size")" \ + --arg oldest_index_days "$oldest_index_days" \ + --arg estimated_retention_days "$estimated_retention_days" \ + --arg daily_growth_gb "$(bytes_to_gb "$daily_growth_bytes")" \ + --arg daily_ilm_delete_gb "$daily_ilm_delete_gb" \ + --arg net_growth_gb "$(bytes_to_gb "$net_growth_bytes")" \ + --arg ilm_delete_7d_gb "$ilm_delete_7d_gb" \ + --arg ilm_delete_immediate_gb "$ilm_delete_immediate_gb" \ + --arg ilm_delete_scheduled_7d_gb "$ilm_delete_scheduled_7d_gb" \ + --arg ilm_delete_scheduled_30d_gb "$ilm_delete_scheduled_30d_gb" \ + --arg ilm_delete_30d_gb "$ilm_delete_30d_gb" \ + --arg ilm_window_daily_gb "$ilm_window_daily_gb" \ + --arg ilm_impact_pct "$ilm_impact_pct" \ + --arg ilm_rate_variance_pct "$ilm_rate_variance_pct" \ + --arg growth_window "$history_label" \ + --arg cluster_total_gb "$(bytes_to_gb "$cluster_total")" \ + --arg cluster_used_gb "$(bytes_to_gb "$cluster_used")" \ + --arg cluster_remaining_gb "$(bytes_to_gb "$cluster_remaining")" \ + --arg cluster_low_threshold_gb "$(bytes_to_gb "$cluster_low_threshold")" \ + --arg cluster_high_threshold_gb "$cluster_high_threshold_gb" \ + --arg cluster_over_low_gb "$cluster_over_low_gb" \ + --arg cluster_over_high_gb "$cluster_over_high_gb" \ + --arg shard_usage_percent "$shard_usage_percent" \ + --arg low_watermark "$low" \ + --arg high_watermark "$high" \ + --arg flood_watermark "$flood" \ + --arg days_to_low "${days_to_low:-null}" \ + --arg days_to_low_gross "${days_to_low_gross:-null}" \ + --arg estimated_date "${target_date:-null}" \ + --arg recommendation_message "$recommendations_message" \ + --argjson total_shards "$total_shards" \ + --argjson max_shard_capacity "$max_shard_capacity" \ + --argjson data_node_count "$data_node_count" \ + --argjson max_shards_per_node "$max_shards_per_node" \ + --argjson ilm_indices_7d 
"$ilm_indices_7d" \ + --argjson ilm_indices_immediate "$ilm_indices_immediate" \ + --argjson ilm_indices_scheduled_7d "$ilm_indices_scheduled_7d" \ + --argjson ilm_indices_scheduled_30d "$ilm_indices_scheduled_30d" \ + --argjson ilm_indices_30d "$ilm_indices_30d" \ + --argjson ilm_shards_7d "$ilm_shards_7d" \ + --argjson ilm_shards_30d "$ilm_shards_30d" \ + --argjson ilm_shards_immediate "$ilm_shards_immediate" \ + --argjson ilm_shards_scheduled_7d "$ilm_shards_scheduled_7d" \ + --argjson ilm_shards_scheduled_30d "$ilm_shards_scheduled_30d" \ + --arg daily_shard_creation "$daily_shard_creation" \ + --argjson recommendations "$recommendations_json" \ + --argjson recommendations_triggered "$recommendations_triggered_json" \ + ' { + indexed_storage_gb: ($current_gb | tonumber), + indexed_storage_source: $indexed_storage_source, + oldest_index_days: (if ($oldest_index_days | length) > 0 then ($oldest_index_days | tonumber) else null end), + estimated_retention_days: (if ($estimated_retention_days | length) > 0 then ($estimated_retention_days | tonumber) else null end), + growth: { + daily_growth_gb: ($daily_growth_gb | tonumber), + daily_ilm_delete_gb: (if ($daily_ilm_delete_gb | length) > 0 then ($daily_ilm_delete_gb | tonumber) else null end), + net_growth_gb: (if ($net_growth_gb | length) > 0 then ($net_growth_gb | tonumber) else null end), + daily_shard_creation: (if ($daily_shard_creation | length) > 0 then ($daily_shard_creation | tonumber) else null end), + }, + ilm: { + deleting_now: { + indices: $ilm_indices_immediate, + storage_gb: (if ($ilm_delete_immediate_gb | length) > 0 then ($ilm_delete_immediate_gb | tonumber) else null end), + shards: $ilm_shards_immediate + }, + scheduled_7d: { + indices: $ilm_indices_scheduled_7d, + storage_gb: (if ($ilm_delete_scheduled_7d_gb | length) > 0 then ($ilm_delete_scheduled_7d_gb | tonumber) else null end), + shards: $ilm_shards_scheduled_7d + }, + scheduled_30d: { + indices: $ilm_indices_scheduled_30d, + storage_gb: (if 
($ilm_delete_scheduled_30d_gb | length) > 0 then ($ilm_delete_scheduled_30d_gb | tonumber) else null end), + shards: $ilm_shards_scheduled_30d + }, + indices_to_delete_7d: $ilm_indices_7d, + storage_to_delete_7d_gb: (if ($ilm_delete_7d_gb | length) > 0 then ($ilm_delete_7d_gb | tonumber) else null end), + shards_to_delete_7d: $ilm_shards_7d, + total_30d_indices: $ilm_indices_30d, + total_30d_storage_gb: (if ($ilm_delete_30d_gb | length) > 0 then ($ilm_delete_30d_gb | tonumber) else null end), + total_30d_shards: $ilm_shards_30d, + percent_of_current_data: (if ($ilm_impact_pct | length) > 0 then ($ilm_impact_pct | tonumber) else null end), + windowed_daily_avg_gb: (if ($ilm_window_daily_gb | length) > 0 then ($ilm_window_daily_gb | tonumber) else null end) + }, + cluster: { + total_gb: ($cluster_total_gb | tonumber), + used_gb: ($cluster_used_gb | tonumber), + remaining_before_low_watermark_gb: (if ($cluster_remaining_gb | length) > 0 then ($cluster_remaining_gb | tonumber) else null end), + low_watermark_threshold_gb: (if ($cluster_low_threshold_gb | length) > 0 then ($cluster_low_threshold_gb | tonumber) else null end), + high_watermark_threshold_gb: (if ($cluster_high_threshold_gb | length) > 0 then ($cluster_high_threshold_gb | tonumber) else null end), + over_low_watermark_gb: (if ($cluster_over_low_gb | length) > 0 then ($cluster_over_low_gb | tonumber) else null end), + over_high_watermark_gb: (if ($cluster_over_high_gb | length) > 0 then ($cluster_over_high_gb | tonumber) else null end), + low_watermark_setting: $low_watermark, + high_watermark_setting: $high_watermark, + flood_watermark_setting: $flood_watermark, + shards: { + current: $total_shards, + max_capacity: $max_shard_capacity, + usage_percent: (if ($shard_usage_percent | length) > 0 then ($shard_usage_percent | tonumber) else null end), + data_nodes: $data_node_count, + max_shards_per_node: $max_shards_per_node + } + }, + projection: { + days_to_low_watermark_net: (if $days_to_low == "null" or 
$days_to_low == "stable" then (if $days_to_low == "stable" then $days_to_low else null end) else ($days_to_low | tonumber) end), + days_to_low_watermark_gross: (if $days_to_low_gross == "null" then null else ($days_to_low_gross | tonumber) end), + estimated_breach_date: (if $estimated_date == "null" then null else $estimated_date end) + }, + recommendations: { + triggered: $recommendations_triggered, + message: (if ($recommendation_message | length) > 0 then $recommendation_message else null end), + indices: $recommendations + } + }' +else + log_title "LOG" "Storage Overview" + + indexed_gb_display=$(bytes_to_gb "$cluster_storage_size") + echo -e "${BOLD}Indexed data size:${NC} ${indexed_gb_display} GB (Elasticsearch)" + echo -e "${BOLD}Cluster capacity:${NC} $(bytes_to_gb "$cluster_total") GB total" + echo -e "${BOLD}Cluster used:${NC} $(bytes_to_gb "$cluster_used") GB" + echo -e "${BOLD}Low watermark:${NC} $low ($(bytes_to_gb "$cluster_low_threshold") GB threshold)" + if [ "$cluster_over_low_flag" -eq 1 ]; then + if [ "$cluster_over_high_flag" -eq 1 ]; then + echo -e "${BOLD}Remaining space:${NC} ${REDBOLD}${cluster_over_high_gb} GB${NC} OVER the high watermark" + else + echo -e "${BOLD}Remaining space:${NC} ${YELLOWBOLD}${cluster_over_low_gb} GB${NC} OVER the low watermark" + fi + else + echo -e "${BOLD}Remaining space:${NC} $(bytes_to_gb "$cluster_remaining") GB before low watermark" + fi + + # Display shard capacity information + shard_warning_flag=$(awk -v pct="$shard_usage_percent" 'BEGIN { if (pct + 0 >= 80) print 1; else print 0 }') + if [ "$shard_warning_flag" -eq 1 ]; then + echo -e "${BOLD}Cluster shards:${NC} ${YELLOW}${total_shards} / ${max_shard_capacity} (${shard_usage_percent}%)${NC}" + else + echo -e "${BOLD}Cluster shards:${NC} ${total_shards} / ${max_shard_capacity} (${shard_usage_percent}%)" + fi + + # Display data nodes with roles (only data-related roles) + if [ "$data_node_count" -gt 0 ]; then + echo -e "${BOLD}Cluster data nodes:${NC} ${data_node_count}" + for i in 
"${!data_node_names[@]}"; do + node_name="${data_node_names[$i]}" + node_role="${data_node_roles[$i]}" + expanded_roles=$(expand_node_roles "$node_role") + echo -e " ${node_name}: ${expanded_roles}" + done + fi + + log_title "LOG" "ES Growth" + + echo -e "${BOLD}Daily growth rate:${NC} $(bytes_to_gb "$daily_growth_bytes") GB/day" + + if [ "$daily_ilm_delete_bytes" -gt 0 ]; then + echo -e "${BOLD}ILM deletion rate:${NC} ${daily_ilm_delete_gb} GB/day (scheduled)" + echo -e "${BOLD}Net growth rate:${NC} $(bytes_to_gb "$net_growth_bytes") GB/day" + else + echo -e "${BOLD}ILM deletion rate:${NC} 0.00 GB/day (scheduled)" + echo -e "${BOLD}Net growth rate:${NC} $(bytes_to_gb "$net_growth_bytes") GB/day" + fi + + # Display daily shards + if [ -n "$daily_shard_creation" ] && [ "$(awk -v d="$daily_shard_creation" 'BEGIN { if (d > 0) print 1; else print 0 }')" -eq 1 ]; then + daily_shard_creation_rounded=$(awk -v d="$daily_shard_creation" 'BEGIN { printf "%.0f", d }') + echo -e "${BOLD}Daily shard creation:${NC} ~${daily_shard_creation_rounded} shards/day" + fi + + if [ "$ilm_indices_immediate" -gt 0 ]; then + echo -e "${BOLD}Deleting now:${NC} $ilm_indices_immediate indices (~${ilm_delete_immediate_gb} GB, $ilm_shards_immediate shards)" + fi + if [ "$ilm_indices_7d" -gt 0 ]; then + echo -e "${BOLD}Storage to be freed (7d):${NC} $ilm_indices_7d indices (~${ilm_delete_7d_gb} GB, $ilm_shards_7d shards)" + fi + + log_title "LOG" "Retention Projection" + + if [ -n "$oldest_index_days" ]; then + oldest_days_rounded=$(awk -v d="$oldest_index_days" 'BEGIN { printf "%.0f", d }') + if [ -n "$oldest_index_name" ]; then + echo -e "${BOLD}Oldest index:${NC} ~${oldest_days_rounded} days (${oldest_index_name})" + else + echo -e "${BOLD}Oldest index:${NC} ~${oldest_days_rounded} days (.ds-logs-* only)" + fi + + if [ -n "$estimated_retention_days" ]; then + estimated_days_rounded=$(awk -v d="$estimated_retention_days" 'BEGIN { printf "%.0f", d }') + echo -e "${BOLD}Estimated retention:${NC} 
~${estimated_days_rounded} days (until configured low watermark setting)" + fi + echo + fi + + if [ "$days_to_low" = "stable" ]; then + if [ "$net_growth_bytes" -lt 0 ]; then + shrink_rate_gb=$(bytes_to_gb "${net_growth_bytes#-}") + log_title "OK" "Cluster is shrinking - ILM deletions exceed growth" + echo + echo -e "${BOLD}Storage trend:${NC} Decreasing at ~${shrink_rate_gb} GB/day" + echo -e "${BOLD}Note:${NC} Current ILM policies are reclaiming more space than incoming data consumes." + if [ "$cluster_over_low_bytes" -gt 0 ] 2>/dev/null; then + recovery_days=$(awk -v excess="$cluster_over_low_bytes" -v rate="${net_growth_bytes#-}" 'BEGIN { + if (rate <= 0) { print ""; exit } + printf "%.1f", excess / rate + }') + if [ -n "$recovery_days" ]; then + echo -e "${BOLD}Recovery time:${NC} Estimated ${recovery_days} days to fall below the low watermark if trend continues" + fi + fi + else + log_title "OK" "Cluster is in equilibrium - ILM deletions balance growth" + echo + echo -e "${BOLD}Storage trend:${NC} Stable (net growth ~0 GB/day)" + echo -e "${BOLD}Note:${NC} Current ILM policies are keeping storage steady." + fi + elif [ -z "$days_to_low" ]; then + if [ "$net_growth_bytes" -lt 0 ] && [ "$daily_ilm_delete_bytes" -gt 0 ]; then + shrink_rate_gb=$(bytes_to_gb "${net_growth_bytes#-}") + log_title "OK" "Cluster is shrinking - ILM deletions exceed growth" + echo + echo -e "${BOLD}Storage trend:${NC} Decreasing at ~${shrink_rate_gb} GB/day" + echo -e "${BOLD}Note:${NC} Storage is expected to continue decreasing due to ILM policies." + elif [ "$daily_growth_bytes" -le 0 ]; then + log_title "WARN" "Unable to project: Growth rate is zero or negative" + elif [ "$(echo "$cluster_remaining <= 0" | bc -l 2>/dev/null || awk -v r="$cluster_remaining" 'BEGIN { if (r <= 0) print 1; else print 0 }')" -eq 1 ]; then + log_title "ERROR" "Cluster already at low watermark threshold! Review recommendations below and consider updating ILM." 
+ else + log_title "WARN" "Unable to calculate projection" + fi + else + if (( $(echo "$days_to_low < 7" | bc -l 2>/dev/null || awk -v d="$days_to_low" 'BEGIN { if (d < 7) print 1; else print 0 }') )); then + log_title "ERROR" "Low watermark breach estimated in ~$days_to_low days (${target_date:-N/A})" + elif (( $(echo "$days_to_low < 14" | bc -l 2>/dev/null || awk -v d="$days_to_low" 'BEGIN { if (d < 14) print 1; else print 0 }') )); then + log_title "WARN" "Low watermark breach estimated in ~$days_to_low days (${target_date:-N/A})" + else + log_title "OK" "Low watermark breach estimated in ~$days_to_low days (${target_date:-N/A})" + fi + echo + fi + + if [ "$recommendations_triggered" = true ]; then + log_title "LOG" "Recommendations" + if [ "$recommendations_ready" = true ]; then + echo -e "${BOLD}Action:${NC} Reduce retention on the fastest-growing indices to reduce overall storage usage." + for rec_line in "${recommendation_lines[@]}"; do + echo -e "$rec_line" + done + else + if [ -n "$recommendations_message" ]; then + echo -e "${BOLD}Note:${NC} $recommendations_message" + fi + fi + echo + fi + + if [ "$VERBOSE" = true ]; then + log_title "LOG" "Scheduled Deletions (Detailed)" + + if [ ${#immediate_indices_names[@]} -gt 0 ]; then + echo -e "${BOLD}Deleting Now (in delete phase):${NC}" + echo + total_immediate_mb=0 + for i in "${!immediate_indices_names[@]}"; do + index_name="${immediate_indices_names[$i]}" + size_bytes="${immediate_indices_sizes[$i]}" + size_mb=$(awk -v b="$size_bytes" 'BEGIN { printf "%.2f", b / 1024 / 1024 }') + total_immediate_mb=$(awk -v total="$total_immediate_mb" -v size="$size_mb" 'BEGIN { printf "%.2f", total + size }') + printf " %-60s %10s MB\n" "$index_name" "$size_mb" + done + echo -e "${BOLD}Total:${NC} ${total_immediate_mb} MB (${#immediate_indices_names[@]} indices)" + echo + fi + + if [ ${#scheduled_indices_names[@]} -gt 0 ]; then + echo -e "${BOLD}Scheduled for Deletion (≤7 days):${NC}" + echo + total_scheduled_mb=0 + # Sort 
by days_until deletion + sorted_indices=() + for i in "${!scheduled_indices_names[@]}"; do + sorted_indices+=("${scheduled_indices_days[$i]}|${scheduled_indices_names[$i]}|${scheduled_indices_sizes[$i]}") + done + IFS=$'\n' sorted_indices=($(sort -t'|' -k1 -n <<<"${sorted_indices[*]}")) + unset IFS + + for entry in "${sorted_indices[@]}"; do + IFS='|' read -r days_until index_name size_bytes <<< "$entry" + size_mb=$(awk -v b="$size_bytes" 'BEGIN { printf "%.2f", b / 1024 / 1024 }') + total_scheduled_mb=$(awk -v total="$total_scheduled_mb" -v size="$size_mb" 'BEGIN { printf "%.2f", total + size }') + days_display=$(awk -v d="$days_until" 'BEGIN { printf "%.1f", d }') + printf " %-55s %10s MB (in ~%s days)\n" "$index_name" "$size_mb" "$days_display" + done + echo -e "${BOLD}Total:${NC} ${total_scheduled_mb} MB (${#scheduled_indices_names[@]} indices)" + echo + fi + + if [ ${#immediate_indices_names[@]} -eq 0 ] && [ ${#scheduled_indices_names[@]} -eq 0 ]; then + echo -e "No indices scheduled for deletion within the next 7 days." + echo + fi + fi + echo +fi + +exit 0 \ No newline at end of file