Files
securityonion/salt/elasticsearch/tools/sbin/so-elasticsearch-troubleshoot
2025-07-29 14:15:43 -05:00

196 lines
7.6 KiB
Bash

#!/bin/bash
# Elasticsearch troubleshooting report: checks API reachability and the health
# report, compares node disk usage against the disk watermarks, and lists
# unassigned shards.

# Shared helper library; its contents are not visible from this file.
. /usr/sbin/so-common

# ANSI colour escape sequences. They are stored as literal backslash sequences
# and only become terminal codes when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
BOLD='\033[1;37m'
# Resets colour/attributes back to the terminal default.
NC='\033[0m'
#######################################
# Print a message styled according to its severity level.
# Globals:   BOLD, GREEN, YELLOW, RED, NC (read)
# Arguments: $1 - level: LOG (section banner), OK, WARN or ERROR
#            $2 - message text; backslash escapes in it are expanded by echo -e
# Outputs:   the coloured message on stdout; nothing for an unrecognised level
#######################################
log_title() {
  local level=$1
  local message=$2

  # A quoted case selector copes with empty or multi-word levels, which made
  # the previous unquoted `[ $1 == ... ]` tests print shell errors.
  case "$level" in
    LOG)
      echo -e "\n${BOLD}================ $message ================${NC}\n"
      ;;
    OK)
      echo -e "${GREEN} $message ${NC}"
      ;;
    WARN)
      echo -e "${YELLOW} $message ${NC}"
      ;;
    ERROR)
      echo -e "${RED} $message ${NC}"
      ;;
  esac
}
#######################################
# Summarise the Elasticsearch health report, listing every indicator that is
# not green together with its diagnoses and (up to 10) affected indices.
# Globals:   BOLD, NC (read)
# Arguments: none
# Outputs:   report on stdout via log_title/echo/printf
# Returns:   1 if the report cannot be fetched or parsed
#######################################
health_report() {
  local health_report_output non_green_count
  local indicator_name indicator status symptom display_name
  local diagnosis cause action affected_indices total_indices remaining index

  # The URL is quoted so the '?' is never treated as a glob character.
  if ! health_report_output=$(so-elasticsearch-query '_health_report?format=json' --fail 2>/dev/null); then
    log_title "ERROR" "Failed to retrieve health report from Elasticsearch"
    return 1
  fi

  # If jq cannot parse the response the count is empty; without this check the
  # numeric test below would error out and fall through to the "all green"
  # message, reporting a healthy cluster that was never actually inspected.
  if ! non_green_count=$(jq '[.indicators | to_entries[] | select(.value.status != "green")] | length' <<<"$health_report_output") \
    || [[ ! "$non_green_count" =~ ^[0-9]+$ ]]; then
    log_title "ERROR" "Failed to parse health report from Elasticsearch"
    return 1
  fi

  if [ "$non_green_count" -eq 0 ]; then
    log_title "OK" "All health indicators are green"
    return 0
  fi

  jq -r '.indicators | to_entries[] | select(.value.status != "green") | .key' <<<"$health_report_output" \
    | while IFS= read -r indicator_name; do
      # Pass the name as data (--arg) instead of splicing it into the jq program.
      indicator=$(jq --arg name "$indicator_name" '.indicators[$name]' <<<"$health_report_output")
      status=$(jq -r '.status' <<<"$indicator")
      symptom=$(jq -r '.symptom // "No symptom available"' <<<"$indicator")

      # Reformat indicator name: underscores to spaces, capitalise each word
      # (the \u replacement is a GNU sed extension).
      display_name=$(printf '%s\n' "$indicator_name" | tr '_' ' ' | sed 's/\b\(.\)/\u\1/g')

      if [ "$status" = "yellow" ]; then
        log_title "WARN" "$display_name: $symptom"
      else
        log_title "ERROR" "$display_name: $symptom"
      fi

      # Diagnosis if available
      jq -c '.diagnosis[]? // empty' <<<"$indicator" \
        | while IFS= read -r diagnosis; do
          cause=$(jq -r '.cause // "Unknown"' <<<"$diagnosis")
          action=$(jq -r '.action // "No action specified"' <<<"$diagnosis")
          # %b expands only the colour codes; %s prints the API text verbatim
          # so stray backslashes in it are not interpreted as escapes.
          printf ' %b %s\n\n' "${BOLD}Cause:${NC}" "$cause"
          printf ' %b %s\n\n' "${BOLD}Action:${NC}" "$action"

          # Check for affected indices
          affected_indices=$(jq -r '.affected_resources.indices[]? // empty' <<<"$diagnosis")
          if [ -n "$affected_indices" ]; then
            echo -e " ${BOLD}Affected indices:${NC}"
            total_indices=$(wc -l <<<"$affected_indices")
            head -n 10 <<<"$affected_indices" | while IFS= read -r index; do
              echo " - $index"
            done
            if [ "$total_indices" -gt 10 ]; then
              remaining=$((total_indices - 10))
              echo " ... and $remaining more indices (truncated for readability)"
            fi
          fi
          echo
        done
    done
}
#######################################
# Check that the Elasticsearch API answers and, if so, print the health report.
# Globals:   NC (read)
# Arguments: none
# Outputs:   status messages on stdout; output of so-status when the API is down
# Returns:   the status of health_report when the API is reachable.
#            Exits the whole script with status 1 when it is not, since the
#            checks that follow all depend on the API.
#######################################
elasticsearch_status() {
  log_title "LOG" "Elasticsearch Status"

  if ! so-elasticsearch-query / --fail --output /dev/null; then
    log_title "ERROR" "Elasticsearch API is not accessible"
    # presumably prints the state of the platform services - confirm in so-status
    so-status
    log_title "ERROR" "Make sure Elasticsearch is running. Additionally, check for startup errors in /opt/so/log/elasticsearch/securityonion.log${NC}\n"
    exit 1
  fi

  health_report
}
#######################################
# List indices larger than 1KB, oldest first, with creation time and primary
# store size, to help decide what to trim when a disk watermark is reached.
# Indices starting with .internal, so-detection or so-case are left out.
# Globals:   BOLD, NC (read)
# Arguments: none
# Outputs:   table on stdout
# Returns:   1 if the index list cannot be retrieved
#######################################
indices_by_age() {
  local indices_output creation_date index_name size_bytes size_human created

  log_title "LOG" "Indices by Creation Date - Size > 1KB"
  log_title "WARN" "Since high/flood watermark has been reached consider updating ILM policies.\n"

  if ! indices_output=$(so-elasticsearch-query '_cat/indices?v&s=creation.date:asc&h=creation.date.string,index,status,health,docs.count,pri.store.size&bytes=b&format=json' --fail 2>/dev/null); then
    log_title "ERROR" "Failed to retrieve indices list from Elasticsearch"
    return 1
  fi

  # Header uses the same column widths as the data rows so the two line up.
  printf '%b%-19s %-76s %10s%b\n' "$BOLD" "Creation Date" "Name" "Size" "$NC"
  echo -e "${BOLD}--------------------------------------------------------------------------------------------------------------${NC}"

  # Filter for indices with size > 1KB (1024 bytes), excluding .internal,
  # so-detection* and so-case*.
  #  - A null size (assumed to occur for closed indices) is treated as 0 so
  #    tonumber cannot abort the listing part-way through.
  #  - Fields are joined with a bare '|' so no padding spaces leak into them.
  jq -r '.[]
    | select((."pri.store.size" // "0" | tonumber) > 1024)
    | select(.index | (startswith(".internal") or startswith("so-detection") or startswith("so-case")) | not)
    | "\(."creation.date.string")|\(.index)|\(."pri.store.size")"' <<<"$indices_output" \
    | while IFS='|' read -r creation_date index_name size_bytes; do
      # Convert bytes to GB / MB. awk's %.2f rounds and keeps the leading zero
      # (0.04MB), which `bc` with scale=2 dropped (.04MB).
      if [ "$size_bytes" -gt 1073741824 ]; then
        size_human=$(awk -v b="$size_bytes" 'BEGIN { printf "%.2fGB", b / 1073741824 }' </dev/null)
      else
        size_human=$(awk -v b="$size_bytes" 'BEGIN { printf "%.2fMB", b / 1048576 }' </dev/null)
      fi

      # -u keeps the value in UTC; the output format ends in a literal 'Z', so
      # rendering in the host's local time zone would mislabel the timestamp.
      if ! created=$(date -u -d "$creation_date" '+%Y-%m-%dT%H:%MZ' 2>/dev/null); then
        created="unknown"
      fi

      # Format output with spacing
      printf '%-19s %-76s %10s\n' "$created" "$index_name" "$size_human"
    done
}
#######################################
# Convert a disk watermark setting into a bare percentage number.
# Arguments: $1 - watermark value, e.g. "85%" or the ratio form "0.85"
# Outputs:   the percentage (e.g. 85) on stdout
# Returns:   1 when the value is not a percentage or ratio (e.g. a byte size
#            such as "50gb", or an empty/missing value)
#######################################
_watermark_percent() {
  local value=$1
  local -r percent_re='^([0-9]+(\.[0-9]+)?)%$'
  local -r ratio_re='^(0|1|0?\.[0-9]+|1\.0+)$'

  if [[ "$value" =~ $percent_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  elif [[ "$value" =~ $ratio_re ]]; then
    awk -v ratio="$value" 'BEGIN { printf "%g\n", ratio * 100 }' </dev/null
  else
    return 1
  fi
}

#######################################
# Compare each data node's disk usage with the cluster disk watermarks and,
# when any node is at or above the high/flood watermark, list indices by age.
# Globals:   BOLD, GREEN, YELLOW, RED, NC (read)
# Arguments: none
# Outputs:   report on stdout
# Returns:   1 if settings or node data cannot be retrieved, or if the
#            watermarks are not percentage based; otherwise the status of
#            indices_by_age when it runs, else 0
#######################################
watermark_settings() {
  local watermark_output disk_allocation_output
  local flood high low flood_num high_num
  local node_name disk_used
  local watermark_reached=0
  local -r number_re='^[0-9]+(\.[0-9]+)?$'
  # Explicitly configured settings are reported under "transient"/"persistent"
  # and are then absent from "defaults", so look in precedence order and take
  # the first value found instead of reading "defaults" only.
  local -r watermark_filter='[(.transient, .persistent, .defaults) | .cluster.routing.allocation.disk.watermark[$level] // empty] | .[0] // empty'

  # URLs are quoted: they contain '?', '*' and '&', which the shell would
  # otherwise treat as glob characters / a control operator.
  if ! watermark_output=$(so-elasticsearch-query '_cluster/settings?include_defaults=true&filter_path=*.cluster.routing.allocation.disk.*' --fail 2>/dev/null); then
    log_title "ERROR" "Failed to retrieve watermark settings from Elasticsearch"
    return 1
  fi
  if ! disk_allocation_output=$(so-elasticsearch-query '_cat/nodes?v&h=name,ip,disk.used_percent,disk.avail,disk.total,node.role&format=json' --fail 2>/dev/null); then
    log_title "ERROR" "Failed to retrieve disk allocation data from Elasticsearch"
    return 1
  fi

  flood=$(jq -r --arg level flood_stage "$watermark_filter" <<<"$watermark_output")
  high=$(jq -r --arg level high "$watermark_filter" <<<"$watermark_output")
  low=$(jq -r --arg level low "$watermark_filter" <<<"$watermark_output")

  # Check each nodes disk usage
  log_title "LOG" "Disk Usage Check"
  echo -e "${BOLD}LOW:${GREEN}$low${NC}${BOLD} HIGH:${YELLOW}${high}${NC}${BOLD} FLOOD:${RED}${flood}${NC}\n"

  # Byte-size or missing watermarks cannot be compared against a usage
  # percentage; say so rather than comparing against garbage.
  if ! flood_num=$(_watermark_percent "$flood") || ! high_num=$(_watermark_percent "$high"); then
    log_title "WARN" "Disk watermarks are not percentage based; skipping per-node comparison"
    return 1
  fi

  # Only show data nodes (d=data, h=hot, w=warm, c=cold, f=frozen, s=content).
  # The loop reads from process substitution so it runs in this shell and can
  # set watermark_reached directly; no flag file in /tmp is needed.
  while IFS='|' read -r node_name disk_used; do
    if [[ ! "$disk_used" =~ $number_re ]]; then
      log_title "WARN" "$node_name disk usage is unavailable"
      continue
    fi
    if awk -v used="$disk_used" -v limit="$flood_num" 'BEGIN { exit !(used + 0 >= limit + 0) }' </dev/null; then
      log_title "ERROR" "$node_name is at or above the flood watermark ($flood)! Disk usage: ${disk_used}%"
      watermark_reached=1
    elif awk -v used="$disk_used" -v limit="$high_num" 'BEGIN { exit !(used + 0 >= limit + 0) }' </dev/null; then
      log_title "ERROR" "$node_name is at or above the high watermark ($high)! Disk usage: ${disk_used}%"
      watermark_reached=1
    else
      log_title "OK" "$node_name disk usage: ${disk_used}%"
    fi
  done < <(jq -r '.[] | select(.["node.role"] | test("[dhwcfs]")) | "\(.name)|\(.["disk.used_percent"])"' <<<"$disk_allocation_output")

  # Check if we need to show indices by age
  if ((watermark_reached)); then
    indices_by_age
  fi
}
#######################################
# Report every shard in the UNASSIGNED state together with the reason
# Elasticsearch gives for it.
# Arguments: none
# Outputs:   one WARN line per unassigned replica, one ERROR line per
#            unassigned primary, or a single OK line when none are unassigned
# Returns:   1 if the shard list cannot be retrieved or parsed
#######################################
unassigned_shards() {
  local unassigned_shards_output unassigned_count
  local index shard prirep reason

  # Quoted so '?' and '&' reach the query helper untouched by the shell.
  if ! unassigned_shards_output=$(so-elasticsearch-query '_cat/shards?v&h=index,shard,prirep,state,unassigned.reason,unassigned.details&s=state&format=json' --fail 2>/dev/null); then
    log_title "ERROR" "Failed to retrieve shard data from Elasticsearch"
    return 1
  fi

  log_title "LOG" "Unassigned Shards Check"

  # Check if there are any UNASSIGNED shards. A jq failure leaves the count
  # empty; without this guard the numeric test would error and fall through to
  # "All shards are assigned", a false all-clear.
  if ! unassigned_count=$(jq '[.[] | select(.state == "UNASSIGNED")] | length' <<<"$unassigned_shards_output") \
    || [[ ! "$unassigned_count" =~ ^[0-9]+$ ]]; then
    log_title "ERROR" "Failed to parse shard data from Elasticsearch"
    return 1
  fi

  if [ "$unassigned_count" -eq 0 ]; then
    log_title "OK" "All shards are assigned"
    return 0
  fi

  jq -r '.[] | select(.state == "UNASSIGNED") | "\(.index)|\(.shard)|\(.prirep)|\(."unassigned.reason")"' <<<"$unassigned_shards_output" \
    | while IFS='|' read -r index shard prirep reason; do
      # prirep is "p" for a primary shard and "r" for a replica.
      case "$prirep" in
        r) log_title "WARN" "Replica shard for index $index is unassigned. Reason: $reason" ;;
        p) log_title "ERROR" "Primary shard for index $index is unassigned. Reason: $reason" ;;
      esac
    done
}
# Run the checks in order: API reachability and health first (this step ends
# the script if the API is down), then disk watermarks, then unassigned shards.
# The return status is that of the last check run.
main() {
  local check
  for check in elasticsearch_status watermark_settings unassigned_shards; do
    "$check"
  done
}
# Script entry point: run all checks (no arguments are passed through).
main