From 5c63111002862a581e429efd3035ac55b37c5673 Mon Sep 17 00:00:00 2001 From: Josh Patterson Date: Fri, 16 Jan 2026 16:42:24 -0500 Subject: [PATCH] add timing to scripts to allow for debugging delays --- .../so-elasticsearch-ilm-policy-load | 116 +++++++++++++++ .../so-elasticsearch-templates-load | 138 ++++++++++++++++++ 2 files changed, 254 insertions(+) diff --git a/salt/elasticsearch/tools/sbin_jinja/so-elasticsearch-ilm-policy-load b/salt/elasticsearch/tools/sbin_jinja/so-elasticsearch-ilm-policy-load index 04a7a8ab0..13123020c 100755 --- a/salt/elasticsearch/tools/sbin_jinja/so-elasticsearch-ilm-policy-load +++ b/salt/elasticsearch/tools/sbin_jinja/so-elasticsearch-ilm-policy-load @@ -4,32 +4,148 @@ # https://securityonion.net/license; you may not use this file except in compliance with the # Elastic License 2.0. +# Logging configuration +LOG_FILE="/opt/so/log/elasticsearch/so-elasticsearch-ilm-policy-load.log" +LOG_DIR=$(dirname "$LOG_FILE") + +# Ensure log directory exists +if [[ ! -d "$LOG_DIR" ]]; then + mkdir -p "$LOG_DIR" +fi + +# Timing variables +SCRIPT_START_TIME=$(date +%s%3N) +LAST_CHECKPOINT_TIME=$SCRIPT_START_TIME + +# Get elapsed time in milliseconds since script start +get_elapsed_ms() { + local now=$(date +%s%3N) + echo $((now - SCRIPT_START_TIME)) +} + +# Get time since last checkpoint and update checkpoint +get_checkpoint_ms() { + local now=$(date +%s%3N) + local elapsed=$((now - LAST_CHECKPOINT_TIME)) + LAST_CHECKPOINT_TIME=$now + echo $elapsed +} + +# Logging function with timing +log() { + local level=$1 + shift + local message="$*" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + local elapsed_ms=$(get_elapsed_ms) + local elapsed_sec=$(echo "scale=3; $elapsed_ms / 1000" | bc) + echo "[$timestamp] [+${elapsed_sec}s] [$level] $message" | tee -a "$LOG_FILE" +} + +log_info() { + log "INFO" "$@" +} + +log_warn() { + log "WARN" "$@" +} + +log_error() { + log "ERROR" "$@" +} + +# Log with checkpoint timing (shows time since last checkpoint) +log_timing() { + local checkpoint_ms=$(get_checkpoint_ms) + local checkpoint_sec=$(echo "scale=3; $checkpoint_ms / 1000" | bc) + local elapsed_ms=$(get_elapsed_ms) + local elapsed_sec=$(echo "scale=3; $elapsed_ms / 1000" | bc) + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[$timestamp] [+${elapsed_sec}s] [TIMING] $* (took ${checkpoint_sec}s)" | tee -a "$LOG_FILE" +} + +log_info "========== Starting Elasticsearch ILM policy load ==========" + +LAST_CHECKPOINT_TIME=$(date +%s%3N) . /usr/sbin/so-common +log_timing "Sourced so-common" {%- from 'elasticsearch/template.map.jinja' import ES_INDEX_SETTINGS %} +policy_count=0 +total_policy_time_ms=0 + {%- for index, settings in ES_INDEX_SETTINGS.items() %} {%- if settings.policy is defined %} {%- if index == 'so-logs-detections.alerts' %} echo echo "Setting up so-logs-detections.alerts-so policy..." + log_info "Setting up so-logs-detections.alerts-so policy..." + LAST_CHECKPOINT_TIME=$(date +%s%3N) + policy_start=$(date +%s%3N) curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-so" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }' + policy_end=$(date +%s%3N) + policy_duration=$((policy_end - policy_start)) + total_policy_time_ms=$((total_policy_time_ms + policy_duration)) + policy_count=$((policy_count + 1)) + log_timing "Completed so-logs-detections.alerts-so policy (${policy_duration}ms)" echo {%- elif index == 'so-logs-soc' %} echo echo "Setting up so-soc-logs policy..." + log_info "Setting up so-soc-logs policy..." + LAST_CHECKPOINT_TIME=$(date +%s%3N) + policy_start=$(date +%s%3N) curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/so-soc-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }' + policy_end=$(date +%s%3N) + policy_duration=$((policy_end - policy_start)) + total_policy_time_ms=$((total_policy_time_ms + policy_duration)) + policy_count=$((policy_count + 1)) + log_timing "Completed so-soc-logs policy (${policy_duration}ms)" echo echo echo "Setting up {{ index }}-logs policy..." + log_info "Setting up {{ index }}-logs policy..." + LAST_CHECKPOINT_TIME=$(date +%s%3N) + policy_start=$(date +%s%3N) curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }' + policy_end=$(date +%s%3N) + policy_duration=$((policy_end - policy_start)) + total_policy_time_ms=$((total_policy_time_ms + policy_duration)) + policy_count=$((policy_count + 1)) + log_timing "Completed {{ index }}-logs policy (${policy_duration}ms)" echo {%- else %} echo echo "Setting up {{ index }}-logs policy..." + log_info "Setting up {{ index }}-logs policy..." + LAST_CHECKPOINT_TIME=$(date +%s%3N) + policy_start=$(date +%s%3N) curl -K /opt/so/conf/elasticsearch/curl.config -b "sid=$SESSIONCOOKIE" -s -k -L -X PUT "https://localhost:9200/_ilm/policy/{{ index }}-logs" -H 'Content-Type: application/json' -d'{ "policy": {{ settings.policy | tojson(true) }} }' + policy_end=$(date +%s%3N) + policy_duration=$((policy_end - policy_start)) + total_policy_time_ms=$((total_policy_time_ms + policy_duration)) + policy_count=$((policy_count + 1)) + log_timing "Completed {{ index }}-logs policy (${policy_duration}ms)" echo {%- endif %} {%- endif %} {%- endfor %} echo + +# Final timing summary +total_elapsed_ms=$(get_elapsed_ms) +total_elapsed_sec=$(echo "scale=3; $total_elapsed_ms / 1000" | bc) +avg_policy_ms=0 +if [[ $policy_count -gt 0 ]]; then + avg_policy_ms=$((total_policy_time_ms / policy_count)) +fi + +log_info "========== TIMING SUMMARY ==========" +log_info "Total policies processed: $policy_count" +log_info "Total policy load time: ${total_policy_time_ms}ms" +log_info "Average time per policy: ${avg_policy_ms}ms" +log_info "Total script time: ${total_elapsed_sec}s" +log_info "====================================" + +log_info "========== Elasticsearch ILM policy load complete (${total_elapsed_sec}s) ==========" diff --git a/salt/elasticsearch/tools/sbin_jinja/so-elasticsearch-templates-load b/salt/elasticsearch/tools/sbin_jinja/so-elasticsearch-templates-load index 4ac1b4d5f..e274dbc90 100755 --- a/salt/elasticsearch/tools/sbin_jinja/so-elasticsearch-templates-load +++ b/salt/elasticsearch/tools/sbin_jinja/so-elasticsearch-templates-load @@ -6,20 +6,84 @@ {%- import_yaml 'elasticfleet/defaults.yaml' as ELASTICFLEETDEFAULTS %} {% from 'vars/globals.map.jinja' import GLOBALS %} +# Logging configuration +LOG_FILE="/opt/so/log/elasticsearch/so-elasticsearch-templates-load.log" +LOG_DIR=$(dirname "$LOG_FILE") + +# Ensure log directory exists +if [[ ! -d "$LOG_DIR" ]]; then + mkdir -p "$LOG_DIR" +fi + +# Timing variables +SCRIPT_START_TIME=$(date +%s%3N) +LAST_CHECKPOINT_TIME=$SCRIPT_START_TIME + +# Get elapsed time in milliseconds since script start +get_elapsed_ms() { + local now=$(date +%s%3N) + echo $((now - SCRIPT_START_TIME)) +} + +# Get time since last checkpoint and update checkpoint +get_checkpoint_ms() { + local now=$(date +%s%3N) + local elapsed=$((now - LAST_CHECKPOINT_TIME)) + LAST_CHECKPOINT_TIME=$now + echo $elapsed +} + +# Logging function with timing +log() { + local level=$1 + shift + local message="$*" + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + local elapsed_ms=$(get_elapsed_ms) + local elapsed_sec=$(echo "scale=3; $elapsed_ms / 1000" | bc) + echo "[$timestamp] [+${elapsed_sec}s] [$level] $message" | tee -a "$LOG_FILE" +} + +log_info() { + log "INFO" "$@" +} + +log_warn() { + log "WARN" "$@" +} + +log_error() { + log "ERROR" "$@" +} + +# Log with checkpoint timing (shows time since last checkpoint) +log_timing() { + local checkpoint_ms=$(get_checkpoint_ms) + local checkpoint_sec=$(echo "scale=3; $checkpoint_ms / 1000" | bc) + local elapsed_ms=$(get_elapsed_ms) + local elapsed_sec=$(echo "scale=3; $elapsed_ms / 1000" | bc) + local timestamp=$(date '+%Y-%m-%d %H:%M:%S') + echo "[$timestamp] [+${elapsed_sec}s] [TIMING] $* (took ${checkpoint_sec}s)" | tee -a "$LOG_FILE" +} + STATE_FILE_INITIAL=/opt/so/state/estemplates_initial_load_attempt.txt STATE_FILE_SUCCESS=/opt/so/state/estemplates.txt +log_info "========== Starting Elasticsearch templates load ==========" + if [[ -f $STATE_FILE_INITIAL ]]; then # The initial template load has already run. As this is a subsequent load, all dependencies should # already be satisified. Therefore, immediately exit/abort this script upon any template load failure # since this is an unrecoverable failure. should_exit_on_failure=1 + log_info "State file $STATE_FILE_INITIAL exists - this is a subsequent template load" else # This is the initial template load, and there likely are some components not yet setup in Elasticsearch. # Therefore load as many templates as possible at this time and if an error occurs proceed to the next # template. But if at least one template fails to load do not mark the templates as having been loaded. # This will allow the next load to resume the load of the templates that failed to load initially. should_exit_on_failure=0 + log_info "This is the initial template load" echo "This is the initial template load" fi @@ -27,30 +91,51 @@ fi pgrep soup > /dev/null && should_exit_on_failure=0 load_failures=0 +template_count=0 +total_template_time_ms=0 load_template() { uri=$1 file=$2 + + local start_time=$(date +%s%3N) echo "Loading template file $i" if ! retry 3 1 "so-elasticsearch-query $uri -d@$file -XPUT" "{\"acknowledged\":true}"; then + local end_time=$(date +%s%3N) + local duration=$((end_time - start_time)) + total_template_time_ms=$((total_template_time_ms + duration)) + template_count=$((template_count + 1)) if [[ $should_exit_on_failure -eq 1 ]]; then + log_error "FAILED template $file (${duration}ms)" fail "Could not load template file: $file" else load_failures=$((load_failures+1)) + log_warn "FAILED template $file (${duration}ms) - failure count: $load_failures" echo "Incremented load failure counter: $load_failures" fi + else + local end_time=$(date +%s%3N) + local duration=$((end_time - start_time)) + total_template_time_ms=$((total_template_time_ms + duration)) + template_count=$((template_count + 1)) + log_info "OK template $file (${duration}ms)" fi } if [ ! -f $STATE_FILE_SUCCESS ]; then echo "State file $STATE_FILE_SUCCESS not found. Running so-elasticsearch-templates-load." + log_info "State file $STATE_FILE_SUCCESS not found. Running so-elasticsearch-templates-load." + LAST_CHECKPOINT_TIME=$(date +%s%3N) . /usr/sbin/so-common + log_timing "Sourced so-common" {% if GLOBALS.role != 'so-heavynode' %} if [ -f /usr/sbin/so-elastic-fleet-common ]; then + LAST_CHECKPOINT_TIME=$(date +%s%3N) . /usr/sbin/so-elastic-fleet-common + log_timing "Sourced so-elastic-fleet-common" fi {% endif %} @@ -68,14 +153,20 @@ if [ ! -f $STATE_FILE_SUCCESS ]; then if [ -f "$file" ]; then # Wait for ElasticSearch to initialize echo -n "Waiting for ElasticSearch..." + log_info "Waiting for ElasticSearch to initialize..." + LAST_CHECKPOINT_TIME=$(date +%s%3N) retry 240 1 "so-elasticsearch-query / -k --output /dev/null --silent --head --fail" || fail "Connection attempt timed out. Unable to connect to ElasticSearch. \nPlease try: \n -checking log(s) in /var/log/elasticsearch/\n -running 'sudo docker ps' \n -running 'sudo so-elastic-restart'" + log_timing "ElasticSearch connection established" {% if GLOBALS.role != 'so-heavynode' %} TEMPLATE="logs-endpoint.alerts@package" + LAST_CHECKPOINT_TIME=$(date +%s%3N) INSTALLED=$(so-elasticsearch-query _component_template/$TEMPLATE | jq -r .component_templates[0].name) + log_timing "Checked for installed package template $TEMPLATE" if [ "$INSTALLED" != "$TEMPLATE" ]; then echo echo "Packages not yet installed." echo + log_warn "Packages not yet installed - exiting" exit 0 fi {% endif %} @@ -84,40 +175,56 @@ if [ ! -f $STATE_FILE_SUCCESS ]; then cd ${ELASTICSEARCH_TEMPLATES}/component/ecs + log_info "===== Loading ECS component templates from $(pwd) =====" echo "Loading ECS component templates..." + LAST_CHECKPOINT_TIME=$(date +%s%3N) + ecs_count=0 for i in *; do TEMPLATE=$(echo $i | cut -d '.' -f1) load_template "_component_template/${TEMPLATE}-mappings" "$i" + ecs_count=$((ecs_count + 1)) done + log_timing "Completed $ecs_count ECS component templates" echo cd ${ELASTICSEARCH_TEMPLATES}/component/elastic-agent + log_info "===== Loading Elastic Agent component templates from $(pwd) =====" echo "Loading Elastic Agent component templates..." + LAST_CHECKPOINT_TIME=$(date +%s%3N) {% if GLOBALS.role == 'so-heavynode' %} component_pattern="so-*" {% else %} component_pattern="*" {% endif %} + agent_count=0 for i in $component_pattern; do TEMPLATE=${i::-5} load_template "_component_template/$TEMPLATE" "$i" + agent_count=$((agent_count + 1)) done + log_timing "Completed $agent_count Elastic Agent component templates" echo # Load SO-specific component templates cd ${ELASTICSEARCH_TEMPLATES}/component/so + log_info "===== Loading Security Onion component templates from $(pwd) =====" echo "Loading Security Onion component templates..." + LAST_CHECKPOINT_TIME=$(date +%s%3N) + so_count=0 for i in *; do TEMPLATE=$(echo $i | cut -d '.' -f1); load_template "_component_template/$TEMPLATE" "$i" + so_count=$((so_count + 1)) done + log_timing "Completed $so_count Security Onion component templates" echo # Load SO index templates cd ${ELASTICSEARCH_TEMPLATES}/index + log_info "===== Loading Security Onion index templates from $(pwd) =====" echo "Loading Security Onion index templates..." shopt -s extglob {% if GLOBALS.role == 'so-heavynode' %} @@ -131,35 +238,66 @@ if [ ! -f $STATE_FILE_SUCCESS ]; then # In this situation, the script will treat the skipped template as a temporary failure # and allow the templates to be loaded again on the next run or highstate, whichever # comes first. + LAST_CHECKPOINT_TIME=$(date +%s%3N) COMPONENT_LIST=$(so-elasticsearch-component-templates-list) + log_timing "Retrieved component templates list" + LAST_CHECKPOINT_TIME=$(date +%s%3N) + index_count=0 + skipped_count=0 for i in $pattern; do TEMPLATE=${i::-14} COMPONENT_PATTERN=${TEMPLATE:3} MATCH=$(echo "$TEMPLATE" | grep -E "^so-logs-|^so-metrics" | grep -vE "detections|osquery") if [[ -n "$MATCH" && ! "$COMPONENT_LIST" =~ "$COMPONENT_PATTERN" && ! "$COMPONENT_PATTERN" =~ \.generic|logs-winlog\.winlog ]]; then load_failures=$((load_failures+1)) + skipped_count=$((skipped_count + 1)) + log_warn "SKIPPED - Component template does not exist for $COMPONENT_PATTERN" echo "Component template does not exist for $COMPONENT_PATTERN. The index template will not be loaded. Load failures: $load_failures" else load_template "_index_template/$TEMPLATE" "$i" + index_count=$((index_count + 1)) fi done + log_timing "Completed $index_count index templates ($skipped_count skipped)" else {% if GLOBALS.role == 'so-heavynode' %} echo "Common template does not exist. Exiting..." + log_warn "Common template does not exist. Exiting..." {% else %} echo "Elastic Fleet not configured. Exiting..." + log_warn "Elastic Fleet not configured. Exiting..." {% endif %} exit 0 fi cd - >/dev/null + # Final timing summary + total_elapsed_ms=$(get_elapsed_ms) + total_elapsed_sec=$(echo "scale=3; $total_elapsed_ms / 1000" | bc) + avg_template_ms=0 + if [[ $template_count -gt 0 ]]; then + avg_template_ms=$((total_template_time_ms / template_count)) + fi + + log_info "========== TIMING SUMMARY ==========" + log_info "Total templates processed: $template_count" + log_info "Total template load time: ${total_template_time_ms}ms" + log_info "Average time per template: ${avg_template_ms}ms" + log_info "Total script time: ${total_elapsed_sec}s" + log_info "====================================" + if [[ $load_failures -eq 0 ]]; then echo "All templates loaded successfully" + log_info "All templates loaded successfully" touch $STATE_FILE_SUCCESS else echo "Encountered $load_failures templates that were unable to load, likely due to missing dependencies that will be available later; will retry on next highstate" + log_warn "Encountered $load_failures templates that were unable to load" fi else echo "Templates already loaded" + log_info "Templates already loaded (state file $STATE_FILE_SUCCESS exists)" fi + +log_info "========== Elasticsearch templates load complete ($(echo "scale=3; $(get_elapsed_ms) / 1000" | bc)s) =========="