#!/bin/bash # # Copyright Security Onion Solutions LLC and/or licensed to Security Onion Solutions LLC under one # or more contributor license agreements. Licensed under the Elastic License 2.0 as shown at # https://securityonion.net/license; you may not use this file except in compliance with the # Elastic License 2.0. . /usr/sbin/so-common RECENT_LOG_LINES=200 EXCLUDE_STARTUP_ERRORS=N EXCLUDE_FALSE_POSITIVE_ERRORS=N EXCLUDE_KNOWN_ERRORS=N while [[ $# -gt 0 ]]; do case $1 in --exclude-connection-errors) EXCLUDE_STARTUP_ERRORS=Y ;; --exclude-false-positives) EXCLUDE_FALSE_POSITIVE_ERRORS=Y ;; --exclude-known-errors) EXCLUDE_KNOWN_ERRORS=Y ;; --unknown) EXCLUDE_STARTUP_ERRORS=Y EXCLUDE_FALSE_POSITIVE_ERRORS=Y EXCLUDE_KNOWN_ERRORS=Y ;; --recent-log-lines) shift RECENT_LOG_LINES=$1 ;; *) echo "Usage: $0 [options]" echo "" echo "where options are:" echo " --recent-log-lines N looks at the most recent N log lines per file or container; defaults to 200" echo " --exclude-connection-errors exclude errors caused by a recent server or container restart" echo " --exclude-false-positives exclude logs that are known false positives" echo " --exclude-known-errors exclude errors that are known and non-critical issues" echo " --unknown exclude everything mentioned above; only show unknown errors" echo "" echo "A non-zero return value indicates errors were found" exit 1 ;; esac shift done echo "Security Onion Log Check - $(date)" echo "-------------------------------------------" echo "" echo "- RECENT_LOG_LINES: $RECENT_LOG_LINES" echo "- EXCLUDE_STARTUP_ERRORS: $EXCLUDE_STARTUP_ERRORS" echo "- EXCLUDE_FALSE_POSITIVE_ERRORS: $EXCLUDE_FALSE_POSITIVE_ERRORS" echo "- EXCLUDE_KNOWN_ERRORS: $EXCLUDE_KNOWN_ERRORS" echo "" function status() { header "$1" } function exclude_container() { name=$1 exclude_id=$(docker ps | grep "$name" | awk '{print $1}') if [[ -n "$exclude_id" ]]; then CONTAINER_IDS=$(echo $CONTAINER_IDS | sed -e "s/$exclude_id//g") return $? fi return $? } function exclude_log() { name=$1 cat /tmp/log_check_files | grep -v $name > /tmp/log_check_files.new mv /tmp/log_check_files.new /tmp/log_check_files } function check_for_errors() { if cat /tmp/log_check | grep -i error | grep -vEi "$EXCLUDED_ERRORS"; then RESULT=1 fi } EXCLUDED_ERRORS="__LOG_CHECK_PLACEHOLDER_EXCLUSION__" if [[ $EXCLUDE_STARTUP_ERRORS == 'Y' ]]; then EXCLUDED_ERRORS="$EXCLUDED_ERRORS|database is locked" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|econnreset" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|unreachable" # server not yet ready (logstash waiting on elastic) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|shutdown process" # server not yet ready (logstash waiting on elastic) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|contain valid certificates" # server not yet ready (logstash waiting on elastic) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|failedaction" # server not yet ready (logstash waiting on elastic) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|block in start_workers" # server not yet ready (logstash waiting on elastic) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|block in buffer_initialize" # server not yet ready (logstash waiting on elastic) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|no route to host" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|not running" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|unavailable" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|request.py" # server not yet ready (python stack output) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|httperror" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|servfail" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|connect" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|missing shards" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|failed to send metrics" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|broken pipe" # server not yet ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|status: 502" # server not yet ready (nginx waiting on upstream) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|timeout exceeded" # server not yet ready (telegraf waiting on elasticsearch) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|influxsize kbytes" # server not yet ready (telegraf waiting on influx) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|expected field at" # server not yet ready (telegraf waiting on health data) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|connection timed out" # server not yet ready (telegraf plugin unable to connect) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|command timed out" # server not yet ready (telegraf plugin waiting for script to finish) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|cached the public key" # server not yet ready (salt minion waiting on key acceptance) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|no ingest nodes" # server not yet ready (logstash waiting on elastic) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|failed to poll" # server not yet ready (sensoroni waiting on soc) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|minions returned with non" # server not yet ready (salt waiting on minions) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|so_long_term" # server not yet ready (influxdb not yet setup) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|search_phase_execution_exception" # server not yet ready (elastalert running searches before ES is ready) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|timeout retrieving docker" # Telegraf unable to reach Docker engine, rare EXCLUDED_ERRORS="$EXCLUDED_ERRORS|timeout retrieving container" # Telegraf unable to reach Docker engine, rare EXCLUDED_ERRORS="$EXCLUDED_ERRORS|error while communicating" # Elasticsearch MS -> HN "sensor" temporarily unavailable EXCLUDED_ERRORS="$EXCLUDED_ERRORS|tls handshake error" # Docker registry container when new node comes onlines EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Unable to get license information" # Logstash trying to contact ES before it's ready EXCLUDED_ERRORS="$EXCLUDED_ERRORS|process already finished" # Telegraf script finished just as the auto kill timeout kicked in EXCLUDED_ERRORS="$EXCLUDED_ERRORS|No shard available" # Typical error when making a query before ES has finished loading all indices EXCLUDED_ERRORS="$EXCLUDED_ERRORS|responded with status-code 503" # telegraf getting 503 from ES during startup EXCLUDED_ERRORS="$EXCLUDED_ERRORS|process_cluster_event_timeout_exception" # logstash waiting for elasticsearch to start EXCLUDED_ERRORS="$EXCLUDED_ERRORS|not configured for GeoIP" # SO does not bundle the maxminddb with Zeek fi if [[ $EXCLUDE_FALSE_POSITIVE_ERRORS == 'Y' ]]; then EXCLUDED_ERRORS="$EXCLUDED_ERRORS|elastalert_status_error" # false positive EXCLUDED_ERRORS="$EXCLUDED_ERRORS|elastalert_error" # false positive EXCLUDED_ERRORS="$EXCLUDED_ERRORS|error: '0'" # false positive EXCLUDED_ERRORS="$EXCLUDED_ERRORS|errors_index" # false positive EXCLUDED_ERRORS="$EXCLUDED_ERRORS|noerror" # false positive EXCLUDED_ERRORS="$EXCLUDED_ERRORS|outofmemoryerror" # false positive (elastic command line) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|adding component template" # false positive (elastic security) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|adding index template" # false positive (elastic security) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|fs_errors" # false positive (suricata stats) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|error-template" # false positive (elastic templates) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|deprecated" # false positive (playbook) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|windows" # false positive (playbook) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|could cause errors" # false positive (playbook) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|_error.yml" # false positive (playbook) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|id.orig_h" # false positive (zeek test data) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|emerging-all.rules" # false positive (error in rulename) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|invalid query input" # false positive (Invalid user input in hunt query) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|example" # false positive (example test data) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|status 200" # false positive (request successful, contained error string in content) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|app_layer.error" # false positive (suricata 7) in stats.log e.g. app_layer.error.imap.parser | Total | 0 EXCLUDED_ERRORS="$EXCLUDED_ERRORS|is not an ip string literal" # false positive (Open Canary logging out blank IP addresses) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|syncing rule" # false positive (rule sync log line includes rule name which can contain 'error') EXCLUDED_ERRORS="$EXCLUDED_ERRORS|request_unauthorized" # false positive (login failures to Hydra result in an 'error' log) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|adding index lifecycle policy" # false positive (elasticsearch policy names contain 'error') EXCLUDED_ERRORS="$EXCLUDED_ERRORS|adding ingest pipeline" # false positive (elasticsearch ingest pipeline names contain 'error') EXCLUDED_ERRORS="$EXCLUDED_ERRORS|updating index template" # false positive (elasticsearch index or template names contain 'error') EXCLUDED_ERRORS="$EXCLUDED_ERRORS|updating component template" # false positive (elasticsearch index or template names contain 'error') fi if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then EXCLUDED_ERRORS="$EXCLUDED_ERRORS|eof" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|raise" # redis/python generic stack line, rely on other lines for actual error EXCLUDED_ERRORS="$EXCLUDED_ERRORS|fail\\(error\\)" # redis/python generic stack line, rely on other lines for actual error EXCLUDED_ERRORS="$EXCLUDED_ERRORS|urlerror" # idstools connection timeout EXCLUDED_ERRORS="$EXCLUDED_ERRORS|timeouterror" # idstools connection timeout EXCLUDED_ERRORS="$EXCLUDED_ERRORS|_ml" # Elastic ML errors EXCLUDED_ERRORS="$EXCLUDED_ERRORS|context canceled" # elastic agent during shutdown EXCLUDED_ERRORS="$EXCLUDED_ERRORS|geoip databases update" # airgap can't update GeoIP DB EXCLUDED_ERRORS="$EXCLUDED_ERRORS|filenotfounderror" # bug in 2.4.10 filecheck salt state caused duplicate cronjobs EXCLUDED_ERRORS="$EXCLUDED_ERRORS|salt-minion-check" # bug in early 2.4 place Jinja script in non-jinja salt dir causing cron output errors EXCLUDED_ERRORS="$EXCLUDED_ERRORS|monitoring.metrics" # known issue with elastic agent casting the field incorrectly if an integer value shows up before a float EXCLUDED_ERRORS="$EXCLUDED_ERRORS|repodownload.conf" # known issue with reposync on pre-2.4.20 EXCLUDED_ERRORS="$EXCLUDED_ERRORS|missing versions record" # stenographer corrupt index EXCLUDED_ERRORS="$EXCLUDED_ERRORS|soc.field." # known ingest type collisions issue with earlier versions of SO EXCLUDED_ERRORS="$EXCLUDED_ERRORS|error parsing signature" # Malformed Suricata rule, from upstream provider EXCLUDED_ERRORS="$EXCLUDED_ERRORS|sticky buffer has no matches" # Non-critical Suricata error EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Unable to determine destination index stats" # Elastic transform temporary error EXCLUDED_ERRORS="$EXCLUDED_ERRORS|cannot join on an empty table" # InfluxDB flux query, import nodes EXCLUDED_ERRORS="$EXCLUDED_ERRORS|exhausting result iterator" # InfluxDB flux query mismatched table results (temporary data issue) EXCLUDED_ERRORS="$EXCLUDED_ERRORS|failed to finish run" # InfluxDB rare error, self-recoverable EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Unable to gather disk name" # InfluxDB known error, can't read disks because the container doesn't have them mounted EXCLUDED_ERRORS="$EXCLUDED_ERRORS|iteration" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|communication packets" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|use of closed" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|bookkeeper" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|noindices" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|failed to start transient scope" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|so-user.lock exists" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|systemd-run" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|retcode: 1" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|telemetry-task" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|redisqueue" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|fleet_detail_query" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|num errors=0" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|provisioning/alerting" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|provisioning/notifiers" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|provisoning/plugins" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|active-responses.log" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|scanentropy" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|integration policy" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|blob unknown" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|token required" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|zeekcaptureloss" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|unable to create detection" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|error installing new prebuilt rules" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|parent.error" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|req.LocalMeta.host.ip" # known issue in GH EXCLUDED_ERRORS="$EXCLUDED_ERRORS|sendmail" # zeek EXCLUDED_ERRORS="$EXCLUDED_ERRORS|stats.log" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Unknown column" # Elastalert errors from running EQL queries EXCLUDED_ERRORS="$EXCLUDED_ERRORS|parsing_exception" # Elastalert EQL parsing issue. Temp. EXCLUDED_ERRORS="$EXCLUDED_ERRORS|context deadline exceeded" EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Error running query:" # Specific issues with detection rules EXCLUDED_ERRORS="$EXCLUDED_ERRORS|detect-parse" # Suricata encountering a malformed rule EXCLUDED_ERRORS="$EXCLUDED_ERRORS|integrity check failed" # Detections: Exclude false positive due to automated testing EXCLUDED_ERRORS="$EXCLUDED_ERRORS|syncErrors" # Detections: Not an actual error EXCLUDED_ERRORS="$EXCLUDED_ERRORS|Initialized license manager" # SOC log: before fields.status was changed to fields.licenseStatus EXCLUDED_ERRORS="$EXCLUDED_ERRORS|from NIC checksum offloading" # zeek reporter.log EXCLUDED_ERRORS="$EXCLUDED_ERRORS|marked for removal" # docker container getting recycled fi RESULT=0 # Check Security Onion container stdout/stderr logs CONTAINER_IDS=$(docker ps -q) exclude_container so-kibana # kibana error logs are too verbose with large varieties of errors most of which are temporary exclude_container so-idstools # ignore due to known issues and noisy logging exclude_container so-playbook # Playbook is removed as of 2.4.70, disregard output in stopped containers exclude_container so-mysql # MySQL is removed as of 2.4.70, disregard output in stopped containers exclude_container so-soctopus # Soctopus is removed as of 2.4.70, disregard output in stopped containers for container_id in $CONTAINER_IDS; do container_name=$(docker ps --format json | jq ". | select(.ID==\"$container_id\")|.Names") status "Checking container $container_name" docker logs -n $RECENT_LOG_LINES $container_id > /tmp/log_check 2>&1 check_for_errors done # Check Security Onion related log files find /opt/so/log/ /nsm -name \*.log > /tmp/log_check_files if [[ -f /var/log/cron ]]; then echo "/var/log/cron" >> /tmp/log_check_files fi exclude_log "kibana.log" # kibana error logs are too verbose with large varieties of errors most of which are temporary exclude_log "spool" # disregard zeek analyze logs as this is data specific exclude_log "import" # disregard imported test data the contains error strings exclude_log "update.log" # ignore playbook updates due to several known issues exclude_log "cron-cluster-delete.log" # ignore since Curator has been removed exclude_log "cron-close.log" # ignore since Curator has been removed exclude_log "curator.log" # ignore since Curator has been removed exclude_log "playbook.log" # Playbook is removed as of 2.4.70, logs may still be on disk exclude_log "mysqld.log" # MySQL is removed as of 2.4.70, logs may still be on disk exclude_log "soctopus.log" # Soctopus is removed as of 2.4.70, logs may still be on disk exclude_log "agentstatus.log" # ignore this log since it tracks agents in error state exclude_log "detections_runtime-status_yara.log" # temporarily ignore this log until Detections is more stable exclude_log "/nsm/kafka/data/" # ignore Kafka data directory from log check. # Include Zeek reporter.log to detect errors after running known good pcap(s) through sensor echo "/nsm/zeek/spool/logger/reporter.log" >> /tmp/log_check_files for log_file in $(cat /tmp/log_check_files); do status "Checking log file $log_file" tail -n $RECENT_LOG_LINES $log_file > /tmp/log_check check_for_errors done # Cleanup temp files rm -f /tmp/log_check_files rm -f /tmp/log_check if [[ $RESULT -eq 0 ]]; then echo -e "\nResult: No errors found" else echo -e "\nResult: One or more errors found" fi exit $RESULT