diff --git a/salt/common/tools/sbin/so-log-check b/salt/common/tools/sbin/so-log-check
index d8446d6fe..8c8bbf35c 100755
--- a/salt/common/tools/sbin/so-log-check
+++ b/salt/common/tools/sbin/so-log-check
@@ -229,7 +229,6 @@ if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then
     EXCLUDED_ERRORS="$EXCLUDED_ERRORS|tcp 127.0.0.1:6791: bind: address already in use" # so-elastic-fleet agent restarting. Seen starting w/ 8.18.8 https://github.com/elastic/kibana/issues/201459
     EXCLUDED_ERRORS="$EXCLUDED_ERRORS|TransformTask\] \[logs-(tychon|aws_billing|microsoft_defender_endpoint).*user so_kibana lacks the required permissions \[logs-\1" # Known issue with 3 integrations using kibana_system role vs creating unique api creds with proper permissions.
     EXCLUDED_ERRORS="$EXCLUDED_ERRORS|manifest unknown" # appears in so-dockerregistry log for so-tcpreplay following docker upgrade to 29.2.1-1
-    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|-v ON_ERROR_STOP=1" # psql invocation flag from so-postgres init script, not an actual error
 fi
 
 RESULT=0
diff --git a/salt/postgres/telegraf_users.sls b/salt/postgres/telegraf_users.sls
index 8c62a8961..920367fab 100644
--- a/salt/postgres/telegraf_users.sls
+++ b/salt/postgres/telegraf_users.sls
@@ -10,6 +10,26 @@
 {% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %}
 {% if TG_OUT in ['POSTGRES', 'BOTH'] %}
 
+# docker_container.running returns as soon as the container starts, but on
+# first-init docker-entrypoint.sh runs init scripts and then restarts
+# postgres, so the next docker exec can hit "the database system is shutting
+# down". Wait for pg_isready before any psql work. Probe over TCP
+# (-h 127.0.0.1): the stock entrypoint's temporary init-time server listens
+# only on the Unix socket, so a socket probe could pass right before the restart.
+postgres_wait_ready:
+  cmd.run:
+    - name: |
+        for i in $(seq 1 60); do
+          if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then
+            exit 0
+          fi
+          sleep 2
+        done
+        echo "so-postgres did not become ready within 120s" >&2
+        exit 1
+    - require:
+      - docker_container: so-postgres
+
 # Ensure the shared Telegraf database exists. init-users.sh only runs on a
 # fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume
 # would otherwise never get so_telegraf.
@@ -21,7 +41,7 @@ postgres_create_telegraf_db:
         WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'so_telegraf')\gexec
         EOSQL
     - require:
-      - docker_container: so-postgres
+      - cmd: postgres_wait_ready
 
 # Provision the shared group role and schema once. Every per-minion role is a
 # member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf
diff --git a/salt/telegraf/etc/telegraf.conf b/salt/telegraf/etc/telegraf.conf
index ea3e11c51..334b62888 100644
--- a/salt/telegraf/etc/telegraf.conf
+++ b/salt/telegraf/etc/telegraf.conf
@@ -111,7 +111,7 @@
   # pg_partman. Retention drops old partitions instead of row-by-row DELETEs.
   create_templates = [
     '''CREATE TABLE {TABLE} ({COLUMNS}) PARTITION BY RANGE ("time")''',
-    '''SELECT partman.create_parent(p_parent_table := {TABLELITERAL}, p_control := 'time', p_type := 'native', p_interval := '1 day', p_premake := 3)'''
+    '''SELECT partman.create_parent(p_parent_table := {TABLELITERAL}, p_control := 'time', p_type := 'range', p_interval := '1 day', p_premake := 3)'''
   ]
 {%- endif %}
 
diff --git a/setup/so-verify b/setup/so-verify
index 8d23275ea..672ed70cc 100755
--- a/setup/so-verify
+++ b/setup/so-verify
@@ -71,7 +71,8 @@ log_has_errors() {
     grep -vE "remove_failed_vm.sls" | \
     grep -vE "failed to copy: httpReadSeeker" | \
     grep -vE "Error response from daemon: failed to resolve reference" | \
-    grep -vE "log-.*-pipeline_failed_attempts" &> "$error_log"
+    grep -vE "log-.*-pipeline_failed_attempts" | \
+    grep -vE " -v ON_ERROR_STOP=1" &> "$error_log"
 
     if [[ $? -eq 0 ]]; then
         # This function succeeds (returns 0) if errors are detected