From 5228668be01a324a95c0fe3b2fa81d9f68885ed8 Mon Sep 17 00:00:00 2001 From: Mike Reeves Date: Fri, 17 Apr 2026 13:00:12 -0400 Subject: [PATCH] =?UTF-8?q?Fix=20Telegraf=E2=86=92Postgres=20table=20creat?= =?UTF-8?q?ion=20and=20state.apply=20race?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Telegraf's partman template passed p_type:='native', which pg_partman 5.x (the version shipped by postgresql-17-partman on Debian) rejects. Switched to 'range' so partman.create_parent() actually creates partitions and Telegraf's INSERTs succeed. - Added a postgres_wait_ready gate in telegraf_users.sls so psql execs don't race the init-time restart that docker-entrypoint.sh performs. - so-verify now ignores the literal "-v ON_ERROR_STOP=1" token in the setup log. Dropped the matching entry from so-log-check, which scans container stdout where that token never appears. --- salt/common/tools/sbin/so-log-check | 1 - salt/postgres/telegraf_users.sls | 20 +++++++++++++++++++- salt/telegraf/etc/telegraf.conf | 2 +- setup/so-verify | 3 ++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/salt/common/tools/sbin/so-log-check b/salt/common/tools/sbin/so-log-check index d8446d6fe..8c8bbf35c 100755 --- a/salt/common/tools/sbin/so-log-check +++ b/salt/common/tools/sbin/so-log-check @@ -229,7 +229,6 @@ if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then EXCLUDED_ERRORS="$EXCLUDED_ERRORS|tcp 127.0.0.1:6791: bind: address already in use" # so-elastic-fleet agent restarting. Seen starting w/ 8.18.8 https://github.com/elastic/kibana/issues/201459 EXCLUDED_ERRORS="$EXCLUDED_ERRORS|TransformTask\] \[logs-(tychon|aws_billing|microsoft_defender_endpoint).*user so_kibana lacks the required permissions \[logs-\1" # Known issue with 3 integrations using kibana_system role vs creating unique api creds with proper permissions. EXCLUDED_ERRORS="$EXCLUDED_ERRORS|manifest unknown" # appears in so-dockerregistry log for so-tcpreplay following docker upgrade to 29.2.1-1 - EXCLUDED_ERRORS="$EXCLUDED_ERRORS|-v ON_ERROR_STOP=1" # psql invocation flag from so-postgres init script, not an actual error fi RESULT=0 diff --git a/salt/postgres/telegraf_users.sls b/salt/postgres/telegraf_users.sls index 8c62a8961..920367fab 100644 --- a/salt/postgres/telegraf_users.sls +++ b/salt/postgres/telegraf_users.sls @@ -10,6 +10,24 @@ {% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %} {% if TG_OUT in ['POSTGRES', 'BOTH'] %} +# docker_container.running returns as soon as the container starts, but on +# first-init docker-entrypoint.sh runs init scripts and then restarts +# postgres, so the next docker exec can hit "the database system is shutting +# down". Wait for pg_isready before any psql work. +postgres_wait_ready: + cmd.run: + - name: | + for i in $(seq 1 60); do + if docker exec so-postgres pg_isready -U postgres -q 2>/dev/null; then + exit 0 + fi + sleep 2 + done + echo "so-postgres did not become ready within 120s" >&2 + exit 1 + - require: + - docker_container: so-postgres + # Ensure the shared Telegraf database exists. init-users.sh only runs on a # fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume # would otherwise never get so_telegraf. @@ -21,7 +39,7 @@ postgres_create_telegraf_db: WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'so_telegraf')\gexec EOSQL - require: - - docker_container: so-postgres + - cmd: postgres_wait_ready # Provision the shared group role and schema once. Every per-minion role is a # member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf diff --git a/salt/telegraf/etc/telegraf.conf b/salt/telegraf/etc/telegraf.conf index ea3e11c51..334b62888 100644 --- a/salt/telegraf/etc/telegraf.conf +++ b/salt/telegraf/etc/telegraf.conf @@ -111,7 +111,7 @@ # pg_partman. Retention drops old partitions instead of row-by-row DELETEs. create_templates = [ '''CREATE TABLE {TABLE} ({COLUMNS}) PARTITION BY RANGE ("time")''', - '''SELECT partman.create_parent(p_parent_table := {TABLELITERAL}, p_control := 'time', p_type := 'native', p_interval := '1 day', p_premake := 3)''' + '''SELECT partman.create_parent(p_parent_table := {TABLELITERAL}, p_control := 'time', p_type := 'range', p_interval := '1 day', p_premake := 3)''' ] {%- endif %} diff --git a/setup/so-verify b/setup/so-verify index 8d23275ea..672ed70cc 100755 --- a/setup/so-verify +++ b/setup/so-verify @@ -71,7 +71,8 @@ log_has_errors() { grep -vE "remove_failed_vm.sls" | \ grep -vE "failed to copy: httpReadSeeker" | \ grep -vE "Error response from daemon: failed to resolve reference" | \ - grep -vE "log-.*-pipeline_failed_attempts" &> "$error_log" + grep -vE "log-.*-pipeline_failed_attempts" | \ + grep -vE " -v ON_ERROR_STOP=1" &> "$error_log" if [[ $? -eq 0 ]]; then # This function succeeds (returns 0) if errors are detected