Fix Telegraf→Postgres table creation and state.apply race

- Telegraf's partman template passed p_type := 'native', which pg_partman 5.x (the version shipped by postgresql-17-partman on Debian) rejects. Switched to 'range' so partman.create_parent() actually creates partitions and Telegraf's INSERTs succeed.
- Added a postgres_wait_ready gate in telegraf_users.sls so psql execs don't race the init-time restart that docker-entrypoint.sh performs.
- so-verify now ignores the literal "-v ON_ERROR_STOP=1" token in the setup log. Dropped the matching entry from so-log-check, which scans container stdout where that token never appears.
2026-07-29 04:03:26 +02:00 · 2026-04-17 13:00:12 -04:00
parent 7d07f3c8fe
commit 5228668be0
4 changed files with 22 additions and 4 deletions
@@ -229,7 +229,6 @@ if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then
    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|tcp 127.0.0.1:6791: bind: address already in use" # so-elastic-fleet agent restarting. Seen starting w/ 8.18.8 https://github.com/elastic/kibana/issues/201459
    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|TransformTask\] \[logs-(tychon|aws_billing|microsoft_defender_endpoint).*user so_kibana lacks the required permissions \[logs-\1" # Known issue with 3 integrations using kibana_system role vs creating unique api creds with proper permissions.
    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|manifest unknown"             # appears in so-dockerregistry log for so-tcpreplay following docker upgrade to 29.2.1-1
-    EXCLUDED_ERRORS="$EXCLUDED_ERRORS|-v ON_ERROR_STOP=1"            # psql invocation flag from so-postgres init script, not an actual error
 fi

 RESULT=0
@@ -10,6 +10,24 @@
 {% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %}
 {% if TG_OUT in ['POSTGRES', 'BOTH'] %}

+# docker_container.running returns once the container starts, but on first
+# init docker-entrypoint.sh runs init scripts on a temporary socket-only
+# server and then restarts postgres. Probe over TCP so only the final server
+# counts as ready. NOTE(review): assumes it listens on 127.0.0.1 - confirm.
+postgres_wait_ready:
+  cmd.run:
+    - name: |
+        for i in $(seq 1 60); do
+          if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then
+            exit 0
+          fi
+          sleep 2
+        done
+        echo "so-postgres did not become ready within 120s" >&2
+        exit 1
+    - require:
+      - docker_container: so-postgres
+
 # Ensure the shared Telegraf database exists. init-users.sh only runs on a
 # fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume
 # would otherwise never get so_telegraf.
@@ -21,7 +39,7 @@ postgres_create_telegraf_db:
        WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'so_telegraf')\gexec
        EOSQL
    - require:
-      - docker_container: so-postgres
+      - cmd: postgres_wait_ready

 # Provision the shared group role and schema once. Every per-minion role is a
 # member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf
@@ -111,7 +111,7 @@
  # pg_partman. Retention drops old partitions instead of row-by-row DELETEs.
  create_templates = [
    '''CREATE TABLE {TABLE} ({COLUMNS}) PARTITION BY RANGE ("time")''',
-    '''SELECT partman.create_parent(p_parent_table := {TABLELITERAL}, p_control := 'time', p_type := 'native', p_interval := '1 day', p_premake := 3)'''
+    '''SELECT partman.create_parent(p_parent_table := {TABLELITERAL}, p_control := 'time', p_type := 'range', p_interval := '1 day', p_premake := 3)'''
  ]
 {%- endif %}

@@ -71,7 +71,8 @@ log_has_errors() {
        grep -vE "remove_failed_vm.sls" | \
        grep -vE "failed to copy: httpReadSeeker" | \
        grep -vE "Error response from daemon: failed to resolve reference" | \
-        grep -vE "log-.*-pipeline_failed_attempts" &> "$error_log"
+        grep -vE "log-.*-pipeline_failed_attempts" | \
+        grep -vE " -v ON_ERROR_STOP=1" &> "$error_log"
    
    if [[ $? -eq 0 ]]; then
        # This function succeeds (returns 0) if errors are detected