Fix Telegraf→Postgres table creation and state.apply race

- Telegraf's partman template passed p_type:='native', which pg_partman
  5.x (the version shipped by postgresql-17-partman on Debian) rejects.
  Switched to 'range' so partman.create_parent() actually creates
  partitions and Telegraf's INSERTs succeed.
- Added a postgres_wait_ready gate in telegraf_users.sls so psql execs
  don't race the init-time restart that docker-entrypoint.sh performs.
- so-verify now ignores the literal "-v ON_ERROR_STOP=1" token in the
  setup log. Dropped the matching entry from so-log-check, which scans
  container stdout where that token never appears.
This commit is contained in:
Mike Reeves
2026-04-17 13:00:12 -04:00
parent 7d07f3c8fe
commit 5228668be0
4 changed files with 22 additions and 4 deletions
+19 -1
View File
@@ -10,6 +10,24 @@
{% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %}
{% if TG_OUT in ['POSTGRES', 'BOTH'] %}
# docker_container.running returns as soon as the container starts, but on
# first-init docker-entrypoint.sh runs init scripts and then restarts
# postgres, so the next docker exec can hit "the database system is shutting
# down". Wait for pg_isready before any psql work.
postgres_wait_ready:
  # Readiness gate for so-postgres: probe pg_isready inside the container up to
  # 60 times, pausing 2 seconds after each failed probe (about 120s in total).
  # The state succeeds on the first successful probe and fails with a message
  # on stderr once every attempt has been used.
  # NOTE(review): pg_isready is invoked without -h, so it presumably probes the
  # local socket; confirm it cannot also succeed against a temporary init-time
  # server that the entrypoint later shuts down.
  cmd.run:
    - name: |
        attempt=1
        while [ "$attempt" -le 60 ]; do
          docker exec so-postgres pg_isready -U postgres -q 2>/dev/null && exit 0
          sleep 2
          attempt=$((attempt + 1))
        done
        echo "so-postgres did not become ready within 120s" >&2
        exit 1
    # Only start polling once the so-postgres container state has run.
    - require:
      - docker_container: so-postgres
# Ensure the shared Telegraf database exists. init-users.sh only runs on a
# fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume
# would otherwise never get so_telegraf.
@@ -21,7 +39,7 @@ postgres_create_telegraf_db:
WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'so_telegraf')\gexec
EOSQL
- require:
- docker_container: so-postgres
- cmd: postgres_wait_ready
# Provision the shared group role and schema once. Every per-minion role is a
# member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf