Fix Telegraf→Postgres table creation and state.apply race

- Telegraf's partman template passed p_type:='native', which pg_partman
  5.x (the version shipped by postgresql-17-partman on Debian) rejects.
  Switched to 'range' so partman.create_parent() actually creates
  partitions and Telegraf's INSERTs succeed.
- Added a postgres_wait_ready gate in telegraf_users.sls so psql execs
  don't race the init-time restart that docker-entrypoint.sh performs.
- so-verify now ignores the literal "-v ON_ERROR_STOP=1" token in the
  setup log. Dropped the matching entry from so-log-check, which scans
  container stdout where that token never appears.
This commit is contained in:
Mike Reeves
2026-04-17 13:00:12 -04:00
parent 7d07f3c8fe
commit 5228668be0
4 changed files with 22 additions and 4 deletions
-1
View File
@@ -229,7 +229,6 @@ if [[ $EXCLUDE_KNOWN_ERRORS == 'Y' ]]; then
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|tcp 127.0.0.1:6791: bind: address already in use" # so-elastic-fleet agent restarting. Seen starting w/ 8.18.8 https://github.com/elastic/kibana/issues/201459
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|TransformTask\] \[logs-(tychon|aws_billing|microsoft_defender_endpoint).*user so_kibana lacks the required permissions \[logs-\1" # Known issue with 3 integrations using kibana_system role vs creating unique api creds with proper permissions.
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|manifest unknown" # appears in so-dockerregistry log for so-tcpreplay following docker upgrade to 29.2.1-1
EXCLUDED_ERRORS="$EXCLUDED_ERRORS|-v ON_ERROR_STOP=1" # psql invocation flag from so-postgres init script, not an actual error
fi
RESULT=0
+19 -1
View File
@@ -10,6 +10,24 @@
{% set TG_OUT = (GLOBALS.telegraf_output | default('INFLUXDB')) | upper %}
{% if TG_OUT in ['POSTGRES', 'BOTH'] %}
# docker_container.running returns as soon as the container starts, but on
# first-init docker-entrypoint.sh runs init scripts and then restarts
# postgres, so the next docker exec can hit "the database system is shutting
# down". Wait for pg_isready before any psql work.
#
# The probe goes over TCP (-h 127.0.0.1) instead of the default unix socket:
# the stock postgres entrypoint starts its init-time temporary server with
# listen_addresses='' (socket only), so a socket probe can report "ready"
# against the temporary server moments before it is shut down. A TCP probe
# only succeeds once the final server is accepting connections.
# NOTE(review): assumes so-postgres uses the stock entrypoint and that the
# final server listens on 127.0.0.1 inside the container -- confirm against
# the so-postgres image and postgresql.conf.
postgres_wait_ready:
  cmd.run:
    - name: |
        # Poll up to 60 times with a 2s pause between attempts.
        for i in $(seq 1 60); do
          if docker exec so-postgres pg_isready -h 127.0.0.1 -U postgres -q 2>/dev/null; then
            exit 0
          fi
          sleep 2
        done
        # Fail the state so dependent psql states are not attempted.
        echo "so-postgres did not become ready within 120s" >&2
        exit 1
    - require:
      - docker_container: so-postgres
# Ensure the shared Telegraf database exists. init-users.sh only runs on a
# fresh data dir, so hosts upgraded onto an existing /nsm/postgres volume
# would otherwise never get so_telegraf.
@@ -21,7 +39,7 @@ postgres_create_telegraf_db:
WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'so_telegraf')\gexec
EOSQL
- require:
- docker_container: so-postgres
- cmd: postgres_wait_ready
# Provision the shared group role and schema once. Every per-minion role is a
# member of so_telegraf, and each Telegraf connection does SET ROLE so_telegraf
+1 -1
View File
@@ -111,7 +111,7 @@
# pg_partman. Retention drops old partitions instead of row-by-row DELETEs.
create_templates = [
'''CREATE TABLE {TABLE} ({COLUMNS}) PARTITION BY RANGE ("time")''',
'''SELECT partman.create_parent(p_parent_table := {TABLELITERAL}, p_control := 'time', p_type := 'native', p_interval := '1 day', p_premake := 3)'''
'''SELECT partman.create_parent(p_parent_table := {TABLELITERAL}, p_control := 'time', p_type := 'range', p_interval := '1 day', p_premake := 3)'''
]
{%- endif %}
+2 -1
View File
@@ -71,7 +71,8 @@ log_has_errors() {
grep -vE "remove_failed_vm.sls" | \
grep -vE "failed to copy: httpReadSeeker" | \
grep -vE "Error response from daemon: failed to resolve reference" | \
grep -vE "log-.*-pipeline_failed_attempts" &> "$error_log"
grep -vE "log-.*-pipeline_failed_attempts" | \
grep -vE " -v ON_ERROR_STOP=1" &> "$error_log"
if [[ $? -eq 0 ]]; then
# This function succeeds (returns 0) if errors are detected