make so-yaml PG-canonical and add pillar-change reactor stack

Two coupled changes that together let so_pillar.* be the canonical config store, with config edits driving service reloads automatically: so-yaml PG-canonical mode - Adds /opt/so/conf/so-yaml/mode (and SO_YAML_BACKEND env override) with three values: dual (legacy), postgres (PG-only for managed paths), disk (emergency rollback). Bootstrap files (secrets.sls, ca/init.sls, *.nodes.sls, top.sls, ...) stay disk-only regardless via the existing SkipPath allowlist in so_yaml_postgres.locate. - loadYaml/writeYaml/purgeFile now route to so_pillar.* in postgres mode: replace/add/get all read+write the database with no disk file ever appearing. PG failure is fatal in postgres mode (no silent fallback); dual mode preserves the prior best-effort mirror. - so_yaml_postgres gains read_yaml(path), is_pg_managed(path), and is_enabled() so so-yaml can answer "is this path PG-managed and is PG up" without reaching into private helpers. - schema_pillar.sls writes /opt/so/conf/so-yaml/mode = postgres after the importer succeeds, so flipping postgres:so_pillar:enabled flips so-yaml's behavior in lockstep with the schema being live. pg_notify-driven change fan-out - 008_change_notify.sql adds so_pillar.change_queue + an AFTER trigger on pillar_entry that enqueues the locator and pg_notifies 'so_pillar_change'. Queue is drained at-least-once so engine restarts don't lose events; pg_notify is just the wakeup signal. - New salt-master engine pg_notify_pillar.py LISTENs on the channel, drains the queue with FOR UPDATE SKIP LOCKED, debounces bursts, and fires 'so/pillar/changed' events grouped by (scope, role, minion). - Reactor so_pillar_changed.sls catches the tag and dispatches to orch.so_pillar_reload, which carries a DISPATCH map of pillar-path prefix -> (state sls, role grain set) so adding a new service to the auto-reload list is a one-line edit instead of a new reactor. 
- Engine + reactor wiring is gated on the same postgres:so_pillar:enabled flag as the schema and ext_pillar config so the whole stack flips on/off together. Tests: 21 new cases (112 total, all passing) covering mode resolution, PG-managed detection, and PG-canonical read/write/purge routing with the PG client stubbed.
2026-05-09 04:42:40 +02:00 · 2026-05-01 09:31:48 -04:00
parent 23255f88e0
commit 3d11694d51
13 changed files with 958 additions and 11 deletions
@@ -37,6 +37,16 @@ GRANT SELECT ON so_pillar.v_pillar_global,
    TO so_pillar_master;
 GRANT EXECUTE ON FUNCTION so_pillar.fn_pillar_secrets(text) TO so_pillar_master;

+-- Engine reads + drains the change queue from the salt-master process. It
+-- needs SELECT to find unprocessed rows and UPDATE to mark them processed
+-- (UPDATE is also the privilege that permits SELECT ... FOR UPDATE locks).
+-- The queue contains only locator metadata (no pillar data), so the master
+-- role's existing privilege footprint is unchanged in practice.
+GRANT SELECT, UPDATE ON so_pillar.change_queue TO so_pillar_master;
+GRANT USAGE ON SEQUENCE so_pillar.change_queue_id_seq TO so_pillar_master;
+-- Writer needs INSERT on the queue *and* USAGE on its id sequence. Unless the
+-- trigger function is declared SECURITY DEFINER, it executes with the
+-- privileges of the role whose statement fired it, so a writer modifying
+-- pillar_entry must itself be able to insert queue rows and draw ids from the
+-- bigserial sequence; otherwise its pillar_entry write is aborted with a
+-- permission error. The same grants cover manual replays from psql.
+GRANT INSERT ON so_pillar.change_queue TO so_pillar_writer;
+GRANT USAGE ON SEQUENCE so_pillar.change_queue_id_seq TO so_pillar_writer;
+
 -- Writer needs CRUD on pillar_entry/minion/role_member plus access to seed tables.
 GRANT SELECT, INSERT, UPDATE, DELETE
    ON so_pillar.pillar_entry,
@@ -0,0 +1,77 @@
+-- pg_notify-driven change fan-out for so_pillar.pillar_entry.
+--
+-- Two layers:
+--   1. so_pillar.change_queue          — durable, drained by the salt-master
+--                                        engine. Survives engine downtime;
+--                                        rows are keyed by id and delivered
+--                                        at-least-once.
+--   2. pg_notify('so_pillar_change')   — wakeup signal. Payload is the
+--                                        change_queue row id and locator
+--                                        (no secret data — channels are
+--                                        snoopable by anyone with LISTEN).
+--
+-- The salt-master engine LISTENs on the channel for low-latency wakeup,
+-- then SELECTs unprocessed change_queue rows so a missed notification
+-- (engine restart, network blip) self-heals on the next event.
+
+-- Durable work queue: one row per pillar_entry change, holding only the
+-- locator of the changed entry (never the pillar data itself).
+CREATE TABLE IF NOT EXISTS so_pillar.change_queue (
+    id           bigserial,
+    scope        text        NOT NULL,
+    role_name    text,                  -- nullable
+    minion_id    text,                  -- nullable
+    pillar_path  text        NOT NULL,
+    op           text        NOT NULL,  -- TG_OP of the statement that fired
+    enqueued_at  timestamptz NOT NULL DEFAULT now(),
+    processed_at timestamptz,           -- NULL while the row is unprocessed
+    CONSTRAINT change_queue_pkey PRIMARY KEY (id),
+    CONSTRAINT change_queue_op_check
+        CHECK (op IN ('INSERT', 'UPDATE', 'DELETE'))
+);
+
+-- Partial index covering only rows that have not been processed yet; this is
+-- the set the engine's drain query looks at.
+CREATE INDEX IF NOT EXISTS ix_change_queue_unprocessed
+    ON so_pillar.change_queue USING btree (id)
+    WHERE processed_at IS NULL;
+
+-- Partial index covering already-processed rows, keyed by completion time,
+-- for the retention sweep (a pg_cron job in 007 that removes processed rows
+-- older than 7d).
+-- NOTE(review): 007 is only applied conditionally while this file always
+-- applies — confirm something still prunes the queue when 007 is skipped.
+CREATE INDEX IF NOT EXISTS ix_change_queue_processed_at
+    ON so_pillar.change_queue USING btree (processed_at)
+    WHERE processed_at IS NOT NULL;
+
+-- Row-level AFTER trigger function for so_pillar.pillar_entry.
+--
+-- For every inserted, updated or deleted row it
+--   1. appends a locator-only row to so_pillar.change_queue, and
+--   2. pg_notifies channel 'so_pillar_change' with the new queue id plus the
+--      same locator, JSON-encoded.
+-- DELETE reports the OLD row; INSERT and UPDATE report the NEW row (so an
+-- UPDATE that rewrites the locator columns announces only the new locator).
+--
+-- SECURITY DEFINER: a trigger function otherwise runs with the privileges of
+-- the role whose statement fired it, so every role able to write
+-- pillar_entry would also need INSERT on change_queue plus USAGE on
+-- change_queue_id_seq, and a role lacking either would have its pillar_entry
+-- write aborted. Running as the function owner keeps the enqueue independent
+-- of the caller's grants. search_path is pinned because a definer-rights
+-- function must not resolve names through a caller-controlled path; every
+-- object outside pg_catalog is schema-qualified below.
+-- NOTE(review): assumes the role applying this file also owns change_queue.
+--
+-- Returns NULL; the return value of a row-level AFTER trigger is ignored.
+CREATE OR REPLACE FUNCTION so_pillar.fn_pillar_entry_notify()
+    RETURNS trigger
+    LANGUAGE plpgsql
+    SECURITY DEFINER
+    SET search_path = pg_catalog, pg_temp
+AS $$
+DECLARE
+    v_row record;  -- the row whose locator is reported (OLD or NEW)
+    v_id  bigint;  -- id of the freshly enqueued change_queue row
+BEGIN
+    -- NEW is not available for DELETE, so fall back to the removed row.
+    IF TG_OP = 'DELETE' THEN
+        v_row := OLD;
+    ELSE
+        v_row := NEW;
+    END IF;
+
+    -- Durable layer first: the queue row is what the engine actually drains.
+    INSERT INTO so_pillar.change_queue
+        (scope, role_name, minion_id, pillar_path, op)
+    VALUES
+        (v_row.scope, v_row.role_name, v_row.minion_id, v_row.pillar_path, TG_OP)
+    RETURNING id INTO v_id;
+
+    -- Wakeup layer. Payload is the queue id + locator only, never pillar
+    -- data, which keeps secrets off the notification channel.
+    -- NOTE(review): a NOTIFY payload must stay under 8000 bytes by default;
+    -- an extremely long pillar_path would make pg_notify raise and abort the
+    -- triggering write.
+    PERFORM pg_notify('so_pillar_change', json_build_object(
+        'queue_id',    v_id,
+        'scope',       v_row.scope,
+        'role_name',   v_row.role_name,
+        'minion_id',   v_row.minion_id,
+        'pillar_path', v_row.pillar_path,
+        'op',          TG_OP
+    )::text);
+
+    RETURN NULL;
+END;
+$$;
+
+-- Drop-then-create keeps this file re-runnable: a re-apply replaces the
+-- trigger instead of failing because it already exists.
+DROP TRIGGER IF EXISTS tg_pillar_entry_notify
+    ON so_pillar.pillar_entry;
+
+-- Fires once per affected row, after the change, for all three DML verbs.
+CREATE TRIGGER tg_pillar_entry_notify
+    AFTER INSERT OR UPDATE OR DELETE ON so_pillar.pillar_entry
+    FOR EACH ROW EXECUTE FUNCTION so_pillar.fn_pillar_entry_notify();
@@ -54,6 +54,10 @@ so_pillar_postgres_wait_ready:
 {%   do sql_files.append('007_drift_pgcron.sql') %}
 {% endif %}

+# 008 always applies — pg_notify-driven change fan-out is what the salt-master
+# pg_notify_pillar engine consumes. Without it reactor wiring sees no events.
+# Appended after the conditional 007 entry, so it is the final element of
+# sql_files by the time the apply loop below runs.
+{% do sql_files.append('008_change_notify.sql') %}
+
 {% for sql_file in sql_files %}
 so_pillar_apply_{{ sql_file | replace('.', '_') }}:
  cmd.run:
@@ -87,7 +91,7 @@ so_pillar_master_key_configure:
          exit 1
        fi
    - require:
-      - cmd: so_pillar_apply_006_rls_sql
+      - cmd: so_pillar_apply_{{ sql_files[-1] | replace('.', '_') }}

 # Run the importer once after the schema is in place. Idempotent — re-runs
 # with no SLS edits produce zero row changes.
@@ -97,6 +101,29 @@ so_pillar_initial_import:
    - require:
      - cmd: so_pillar_master_key_configure

+# Flip so-yaml from dual-write to PG-canonical for managed paths now that
+# the schema and importer are both in place. Bootstrap files (secrets.sls,
+# postgres/auth.sls, ca/init.sls, *.nodes.sls, top.sls, ...) remain on disk
+# regardless because so_yaml_postgres.locate() raises SkipPath for them.
+# Directory that holds so-yaml's backend-mode file; created (with parents)
+# before the mode file itself is written.
+so_pillar_so_yaml_mode_dir:
+  file.directory:
+    - name: /opt/so/conf/so-yaml
+    - makedirs: True
+    - mode: '0755'
+    - user: socore
+    - group: socore
+
+# Writes the mode file with the value "postgres". It requires both the mode
+# directory and the initial import, so the file is not written unless the
+# importer state has succeeded.
+so_pillar_so_yaml_mode_postgres:
+  file.managed:
+    - name: /opt/so/conf/so-yaml/mode
+    - require:
+      - cmd: so_pillar_initial_import
+      - file: so_pillar_so_yaml_mode_dir
+    - contents: postgres
+    - mode: '0644'
+    - user: socore
+    - group: socore
+
 {% else %}

 so_pillar_disabled_noop: