From ed892a80c342cf489ed40d851927b621f2e9a79d Mon Sep 17 00:00:00 2001
From: "Andrei V. Lepikhov" <lepihov@gmail.com>
Date: Thu, 28 May 2026 08:48:42 +0200
Subject: [PATCH] tests: add multi-PG mesh installcheck

Builds PG REL_15/16/17/18_STABLE + Spock against each in parallel,
applies patches/<ver>/*.diff, wires four single-node clusters into a
12-subscription full mesh (exception_behaviour='discard', auto-DDL on),
stresses it with `make installcheck-parallel` on n1 (PG15), then
verifies the mesh survived.

Success requires both:
  - sub_enabled = true on every subscription, and
  - spock.sync_event() round-trips from each provider to every other
    node within 10s -- the authoritative end-to-end check. The
    status='replicating' snapshot is also polled but diagnostic only,
    because apply workers can flicker 'down' through their
    restart_delay window right after installcheck.

Layout (all under multi-pg-installcheck/ in the repo, gitignored):
  src/pgNN, bin/pgNN, spock-build/pgNN, pgdata/nN, sock/, pid/
  log/<node>.log              per-builder summary
  log/<node>-<phase>.log      raw output of each command
  log/main.log                orchestrator events
  log/installcheck.log        make installcheck output
  log/sync-event-check.log    per-edge wait_for_sync_event output

Bash 3.2 compatible (macOS /bin/bash). Re-runs reuse existing PG and
Spock binaries by default; pass --force to rebuild. Terminal output
is one OK/FAILED line per phase plus the final RESULT, with the raw
psql state of every node printed at the end.

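Companion workflow .github/workflows/installcheck-multi-pg.yml runs
on push and workflow_dispatch; on top of ubuntu-latest it installs
bison, flex, pkg-config and the -dev packages behind the configure
flags (ICU, OpenSSL, readline, zstd, lz4, zlib).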
---
 .github/workflows/installcheck-multi-pg.yml |   81 ++
 .gitignore                                  |    7 +
 tests/run-multi-pg-installcheck.sh          | 1109 +++++++++++++++++++
 3 files changed, 1197 insertions(+)
 create mode 100644 .github/workflows/installcheck-multi-pg.yml
 create mode 100755 tests/run-multi-pg-installcheck.sh
diff --git a/.github/workflows/installcheck-multi-pg.yml b/.github/workflows/installcheck-multi-pg.yml
new file mode 100644
index 00000000..3940ad8e
--- /dev/null
+++ b/.github/workflows/installcheck-multi-pg.yml
@@ -0,0 +1,81 @@
+#
+# Multi-PG Spock mesh installcheck (no Docker)
+#
+# Builds PostgreSQL REL_15/16/17/18_STABLE plus the Spock extension against
+# each, wires the four single-node clusters into a full Spock mesh
+# (12 subscriptions, exception_behaviour='discard', auto-DDL on),
+# stresses it with `make installcheck-parallel` against n1 (PG15),
+# then asserts that every subscription is still enabled and that
+# spock.sync_event() round-trips on every directed edge.
+#
+# The whole thing is driven by tests/run-multi-pg-installcheck.sh, so
+# it runs identically on a developer laptop.  This workflow's job is
+# just: provision deps, invoke the script, save logs on failure.
+#
+# Dependency footprint is dictated by the script's ./configure flags
+# (see _do_configure_pg in tests/run-multi-pg-installcheck.sh):
+#
+#   --with-icu       -> libicu-dev
+#   --with-openssl   -> libssl-dev
+#   --with-readline  -> libreadline-dev
+#   --with-zstd      -> libzstd-dev
+#   --with-lz4       -> liblz4-dev
+#   (default zlib)   -> zlib1g-dev
+#   parser/scanner   -> bison, flex
+#   ICU pkg detect   -> pkg-config (preinstalled, listed for clarity)
+#
+# Everything else the script touches -- gcc, make, perl, git, rsync,
+# ca-certificates -- is preinstalled on ubuntu-latest.  If you change
+# the configure command in the script, sync this list with it.
+#
+
+name: Installcheck (multi-PG mesh)
+run-name: Multi-PG mesh installcheck + sync_event verification
+
+on:
+  workflow_dispatch:          # manual: "Run workflow" button in the UI
+  push:                       # automatic: every commit on every branch
+    paths-ignore:
+      - '**/*.md'
+      - 'docs/**'
+      - 'mkdocs.yml'
+
+permissions:
+  contents: read
+
+jobs:
+  installcheck-multi-pg:
+    runs-on: ubuntu-latest
+    timeout-minutes: 90   # hard cap for the whole job
+
+    steps:
+      - name: Checkout spock
+        uses: actions/checkout@v4
+
+      - name: Install build dependencies
+        run: |
+          sudo apt-get update -qq
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \
+            --no-install-recommends \
+              bison flex pkg-config \
+              libicu-dev libssl-dev libreadline-dev \
+              libzstd-dev liblz4-dev zlib1g-dev
+
+      - name: Run multi-PG installcheck
+        run: |
+          # BASE_DIR defaults to <repo>/multi-pg-installcheck which in CI
+          # is ${GITHUB_WORKSPACE}/multi-pg-installcheck -- exactly the
+          # paths the upload-artifact step below points at.
+          ./tests/run-multi-pg-installcheck.sh --jobs "$(nproc)"
+
+      - name: Collect logs on failure
+        if: ${{ failure() }}   # only when an earlier step failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: installcheck-multi-pg-logs
+          path: |
+            multi-pg-installcheck/log/**
+            multi-pg-installcheck/src/pg15/src/test/regress/regression.diffs
+            multi-pg-installcheck/src/pg15/src/test/regress/regression.out
+          if-no-files-found: ignore   # e.g. the run failed before installcheck produced them
+          retention-days: 7
diff --git a/.gitignore b/.gitignore
index fbe0df49..de083897 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,10 @@ spock.control
 tags
 .vscode
 tmp/
+
+# Multi-PG installcheck test rig (tests/run-multi-pg-installcheck.sh).
+# Contains per-version PG clones, builds, installs, PGDATA dirs and logs;
+# can grow to several gigabytes.  Always ignored -- re-create by running
+# the script.  Deliberately not dot-prefixed so the directory is visible
+# in Finder / ls and easy to inspect.
+multi-pg-installcheck/
diff --git a/tests/run-multi-pg-installcheck.sh b/tests/run-multi-pg-installcheck.sh
new file mode 100755
index 00000000..14513176
--- /dev/null
+++ b/tests/run-multi-pg-installcheck.sh
@@ -0,0 +1,1109 @@
+#!/usr/bin/env bash
+#
+# tests/run-multi-pg-installcheck.sh
+#
+# Build PostgreSQL REL_{15,16,17,18}_STABLE plus the Spock extension against
+# each, start four single-node clusters wired into a full Spock mesh
+# (12 subscriptions, exception_behaviour='discard', auto-DDL on), run
+# `make installcheck-parallel` against node n1 (PG15), and verify the mesh
+# survived: every subscription still enabled and sync_event round-trips.
+#
+# Compatible with bash 3.2 (macOS /bin/bash) -- no associative arrays.
+#
+# The four per-node builders (clone PG, build PG, build Spock, initdb, start)
+# run as parallel subprocesses; the main process waits on all of them, then
+# probes each with pg_isready before moving on to the Spock wiring.
+#
+# Designed to run both locally and inside a GitHub Actions workflow on
+# ubuntu-latest.  No Docker.
+#
+# Layout (under BASE_DIR, default <spock-repo>/multi-pg-installcheck):
+#   src/pg15..pg18         PG source clones (one branch each)
+#   bin/pg15..pg18         PG installs (configure --prefix)
+#   spock-build/pg15..18   per-version Spock source copy + build artefacts
+#   pgdata/n1..n4          PGDATA per node
+#   log/                   per-instance log files:
+#       <node>.log                    summary (one line per phase)
+#       <node>-pg-clone.log           git clone output
+#       <node>-pg-patch.log           git apply of patches/<ver>/*.diff
+#       <node>-pg-configure.log       ./configure output
+#       <node>-pg-build.log           make + make install + regress install
+#       <node>-spock-build.log        spock make + install
+#       <node>-initdb.log             initdb output
+#       <node>-pg-start.log           pg_ctl start output
+#       <node>-server.log             postgres server log
+#       <node>-createdb.log           createdb regression
+#       <node>-spock-bootstrap.log    CREATE EXTENSION + node_create + sub_create's
+#       installcheck.log              make installcheck output
+#       sync-event-check.log          per-edge wait_for_sync_event output
+#   sock/                  unix-socket dir shared by all nodes
+#   pid/                   per-builder PID files
+#
+# Node mapping:
+#   n1 -> PG15, port 57515
+#   n2 -> PG16, port 57516
+#   n3 -> PG17, port 57517
+#   n4 -> PG18, port 57518
+#
+# Subscription naming:
+#   sub_<provider>_<subscriber>; e.g. sub_n1_n2 lives on n2 and pulls from n1.
+#
+# Usage:
+#   tests/run-multi-pg-installcheck.sh [--base-dir DIR] [--keep] \
+#                                      [--force] [--jobs N]
+#
+# Existing PG installs (bin/postgres present) and Spock installs
+# (extension/spock.control present) are reused by default to speed up
+# re-runs.  Pass --force to rebuild everything from scratch.
+#
+# Exit status:
+#   0  every Spock subscription on every node is still enabled
+#      (sub_enabled = true)  AND  a spock.sync_event() fired on each
+#      provider was applied on every other node within
+#      SYNC_EVENT_TIMEOUT seconds.  sync_event is the authoritative
+#      end-to-end check -- it proves bytes round-trip from provider
+#      to subscriber.  status='replicating' is also polled (timeout
+#      WAIT_REPLICATING_TIMEOUT_POST), but only as diagnostic info:
+#      apply workers can briefly flicker 'down' through their
+#      restart_delay window right after installcheck, and a sub that
+#      recovers will still deliver sync_event correctly.
+#      Regression-suite pass/fail is logged but does NOT influence
+#      the exit code -- the installcheck workload is used as stress;
+#      the only success signal is the surviving mesh.
+#   2  one or more subscriptions ended up disabled, OR
+#      sync_event failed to propagate on at least one edge within
+#      SYNC_EVENT_TIMEOUT seconds.
+#   >2 build / setup error.
+#
+
+# Deliberately NOT using `-E` (errtrace).  With -E the ERR trap is
+# inherited into command substitutions and other subshells; combined
+# with on_err's old cleanup logic, a single transient psql failure
+# inside something like `n="$(psql_on ...)"` was enough to silently
+# shut down every cluster.  Without -E, ERR fires only at the main
+# shell level, subshell failures propagate normally via return codes,
+# and the trap stays a single-shot abort handler.
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SPOCK_SRC="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# Default base dir lives inside the spock repo so all artefacts are
+# contained next to the source.  This means the script can be launched
+# from anywhere without scattering data across the filesystem; it also
+# means re-runs from a different cwd reuse the same clones/builds/data.
+# Overridden by --base-dir.
+BASE_DIR="${SPOCK_SRC}/multi-pg-installcheck"
+KEEP_RUNNING=0
+# Reuse is the default; --force flips this on to rebuild everything.
+FORCE_REBUILD=0
+# Total parallelism for ./configure && make; we run 4 builders in parallel
+# so the per-builder concurrency is JOBS_PER_BUILDER.
+JOBS_TOTAL="$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)"
+
+PG_VERSIONS="15 16 17 18"   # space-separated; iterated with `for ver in`
+NODES="n1 n2 n3 n4"
+
+DBNAME=regression
+DBUSER=regression
+TARGET_NODE=n1   # node we run `make installcheck` against
+
+# Try a couple of git remotes; the first one that resolves wins.  Use a
+# fallback so the script keeps working if the official postgresql.org host
+# is unreachable from the runner (e.g. behind a strict outbound proxy).
+PG_GIT_REMOTES="https://git.postgresql.org/git/postgresql.git https://github.com/postgres/postgres.git"
+
+# ---------------------------------------------------------------------------
+# Bash-3-safe lookup helpers (no `declare -A`)
+# ---------------------------------------------------------------------------
+
+ver_to_node() {   # PG major version ($1) -> node name on stdout; status 1 if unmapped
+	case "$1" in
+		15) echo n1 ;;
+		16) echo n2 ;;
+		17) echo n3 ;;
+		18) echo n4 ;;
+		*)  return 1 ;;
+	esac
+}
+
+node_to_ver() {   # node name ($1) -> PG major version on stdout; status 1 if unmapped
+	case "$1" in
+		n1) echo 15 ;;
+		n2) echo 16 ;;
+		n3) echo 17 ;;
+		n4) echo 18 ;;
+		*)  return 1 ;;
+	esac
+}
+
+ver_to_port() {   # PG major version ($1) -> server port (575<ver>) on stdout; status 1 if unmapped
+	case "$1" in
+		15) echo 57515 ;;
+		16) echo 57516 ;;
+		17) echo 57517 ;;
+		18) echo 57518 ;;
+		*)  return 1 ;;
+	esac
+}
+
+node_to_port() { ver_to_port "$(node_to_ver "$1")"; }   # node name -> port; an unknown node yields status 1 via ver_to_port ""
+
+# ---------------------------------------------------------------------------
+# Logging / error trap
+# ---------------------------------------------------------------------------
+
+# log() writes only to disk -- the terminal stays clean.  Output goes to
+# the caller's $NODE_LOG when set (per-builder summary file), otherwise
+# to $MAIN_LOG (the orchestrator's own log).  Both files live under
+# $LOG_DIR; everything the user might want to read after the run lands
+# in one of them.
+log() {
+	# Destination: the per-builder summary when NODE_LOG is set and
+	# non-empty, otherwise the orchestrator log; with neither configured
+	# the message is dropped.
+	local dest="${NODE_LOG:-${MAIN_LOG:-}}"
+	local stamp; stamp="$(date +%H:%M:%S)"
+	[ -n "${dest}" ] || return 0
+	printf '%s\n' "[${stamp}] $*" >>"${dest}"
+}
+
+# say() prints to the terminal (stderr): run_phase's per-phase ok/FAILED
+# heartbeat, the final RESULT line and fatal errors.  Use sparingly.
+say() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }
+
+fail() { say "FATAL: $1"; log "FATAL: $1"; exit "${2:-3}"; }   # fail MSG [STATUS]: report on terminal + log, then exit (default status 3)
+
+# run_phase NODE PHASE CMD ARGS...
+#   Runs CMD with stdout+stderr captured to ${LOG_DIR}/<node>-<phase>.log.
+#   Records start/finish in the per-node summary log AND emits a single
+#   end-of-phase OK/FAILED line on the terminal via say(), so the user
+#   gets a per-step heartbeat without seeing any command output itself.
+run_phase() {
+	local node="$1" phase="$2" status
+	shift 2
+	local logf="${LOG_DIR}/${node}-${phase}.log"
+	log "${node}: [${phase}] start  -> ${logf}"
+	# CMD runs as an `if` condition so set -e cannot abort us on failure;
+	# note that errexit is likewise ignored *inside* CMD while it runs.
+	if "$@" >"${logf}" 2>&1; then
+		log "${node}: [${phase}] ok"
+		say "${node}: ${phase} ok"
+	else
+		status=$?
+		log "${node}: [${phase}] FAILED rc=${status}  (see ${logf})"
+		say "${node}: ${phase} FAILED rc=${status}  (see ${logf})"
+		return "${status}"
+	fi
+}
+
+trap 'on_err $? $LINENO' ERR
+
+on_err() {
+	local rc=$1 line=$2   # status of the failed command, $LINENO where the trap fired
+	log "Aborted: exit ${rc} at line ${line}"
+	dump_logs_on_failure || true   # best effort: must not mask the original status
+	# Deliberately NO stop_all_nodes here.  It used to live in this
+	# trap, but combined with set -E it would fire from inside command
+	# substitutions and silently kill every cluster on a single
+	# transient psql hiccup.  Teardown lives in main()'s normal flow
+	# (where it has the correct context); kill_outstanding_builders is
+	# still safe to call here because it operates on PIDs we know are
+	# ours.
+	kill_outstanding_builders || true
+	exit "${rc}"   # re-raise the original failure status
+}
+
+# ---------------------------------------------------------------------------
+# Argument parsing
+# ---------------------------------------------------------------------------
+
+# Print the script's leading comment block: every line after the shebang
+# up to (not including) the first line that does not start with '#'.
+# Grows automatically with the doc header.
+usage() {
+	awk 'NR == 1 { next } !/^#/ { exit } { print }' "$0"
+}
+
+while [ "$#" -gt 0 ]; do   # a flag with a missing/invalid value exits 4 via fail(), not `$2: unbound variable`
+	case "$1" in
+		--base-dir)         [ -n "${2:-}" ] || fail "--base-dir requires a value" 4; BASE_DIR="$2"; shift 2 ;;
+		--keep)             KEEP_RUNNING=1; shift ;;
+		--force)            FORCE_REBUILD=1; shift ;;
+		--jobs)             case "${2:-}" in ''|*[!0-9]*) fail "--jobs requires a non-negative integer" 4 ;; esac; JOBS_TOTAL="$2"; shift 2 ;;
+		-h|--help)          usage; exit 0 ;;
+		*)                  fail "unknown argument: $1" 4 ;;
+	esac
+done
+
+mkdir -p "${BASE_DIR}/src"          \
+         "${BASE_DIR}/bin"          \
+         "${BASE_DIR}/spock-build"  \
+         "${BASE_DIR}/pgdata"       \
+         "${BASE_DIR}/log"          \
+         "${BASE_DIR}/sock"         \
+         "${BASE_DIR}/pid"
+BASE_DIR="$(cd "${BASE_DIR}" && pwd)"
+SOCK_DIR="${BASE_DIR}/sock"
+LOG_DIR="${BASE_DIR}/log"
+PID_DIR="${BASE_DIR}/pid"
+
+# Fresh log and pid directories per run: stale lines from a previous
+# run would otherwise commingle with new output and make diagnosis
+# painful.  Deliberately scoped to log/ and pid/ -- src/, bin/,
+# spock-build/, and pgdata/ are preserved so reuse-on-rerun still works.
+rm -rf "${LOG_DIR}" "${PID_DIR}"
+mkdir -p "${LOG_DIR}" "${PID_DIR}"
+
+MAIN_LOG="${LOG_DIR}/main.log"
+: >"${MAIN_LOG}"   # fresh orchestrator log per run
+
+# Per-builder concurrency: an equal quarter of the total (4 builders), floor 1.
+JOBS_PER_BUILDER=$(( JOBS_TOTAL / 4 ))
+if [ "${JOBS_PER_BUILDER}" -lt 1 ]; then JOBS_PER_BUILDER=1; fi
+
+log "BASE_DIR         = ${BASE_DIR}"
+log "SPOCK_SRC        = ${SPOCK_SRC}"
+log "JOBS_TOTAL       = ${JOBS_TOTAL}"
+log "JOBS_PER_BUILDER = ${JOBS_PER_BUILDER}"
+log "PG_VERSIONS      = ${PG_VERSIONS}"
+
+# ---------------------------------------------------------------------------
+# Path helpers
+# ---------------------------------------------------------------------------
+
+prefix_for()      { echo "${BASE_DIR}/bin/pg$1"; }
+src_for()         { echo "${BASE_DIR}/src/pg$1"; }
+spock_build_for() { echo "${BASE_DIR}/spock-build/pg$1"; }
+data_for()        { echo "${BASE_DIR}/pgdata/$1"; }
+pg_config_for()   { echo "$(prefix_for "$1")/bin/pg_config"; }
+
+# Emit the libpq DSN for node $1; host is the shared Unix socket directory.
+dsn_for_node() {
+	local port
+	port="$(node_to_port "$1")"
+	printf 'host=%s port=%s dbname=%s user=%s\n' "${SOCK_DIR}" "${port}" "${DBNAME}" "${DBUSER}"
+}
+
+# psql_on NODE [ARGS...]: run NODE's own psql via the shared socket; -X skips psqlrc, ON_ERROR_STOP makes SQL errors fatal.
+psql_on() {
+	local node="$1"; shift   # remaining arguments are passed through to psql
+	local ver;    ver="$(node_to_ver "${node}")"
+	local prefix; prefix="$(prefix_for "${ver}")"
+	local port;   port="$(node_to_port "${node}")"
+	PGPASSWORD="" "${prefix}/bin/psql" \
+		-X -v ON_ERROR_STOP=1 \
+		-h "${SOCK_DIR}" -p "${port}" \
+		-U "${DBUSER}" -d "${DBNAME}" \
+		"$@"
+}
+
+# ---------------------------------------------------------------------------
+# Builder: clone + build PG + build Spock + initdb + start (one node)
+# ---------------------------------------------------------------------------
+
+# Echo the first remote in PG_GIT_REMOTES that advertises REL_<$1>_STABLE.
+pick_pg_remote() {
+	local url
+	for url in ${PG_GIT_REMOTES}; do
+		# With --exit-code, ls-remote fails both for an unreachable host
+		# and for a missing branch, so one cheap probe covers both.
+		git ls-remote --exit-code --heads "${url}" "REL_$1_STABLE" \
+				>/dev/null 2>&1 || continue
+		echo "${url}"
+		return 0
+	done
+	return 1
+}
+
+_do_clone_pg() {   # shallow, single-branch clone of REL_<ver>_STABLE from REMOTE into src/pg<ver>
+	local ver="$1" remote="$2"
+	local branch="REL_${ver}_STABLE"
+	local src; src="$(src_for "${ver}")"
+	rm -rf "${src}"   # start clean: drop any partial or stale checkout first
+	git clone --depth=1 --single-branch --branch "${branch}" \
+		"${remote}" "${src}"
+}
+
+clone_pg() {
+	local ver="$1"
+	local node; node="$(ver_to_node "${ver}")"
+	local branch="REL_${ver}_STABLE"
+	local src; src="$(src_for "${ver}")"
+	# Reuse a previous clone when it looks complete (git metadata + regress schedule).
+	if [ -d "${src}/.git" ] \
+		&& [ -f "${src}/src/test/regress/parallel_schedule" ]; then
+		log "${node}: [pg-clone] PG${ver} source already present, skipping"
+		return 0
+	fi
+	# Otherwise clone from the first remote that advertises the branch; exit 5 if none does.
+	local remote
+	remote="$(pick_pg_remote "${ver}")" \
+		|| fail "PG${ver}: no reachable git remote for ${branch}" 5
+
+	log "${node}: [pg-clone] PG${ver} ${branch} from ${remote}"
+	run_phase "${node}" pg-clone _do_clone_pg "${ver}" "${remote}"
+}
+
+# Spock needs Postgres with per-version patches applied (they add the
+# symbols spock_apply.c uses, e.g. remoteTransactionStopTimestamp and
+# SubTransactionIdSetCommitTsData).  Patches live in patches/<ver>/ in
+# the spock tree and are applied in lexical order via `git apply`.
+#
+# git apply is used (not `patch`) because:
+#   - it reads the same unified-diff format as the existing patches/<ver>/*.diff
+#   - it's atomic per patch: a failed apply leaves the tree untouched
+#   - it understands rename/mode metadata correctly when we ever add such patches
+#
+# A marker file makes the phase idempotent across re-runs.
+_do_patch_pg() {
+	local ver="$1" src="$2" patch_dir="$3"
+	if [ ! -d "${patch_dir}" ]; then
+		echo "no patch directory ${patch_dir} -- nothing to do"
+		return 0
+	fi
+	local p any=0
+	# Each glob expands in sorted order; all .diff files go before .patch files.
+	for p in "${patch_dir}"/*.diff "${patch_dir}"/*.patch; do
+		[ -f "${p}" ] || continue
+		any=1
+		echo "----- applying $(basename "${p}") -----"
+		# run_phase calls us on the left of `||`, so set -e is ignored here: bail out explicitly.
+		( cd "${src}" && git apply --whitespace=nowarn -p1 "${p}" ) || return $?
+	done
+	if [ "${any}" -eq 0 ]; then
+		echo "no .diff/.patch files in ${patch_dir}"
+	fi
+	touch "${src}/.spock-patches-applied"   # reached only when every patch applied cleanly
+}
+
+patch_pg() {
+	local ver="$1"
+	local node; node="$(ver_to_node "${ver}")"
+	local src;  src="$(src_for "${ver}")"
+	local patch_dir="${SPOCK_SRC}/patches/${ver}"
+	# The marker is touched at the end of _do_patch_pg and lives inside the clone.
+	if [ -f "${src}/.spock-patches-applied" ]; then
+		log "${node}: [pg-patch] patches already applied (marker present), skipping"
+		return 0
+	fi
+	run_phase "${node}" pg-patch _do_patch_pg "${ver}" "${src}" "${patch_dir}"
+}
+
+_do_configure_pg() {
+	local src="$1" prefix="$2"
+	cd "${src}" || return $?   # set -e is ignored under run_phase's `||`; never configure in the wrong directory
+	./configure --prefix="${prefix}" --enable-debug --enable-cassert \
+		--with-icu --with-openssl --with-readline --with-zstd --with-lz4
+}
+
+_do_build_pg() {
+	local src="$1" jobs="$2"
+	make -C "${src}" -s -j"${jobs}" || return $?           # set -e is ignored under run_phase's `||`,
+	make -C "${src}" -s -j"${jobs}" install || return $?   # so stop explicitly at the first failing make
+	# Install pg_regress so `make installcheck` later can find it via $bindir.
+	make -C "${src}/src/test/regress" -s install
+}
+
+build_pg() {
+	local ver="$1"
+	local node; node="$(ver_to_node "${ver}")"
+	local src;    src="$(src_for "${ver}")"
+	local prefix; prefix="$(prefix_for "${ver}")"
+
+	# Reuse an existing PG install unless --force was passed.
+	if [ "${FORCE_REBUILD}" -eq 0 ] && [ -x "${prefix}/bin/postgres" ]; then
+		log "${node}: [pg-build] reusing existing install at ${prefix}"
+		return 0
+	fi
+	# Two phases, two logs: <node>-pg-configure.log and <node>-pg-build.log.
+	run_phase "${node}" pg-configure _do_configure_pg "${src}" "${prefix}"
+	run_phase "${node}" pg-build     _do_build_pg     "${src}" "${JOBS_PER_BUILDER}"
+}
+
+# Each builder gets its own copy of the Spock source tree and builds in
+# it independently.  No shared state between builders => no mutex, no
+# coordination -- all four spock builds run truly in parallel, same as
+# the PG builds.
+#
+# rsync is used because its `--exclude='/X'` is anchored to the source
+# root identically on GNU and BSD systems; the leading '/' is what keeps
+# us from copying BASE_DIR (which lives inside SPOCK_SRC) into itself.
+_do_build_spock() {
+	local sb="$1" pgc="$2" jobs="$3"
+	# run_phase runs us under `||`, where set -e is ignored: chain the steps with && instead.
+	rm -rf "${sb}" && mkdir -p "${sb}" &&
+	rsync -a \
+		--exclude='/multi-pg-installcheck' \
+		--exclude='/.git' \
+		--exclude='.DS_Store' \
+		"${SPOCK_SRC}/" "${sb}/" &&
+	make -C "${sb}" PG_CONFIG="${pgc}" -j"${jobs}" &&
+	make -C "${sb}" PG_CONFIG="${pgc}" install
+}
+
+build_spock() {
+	local ver="$1"
+	local node; node="$(ver_to_node "${ver}")"
+	local pgc;  pgc="$(pg_config_for "${ver}")"
+	local sb;   sb="$(spock_build_for "${ver}")"
+
+	# Reuse an already-installed Spock unless --force was passed.
+	if [ "${FORCE_REBUILD}" -eq 0 ] \
+		&& [ -f "$("${pgc}" --sharedir)/extension/spock.control" ]; then
+		log "${node}: [spock-build] reusing existing install for PG${ver}"
+		return 0
+	fi
+	# The check above only looks for the installed control file: local Spock source edits need --force.
+	run_phase "${node}" spock-build _do_build_spock \
+		"${sb}" "${pgc}" "${JOBS_PER_BUILDER}"
+}
+
+_do_initdb() {
+	local prefix="$1" data="$2" user="$3"
+	"${prefix}/bin/initdb" -D "${data}" -U "${user}" \
+		--encoding=UTF8 --locale=C
+}
+
+init_node() {
+	local node="$1"
+	local ver;    ver="$(node_to_ver "${node}")"
+	local prefix; prefix="$(prefix_for "${ver}")"
+	local data;   data="$(data_for "${node}")"
+	local port;   port="$(ver_to_port "${ver}")"
+	# The data directory is always recreated, even when PG/Spock binaries are reused.
+	if [ -d "${data}" ]; then
+		log "${node}: [initdb] clearing existing data dir"
+		rm -rf "${data}"
+	fi
+	run_phase "${node}" initdb _do_initdb "${prefix}" "${data}" "${DBUSER}"
+	# Appended after initdb's generated settings; a later entry for a parameter overrides an earlier one.
+	cat >>"${data}/postgresql.conf" <<-EOF
+		# --- multi-PG installcheck test rig ---
+		listen_addresses = ''
+		unix_socket_directories = '${SOCK_DIR}'
+		port = ${port}
+		max_connections = 200
+
+		wal_level = logical
+		track_commit_timestamp = on
+		max_worker_processes = 32
+		max_replication_slots = 32
+		max_wal_senders = 32
+
+		log_min_messages = 'log'
+		log_statement = 'none'
+		logging_collector = off
+
+		shared_preload_libraries = 'spock'
+		spock.conflict_resolution = 'last_update_wins'
+		spock.exception_behaviour = 'discard'
+		spock.save_resolutions = on
+
+		# Replicate DDL automatically across the mesh, and add any tables
+		# created by that DDL to the default replication set so they
+		# actually start flowing through the subscriptions.
+		spock.enable_ddl_replication = on
+		spock.include_ddl_repset    = on
+		spock.allow_ddl_from_functions = on
+	EOF
+
+	# Trust on the shared Unix socket; no TCP listener so this is local-only.
+	cat >>"${data}/pg_hba.conf" <<-EOF
+		local all all trust
+		local replication all trust
+	EOF
+}
+
+_do_pg_ctl_start() {
+	local prefix="$1" data="$2" server_log="$3"
+	"${prefix}/bin/pg_ctl" -D "${data}" -l "${server_log}" -w -t 60 start
+}
+
+start_node() {
+	local node="$1"
+	local ver;    ver="$(node_to_ver "${node}")"
+	local prefix; prefix="$(prefix_for "${ver}")"
+	local data;   data="$(data_for "${node}")"
+	run_phase "${node}" pg-start _do_pg_ctl_start \
+		"${prefix}" "${data}" "${LOG_DIR}/${node}-server.log"
+}
+
+stop_node() {   # fast-mode shutdown of NODE's server, waiting up to 60s
+	local node="$1"
+	local ver;    ver="$(node_to_ver "${node}")"
+	local prefix; prefix="$(prefix_for "${ver}")"
+	local data;   data="$(data_for "${node}")"
+	if [ -f "${data}/postmaster.pid" ]; then   # nothing to do when no pid file exists
+		log "${node}: pg_ctl stop"
+		"${prefix}/bin/pg_ctl" -D "${data}" -m fast -w -t 60 stop || true   # best effort: a failed stop is ignored
+	fi
+}
+
+stop_all_nodes() {
+	local node
+	for node in ${NODES}; do stop_node "${node}"; done
+}
+
+# The whole per-node pipeline -- runs as a backgrounded subprocess.
+# Sets NODE_LOG so every log() call inside this subshell goes to
+# ${LOG_DIR}/<node>.log (the per-instance summary) instead of main.log.
+builder_for_node() {
+	local ver="$1"
+	local node; node="$(ver_to_node "${ver}")"
+	NODE_LOG="${LOG_DIR}/${node}.log"
+	: >"${NODE_LOG}"
+	log "${node}: builder starting for PG${ver}"
+	# $$ is not logged: in a subshell it is still the parent's PID.  As in the
+	# main shell, plain set -e without -E (avoids the subshell trap cascade).
+	set -euo pipefail
+	clone_pg     "${ver}"
+	patch_pg     "${ver}"
+	build_pg     "${ver}"
+	build_spock  "${ver}"
+	init_node    "${node}"
+	start_node   "${node}"
+	log "${node}: builder finished OK"
+}
+
+# ---------------------------------------------------------------------------
+# Launch all four builders in parallel and wait
+# ---------------------------------------------------------------------------
+
+launch_builders() {
+	local ver node pid
+	rm -f "${PID_DIR}"/*.pid 2>/dev/null || true
+	for ver in ${PG_VERSIONS}; do
+		node="$(ver_to_node "${ver}")"
+		log "${node}: launching builder for PG${ver}  -> ${LOG_DIR}/${node}.log"
+		# No outer redirect: each phase routes its own output to a
+		# ${LOG_DIR}/<node>-<phase>.log file, and the builder's log()
+		# calls append to ${LOG_DIR}/<node>.log via $NODE_LOG.  Anything
+		# that escapes those falls through to the parent's stderr, which
+		# is exactly what we want for live progress.
+		(
+			builder_for_node "${ver}"
+		) &
+		pid=$!   # PID of the backgrounded builder subshell
+		echo "${pid}" >"${PID_DIR}/${node}.pid"
+		log "${node}: builder pid=${pid}"
+	done
+}
+
+# Reap every builder in PG_VERSIONS order; status 1 if any of them failed.
+wait_for_builders() {
+	local ver node pid rc failed=0
+	for ver in ${PG_VERSIONS}; do
+		node="$(ver_to_node "${ver}")"
+		pid="$(cat "${PID_DIR}/${node}.pid")"
+		# `|| rc=$?` keeps a dead builder from aborting the loop under
+		# set -e, so every builder is reaped and summarised before we
+		# report the overall outcome.
+		rc=0
+		wait "${pid}" || rc=$?
+		if [ "${rc}" -ne 0 ]; then
+			failed=1
+			log "${node}: builder FAILED rc=${rc}  (see ${LOG_DIR}/${node}.log)"
+		else
+			log "${node}: builder OK"
+		fi
+	done
+	return "${failed}"
+}
+
+kill_outstanding_builders() {   # SIGTERM every builder recorded in PID_DIR that is still alive
+	local f pid
+	for f in "${PID_DIR}"/*.pid; do
+		[ -f "${f}" ] || continue   # unmatched glob stays literal: skip it
+		pid="$(cat "${f}" 2>/dev/null || true)"
+		[ -n "${pid}" ] || continue
+		if kill -0 "${pid}" 2>/dev/null; then   # signal 0 only probes for existence
+			# NOTE(review): only the builder subshell is signalled; its children (make, git) presumably keep running -- confirm
+			kill -TERM "${pid}" 2>/dev/null || true
+		fi
+	done
+}
+
+# ---------------------------------------------------------------------------
+# pg_isready probe for each node
+# ---------------------------------------------------------------------------
+
+wait_for_ready() {
+	local node="$1" ver prefix port give_up
+	ver="$(node_to_ver "${node}")"
+	prefix="$(prefix_for "${ver}")"
+	port="$(ver_to_port "${ver}")"
+	give_up=$(( $(date +%s) + 60 ))
+	until [ "$(date +%s)" -ge "${give_up}" ]; do   # poll once per second, for at most 60s
+		if "${prefix}/bin/pg_isready" -q \
+				-h "${SOCK_DIR}" -p "${port}" -d "${DBNAME}" -U "${DBUSER}"; then
+			log "${node}: pg_isready OK"
+			return 0
+		fi
+		sleep 1
+	done
+	log "${node}: pg_isready did not become ready within 60s"
+	return 1
+}
+
+wait_for_all_ready() {
+	local node rc=0
+	for node in ${NODES}; do
+		wait_for_ready "${node}" || rc=1
+	done
+	return ${rc}
+}
+
+# ---------------------------------------------------------------------------
+# DB + Spock bootstrap (after all servers are up)
+# ---------------------------------------------------------------------------
+
+_do_createdb() {
+	local prefix="$1" port="$2" user="$3" dbname="$4" sock="$5"
+	"${prefix}/bin/createdb" -h "${sock}" -p "${port}" \
+		-U "${user}" -O "${user}" "${dbname}"
+}
+
+create_db_for_node() {
+	local node="$1"
+	local ver;    ver="$(node_to_ver "${node}")"
+	local prefix; prefix="$(prefix_for "${ver}")"
+	local port;   port="$(ver_to_port "${ver}")"
+	run_phase "${node}" createdb _do_createdb \
+		"${prefix}" "${port}" "${DBUSER}" "${DBNAME}" "${SOCK_DIR}"
+}
+
+setup_spock_node() {   # create the spock extension on NODE and register it under its own DSN
+	local node="$1"
+	local logf="${LOG_DIR}/${node}-spock-bootstrap.log"
+	log "${node}: [spock-bootstrap] CREATE EXTENSION + node_create  -> ${logf}"
+	{
+		psql_on "${node}" -c "CREATE EXTENSION IF NOT EXISTS spock;"
+		psql_on "${node}" <<-SQL
+			SELECT spock.node_create(
+				node_name := '${node}',
+				dsn       := '$(dsn_for_node "${node}")'
+			);
+		SQL
+	} >>"${logf}" 2>&1   # output of both psql calls is appended to the node's bootstrap log
+}
+
+create_subscription() {   # create_subscription PROVIDER SUBSCRIBER
+	local provider="$1" subscriber="$2"
+	local subname="sub_${provider}_${subscriber}"
+	local provider_dsn; provider_dsn="$(dsn_for_node "${provider}")"
+	local logf="${LOG_DIR}/${subscriber}-spock-bootstrap.log"
+	# The subscription is created on the subscriber, pulls from the provider, and skips initial structure/data sync.
+	log "${subscriber}: [spock-bootstrap] sub_create ${subname} <- ${provider}"
+	psql_on "${subscriber}" >>"${logf}" 2>&1 <<-SQL
+		SELECT spock.sub_create(
+			subscription_name     := '${subname}',
+			provider_dsn          := '${provider_dsn}',
+			synchronize_structure := false,
+			synchronize_data      := false,
+			forward_origins       := '{}'::text[],
+			enabled               := true
+		);
+	SQL
+}
+
+wire_full_mesh() {
+	local sub_node pub_node   # one subscription per ordered (provider, subscriber) pair
+	for sub_node in ${NODES}; do
+		for pub_node in ${NODES}; do
+			[ "${pub_node}" != "${sub_node}" ] || continue
+			create_subscription "${pub_node}" "${sub_node}"
+		done
+	done
+}
+
+# ---------------------------------------------------------------------------
+# Run `make installcheck` against the target node
+# ---------------------------------------------------------------------------
+
+run_installcheck() {
+	local node="${TARGET_NODE}"
+	local ver;    ver="$(node_to_ver "${node}")"
+	local src;    src="$(src_for "${ver}")"
+	local prefix; prefix="$(prefix_for "${ver}")"
+	local port;   port="$(ver_to_port "${ver}")"
+
+	log "${node} (PG${ver}): make installcheck-parallel"
+
+	# --use-existing is critical: without it, pg_regress starts the run by
+	# issuing  DROP DATABASE regression / CREATE DATABASE regression , and
+	# the DROP fails the moment Spock has any live replication slot into
+	# this database:
+	#
+	#   ERROR:  database "regression" is used by an active logical
+	#           replication slot
+	#
+	# With --use-existing pg_regress reuses the database we prepared and
+	# leaves it intact when the run finishes, which is also what
+	# verify_subs_enabled needs to read the post-installcheck state.
+	set +e   # capture make's status ourselves instead of aborting
+	(
+		cd "${src}/src/test/regress"
+		PATH="${prefix}/bin:${PATH}" \
+		PGHOST="${SOCK_DIR}" PGPORT="${port}" \
+		PGUSER="${DBUSER}" PGDATABASE="${DBNAME}" \
+		make -k installcheck-parallel \
+			USE_INSTALLED=1 \
+			EXTRA_REGRESS_OPTS="--use-existing --host=${SOCK_DIR} --port=${port} --user=${DBUSER}"
+	) >"${LOG_DIR}/installcheck.log" 2>&1
+	local rc=$?   # $? is expanded before `local` runs, so this is the subshell's status
+	set -e
+
+	# Regression-suite failures are expected -- Spock's auto-DDL
+	# replication will mangle the test fixtures.  We log the outcome
+	# for diagnostics but always return success: pass/fail is decided
+	# by the post-run mesh checks (subs enabled + sync_event), not here.
+	if [ ${rc} -ne 0 ]; then
+		log "installcheck completed with regression failures (exit ${rc})"
+		log "  see ${LOG_DIR}/installcheck.log and ${src}/src/test/regress/regression.diffs"
+	else
+		log "installcheck completed cleanly (no regression diffs)"
+	fi
+	return 0
+}
+
+# ---------------------------------------------------------------------------
+# Wait for every sub to report status='replicating'
+# ---------------------------------------------------------------------------
+
+# Default timeouts, in seconds, for the two callers of wait_for_mesh_replicating.
+#   PRE  -- right after wire_full_mesh.  Generous: slot creation +
+#           walreceiver startup may take a moment on a slow box.
+#   POST -- right after installcheck.  Diagnostic only: apply workers
+#           may go through one or two restart_delay cycles after a
+#           heavy run, so this snapshot is informational; sync_event
+#           is what actually decides the test.
+WAIT_REPLICATING_TIMEOUT_PRE=60
+WAIT_REPLICATING_TIMEOUT_POST=30
+
+# Bash-side counterpart of tests/regress/sql/alter_options.sql:wait_replicating(),
+# iterated over every node.  Status comes from spock.sub_show_status()
+# ('replicating', 'down', 'init', 'unknown'; see
+# src/spock_apply.c:get_subscription_status).  $1 = timeout in seconds
+# (default WAIT_REPLICATING_TIMEOUT_PRE); returns 0 once every node shows all
+# of its expected subs as 'replicating', 1 on timeout.  A failed or
+# non-numeric psql answer, or a node with missing subs, counts as not ready.
+wait_for_mesh_replicating() {
+	local timeout="${1:-${WAIT_REPLICATING_TIMEOUT_PRE}}"
+	local deadline node n pending nodes=0 per_node
+	for node in ${NODES}; do nodes=$(( nodes + 1 )); done
+	per_node=$(( nodes - 1 ))	# each node subscribes to every other node
+	deadline=$(( $(date +%s) + timeout ))
+	while [ "$(date +%s)" -lt "${deadline}" ]; do
+		pending=0
+		for node in ${NODES}; do
+			n="$(psql_on "${node}" -At -c \
+				"SELECT count(*) FILTER (WHERE status IS DISTINCT FROM 'replicating') \
+				        + (count(*) < ${per_node})::int \
+				 FROM spock.sub_show_status();" \
+				2>/dev/null)" || n=999
+			case "${n}" in ''|*[!0-9]*) n=999 ;; esac
+			[ "${n}" -eq 0 ] || { pending=1; break; }
+		done
+		if [ "${pending}" -eq 0 ]; then
+			log "all $(( nodes * per_node )) subscriptions reached status='replicating' (within ${timeout}s)"
+			return 0
+		fi
+		sleep 1
+	done
+	log "timed out after ${timeout}s waiting for subs to reach 'replicating'"
+	return 1
+}
+
+# Append each node's subscription tables (sub_enabled flags plus
+# sub_show_status rows) to that node's log, tagged with the caller's label,
+# so snapshots taken at different phases can be compared afterwards.
+print_subscription_state() {
+	local phase="$1"
+	local node
+	for node in ${NODES}; do
+		# `|| true`: a failing psql must not abort the run.
+		{
+			printf '\n--- subscription state (%s) on %s ---\n' "${phase}" "${node}"
+			psql_on "${node}" -P null='(null)' \
+				-c "SELECT sub_name, sub_enabled FROM spock.subscription
+				    ORDER BY sub_name;" \
+				-c "SELECT subscription_name, status, provider_node, slot_name
+				    FROM spock.sub_show_status()
+				    ORDER BY subscription_name;"
+		} >>"${LOG_DIR}/${node}.log" 2>&1 || true
+	done
+}
+
+# Write each node's connection settings to stderr, first as labelled
+# fields and then as a ready-to-paste psql command line.  The banner notes
+# whether the servers stay up (--keep) or stop when the script exits.
+print_connection_params() {
+	local node ver port prefix
+	# One redirect for the whole report instead of one per printf.
+	{
+		printf '\n=== Connection parameters ===\n'
+		if [ "${KEEP_RUNNING}" -eq 1 ]; then
+			printf '(servers are left running -- you can attach right now)\n'
+		else
+			printf '(servers will be stopped on script exit; re-run with --keep to keep them up)\n'
+		fi
+		for node in ${NODES}; do
+			ver="$(node_to_ver "${node}")"
+			port="$(ver_to_port "${ver}")"
+			prefix="$(prefix_for "${ver}")"
+			printf '\n  %s  (PG%s)\n' "${node}" "${ver}"
+			printf '    host    = %s\n    port    = %s\n' "${SOCK_DIR}" "${port}"
+			printf '    user    = %s\n    dbname  = %s\n' "${DBUSER}" "${DBNAME}"
+			printf '    psql    = %s/bin/psql -h %s -p %s -U %s -d %s\n' \
+				"${prefix}" "${SOCK_DIR}" "${port}" "${DBUSER}" "${DBNAME}"
+		done
+		printf '\n'
+	} >&2
+}
+
+# Terminal twin of print_subscription_state(): the same two SELECTs, but
+# psql's aligned output is sent to stderr unfiltered, so the final state of
+# every subscription on every node appears on screen next to the RESULT
+# line.
+print_subscription_state_to_screen() {
+	local node ver
+	for node in ${NODES}; do
+		ver="$(node_to_ver "${node}")"
+		printf '\n=== %s (PG%s) ===\n' "${node}" "${ver}" >&2
+		# Cheap SELECT 1 probe: report a dead node in one tidy line rather
+		# than letting the queries below dump raw connection errors.
+		psql_on "${node}" -At -c 'SELECT 1' >/dev/null 2>&1 || {
+			printf '  (not reachable -- server is stopped or socket is gone)\n' >&2
+			continue
+		}
+		# stdout joins stderr; psql's own stderr already goes there.
+		psql_on "${node}" -P null='(null)' \
+			-c "SELECT sub_name, sub_enabled
+			    FROM spock.subscription
+			    ORDER BY sub_name;" \
+			-c "SELECT subscription_name, status, provider_node, slot_name
+			    FROM spock.sub_show_status()
+			    ORDER BY subscription_name;" \
+			>&2 || true
+	done
+	printf '\n' >&2
+}
+
+# ---------------------------------------------------------------------------
+# End-to-end propagation check using spock.sync_event / wait_for_sync_event
+# ---------------------------------------------------------------------------
+#
+# status='replicating' only says the apply worker is *connected*.  It
+# doesn't prove that bytes are actually moving end-to-end.  The
+# sync_event primitive is the canonical way to test that:
+#
+#   1. On the provider:    SELECT spock.sync_event();
+#                          -- emits a marker into WAL, returns its LSN.
+#
+#   2. On the subscriber:  CALL spock.wait_for_sync_event(
+#                              NULL, '<provider_name>', '<lsn>', <timeout>);
+#                          -- blocks until that marker has been applied.
+#                          Result is true on delivery, false on timeout.
+#
+# We fire one sync_event per provider (4 total) and fan out the wait to
+# the other three nodes (12 directional checks).  Any timeout means
+# replication is not actually flowing for that pair -- FAIL.
+
+SYNC_EVENT_TIMEOUT=10	# seconds; passed as the timeout of each spock.wait_for_sync_event call
+
+_wait_one_sync_event() {
+	# $1 provider, $2 subscriber, $3 LSN.  psql's exit status is the verdict:
+	# the DO block raises when the CALL reports that the marker did not arrive.
+	local provider="$1" subscriber="$2" lsn="$3" probe_sql
+	probe_sql="
+		DO \$check\$
+		DECLARE
+			r bool;
+		BEGIN
+			CALL spock.wait_for_sync_event(
+				r, '${provider}'::name, '${lsn}'::pg_lsn,
+				${SYNC_EVENT_TIMEOUT});
+			IF NOT r THEN
+				RAISE EXCEPTION
+					'sync_event from ${provider} did not arrive on ${subscriber} within ${SYNC_EVENT_TIMEOUT}s';
+			END IF;
+		END
+		\$check\$;
+	"
+	psql_on "${subscriber}" -q -c "${probe_sql}"
+}
+
+check_sync_event_propagation() {
+	# For each provider: emit one marker, then wait for it on every other node.
+	# Returns 1 if any emit or any delivery failed, else 0.
+	local rc=0 src dst marker_lsn
+	local edge_log="${LOG_DIR}/sync-event-check.log"
+	: >"${edge_log}"	# start from an empty per-edge log
+	for src in ${NODES}; do
+		marker_lsn="$(psql_on "${src}" -At \
+			-c "SELECT spock.sync_event();" 2>/dev/null)" || marker_lsn=
+		if [ -z "${marker_lsn}" ]; then
+			log "${src}: spock.sync_event() emit failed"
+			rc=1
+			continue
+		fi
+		log "${src}: emitted sync_event @ ${marker_lsn}"
+
+		for dst in ${NODES}; do
+			if [ "${dst}" = "${src}" ]; then
+				continue
+			fi
+			if ! _wait_one_sync_event "${src}" "${dst}" "${marker_lsn}" >>"${edge_log}" 2>&1; then
+				log "${src} -> ${dst}: sync_event NOT delivered within ${SYNC_EVENT_TIMEOUT}s"
+				rc=1
+			else
+				log "${src} -> ${dst}: sync_event delivered"
+			fi
+		done
+	done
+	return ${rc}
+}
+
+# ---------------------------------------------------------------------------
+# Verify every subscription is still enabled
+# ---------------------------------------------------------------------------
+
+verify_subs_enabled() {
+	# Returns 0 only if every node answers the query and lists no subscription
+	# with sub_enabled = false; findings are reported through log().
+	local node out any_bad=0
+	for node in ${NODES}; do
+		# psql stderr is dropped on purpose: an unreachable node is
+		# reported here and by print_subscription_state_to_screen.
+		out="$(psql_on "${node}" -At -c \
+			"SELECT sub_name FROM spock.subscription WHERE NOT sub_enabled ORDER BY sub_name;" \
+			2>/dev/null)" \
+			|| { any_bad=1; log "${node}: NOT reachable -- treating as failure"; continue; }
+		if [ -n "${out}" ]; then
+			any_bad=1
+			# Quote ${out} so names are never glob-expanded or re-split;
+			# tr folds psql's one-name-per-line output onto a single line.
+			log "${node}: DISABLED subscriptions: $(printf '%s\n' "${out}" | tr '\n' ' ')"
+		else
+			log "${node}: all subscriptions still enabled"
+		fi
+	done
+	return "${any_bad}"
+}
+
+# ---------------------------------------------------------------------------
+# Failure diagnostics
+# ---------------------------------------------------------------------------
+
+dump_logs_on_failure() {
+	# Deliberately terse: the per-phase logs already hold the details, so
+	# this only tells the user which directory to look in.
+	say "see ${LOG_DIR}/ for per-instance log files"
+}
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+main() {
+	# Orchestrate the run: build, boot, wire the mesh, stress it with
+	# installcheck, then decide.  Returns 0 on PASS, 2 on FAIL.
+	launch_builders
+	wait_for_builders || fail "one or more builders failed" 5
+
+	wait_for_all_ready || fail "one or more nodes never became ready" 6
+
+	local node
+	for node in ${NODES}; do create_db_for_node   "${node}"; done
+	for node in ${NODES}; do setup_spock_node     "${node}"; done
+
+	wire_full_mesh
+
+	# Poll until every sub reports status='replicating' rather than guessing
+	# with sleep (same pattern as tests/regress/sql/alter_options.sql).
+	local wait_rc=0
+	wait_for_mesh_replicating "${WAIT_REPLICATING_TIMEOUT_PRE}" || wait_rc=$?
+
+	# Snapshot the subscription state before hammering n1 with the
+	# regression suite -- useful for comparing against the post-run
+	# state if something regresses.
+	print_subscription_state "before installcheck"
+
+	if [ "${wait_rc}" -ne 0 ]; then
+		say "WARNING: not all subscriptions reached 'replicating' before installcheck (see ${LOG_DIR}/<node>.log)"
+	fi
+
+	# Always run installcheck.  Its own pass/fail is irrelevant to the
+	# script's exit code -- regression failures are expected stress and
+	# get logged for diagnostics only.
+	run_installcheck
+
+	# Re-snapshot the state -- if installcheck-induced traffic disabled
+	# or de-synced a subscription, this is where we see it.
+	print_subscription_state "after installcheck"
+
+	# Diagnostic: snapshot whether every sub is back to 'replicating'.
+	# This is just a snapshot -- apply workers can flicker through
+	# 'down' for a few seconds after installcheck while their
+	# restart_delay window elapses, so the result is logged but does
+	# NOT decide the test.  sync_event below is what really matters.
+	local post_wait_rc=0
+	wait_for_mesh_replicating "${WAIT_REPLICATING_TIMEOUT_POST}" \
+		|| post_wait_rc=$?
+	if [ "${post_wait_rc}" -ne 0 ]; then
+		log "diagnostic: status!='replicating' on some subs after ${WAIT_REPLICATING_TIMEOUT_POST}s (sync_event below is the authority)"
+	fi
+
+	# Authoritative end-to-end check: sync_event proves bytes actually
+	# round-trip from each provider to every other node.  Always run it,
+	# whatever the status snapshot above said -- a sub that briefly
+	# flickered 'down' but has caught up still delivers sync_event, so
+	# the snapshot's verdict would be a false negative.
+	local sync_rc=0
+	check_sync_event_propagation || sync_rc=$?
+
+	local verify_rc=0
+	verify_subs_enabled || verify_rc=$?
+
+	# Show the user the actual SELECT output for every subscription on
+	# every node BEFORE stopping the nodes (psql can't talk to a stopped
+	# server).
+	print_subscription_state_to_screen
+
+	# And the connection params, so the user can attach if they like.
+	# Printed before stop_all_nodes so the info is valid right now in
+	# the --keep case.
+	print_connection_params
+
+	if [ "${KEEP_RUNNING}" -eq 0 ]; then
+		stop_all_nodes
+	else
+		log "--keep set: leaving nodes running. Sockets under ${SOCK_DIR}"
+	fi
+
+	# Two-pronged decision:
+	#   (1) every node reachable with every sub_enabled = true
+	#   (2) sync_event round-trips on every edge (bytes actually flow)
+	# The status='replicating' snapshot is diagnostic only -- it can be
+	# a false negative right after installcheck while apply workers
+	# work through their restart_delay window.
+	if [ "${verify_rc}" -ne 0 ] || [ "${sync_rc}" -ne 0 ]; then
+		local reason=
+		[ "${verify_rc}" -ne 0 ] \
+			&& reason="some subscriptions disabled or nodes unreachable"
+		[ "${sync_rc}" -ne 0 ] \
+			&& reason="${reason:+${reason}; }sync_event did not propagate on some edges (see ${LOG_DIR}/sync-event-check.log)"
+		log "RESULT: ${reason}"
+		say "RESULT: FAIL -- ${reason} (see output above)"
+		return 2
+	fi
+	log "RESULT: every sub enabled, sync_event round-trips on every edge"
+	say "RESULT: PASS -- mesh healthy after installcheck (sub_enabled + sync_event)"
+	return 0
+}
+
+main "$@"