From ed892a80c342cf489ed40d851927b621f2e9a79d Mon Sep 17 00:00:00 2001 From: "Andrei V. Lepikhov" Date: Thu, 28 May 2026 08:48:42 +0200 Subject: [PATCH] tests: add multi-PG mesh installcheck Builds PG REL_15/16/17/18_STABLE + Spock against each in parallel, applies patches/<ver>/*.diff, wires four single-node clusters into a 12-subscription full mesh (exception_behaviour='discard', auto-DDL on), stresses it with `make installcheck-parallel` on n1 (PG15), then verifies the mesh survived. Success requires both: - sub_enabled = true on every subscription, and - spock.sync_event() round-trips from each provider to every other node within 10s -- the authoritative end-to-end check. The status='replicating' snapshot is also polled but diagnostic only, because apply workers can flicker 'down' through their restart_delay window right after installcheck. Layout (all under multi-pg-installcheck/ in the repo, gitignored): src/pgNN, bin/pgNN, spock-build/pgNN, pgdata/nN, sock/, pid/ log/<node>.log per-builder summary log/<node>-<phase>.log raw output of each command log/main.log orchestrator events log/installcheck.log make installcheck output log/sync-event-check.log per-edge wait_for_sync_event output Bash 3.2 compatible (macOS /bin/bash). Re-runs reuse existing PG and Spock binaries by default; pass --force to rebuild. Terminal output is one OK/FAILED line per phase plus the final RESULT, with the raw psql state of every node printed at the end. Companion workflow .github/workflows/installcheck-multi-pg.yml runs on push and workflow_dispatch; needs only bison + flex on top of ubuntu-latest. 
--- .github/workflows/installcheck-multi-pg.yml | 81 ++ .gitignore | 7 + tests/run-multi-pg-installcheck.sh | 1109 +++++++++++++++++++ 3 files changed, 1197 insertions(+) create mode 100644 .github/workflows/installcheck-multi-pg.yml create mode 100755 tests/run-multi-pg-installcheck.sh diff --git a/.github/workflows/installcheck-multi-pg.yml b/.github/workflows/installcheck-multi-pg.yml new file mode 100644 index 00000000..3940ad8e --- /dev/null +++ b/.github/workflows/installcheck-multi-pg.yml @@ -0,0 +1,81 @@ +# +# Multi-PG Spock mesh installcheck (no Docker) +# +# Builds PostgreSQL REL_15/16/17/18_STABLE plus the Spock extension against +# each, wires the four single-node clusters into a full Spock mesh +# (12 subscriptions, exception_behaviour='discard', auto-DDL on), +# stresses it with `make installcheck-parallel` against n1 (PG15), +# then asserts that every subscription is still enabled and that +# spock.sync_event() round-trips on every directed edge. +# +# The whole thing is driven by tests/run-multi-pg-installcheck.sh, so +# it runs identically on a developer laptop. This workflow's job is +# just: provision deps, invoke the script, save logs on failure. +# +# Dependency footprint is dictated by the script's ./configure flags +# (see _do_configure_pg in tests/run-multi-pg-installcheck.sh): +# +# --with-icu -> libicu-dev +# --with-openssl -> libssl-dev +# --with-readline -> libreadline-dev +# --with-zstd -> libzstd-dev +# --with-lz4 -> liblz4-dev +# (default zlib) -> zlib1g-dev +# parser/scanner -> bison, flex +# ICU pkg detect -> pkg-config (preinstalled, listed for clarity) +# +# Everything else the script touches -- gcc, make, perl, git, rsync, +# ca-certificates -- is preinstalled on ubuntu-latest. If you change +# the configure command in the script, sync this list with it. 
+# + +name: Installcheck (multi-PG mesh) +run-name: Multi-PG mesh installcheck + sync_event verification + +on: + workflow_dispatch: # manual: "Run workflow" button in the UI + push: # automatic: every commit on every branch + paths-ignore: + - '**/*.md' + - 'docs/**' + - 'mkdocs.yml' + +permissions: + contents: read + +jobs: + installcheck-multi-pg: + runs-on: ubuntu-latest + timeout-minutes: 90 + + steps: + - name: Checkout spock + uses: actions/checkout@v4 + + - name: Install build dependencies + run: | + sudo apt-get update -qq + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \ + --no-install-recommends \ + bison flex pkg-config \ + libicu-dev libssl-dev libreadline-dev \ + libzstd-dev liblz4-dev zlib1g-dev + + - name: Run multi-PG installcheck + run: | + # BASE_DIR defaults to /multi-pg-installcheck which in CI + # is ${GITHUB_WORKSPACE}/multi-pg-installcheck -- exactly the + # paths the upload-artifact step below points at. + ./tests/run-multi-pg-installcheck.sh --jobs "$(nproc)" + + - name: Collect logs on failure + if: ${{ failure() }} + uses: actions/upload-artifact@v4 + with: + name: installcheck-multi-pg-logs + path: | + multi-pg-installcheck/log/** + multi-pg-installcheck/src/pg15/src/test/regress/regression.diffs + multi-pg-installcheck/src/pg15/src/test/regress/regression.out + if-no-files-found: ignore + retention-days: 7 diff --git a/.gitignore b/.gitignore index fbe0df49..de083897 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,10 @@ spock.control tags .vscode tmp/ + +# Multi-PG installcheck test rig (tests/run-multi-pg-installcheck.sh). +# Contains per-version PG clones, builds, installs, PGDATA dirs and logs; +# can grow to several gigabytes. Always ignored -- re-create by running +# the script. Deliberately not dot-prefixed so the directory is visible +# in Finder / ls and easy to inspect. 
+multi-pg-installcheck/ diff --git a/tests/run-multi-pg-installcheck.sh b/tests/run-multi-pg-installcheck.sh new file mode 100755 index 00000000..14513176 --- /dev/null +++ b/tests/run-multi-pg-installcheck.sh @@ -0,0 +1,1109 @@ +#!/usr/bin/env bash +# +# tests/run-multi-pg-installcheck.sh +# +# Build PostgreSQL REL_{15,16,17,18}_STABLE plus the Spock extension against +# each, start four single-node clusters wired into a full Spock mesh +# (12 subscriptions, exception_behaviour='discard', auto-DDL on), run +# `make installcheck` against node n1 (PG15), and verify every subscription +# is still enabled afterwards. +# +# Compatible with bash 3.2 (macOS /bin/bash) -- no associative arrays. +# +# The four per-node builders (clone PG, build PG, build Spock, initdb, start) +# run as parallel subprocesses; the main process waits on all of them, then +# probes each with pg_isready before moving on to the Spock wiring. +# +# Designed to run both locally and inside a GitHub Actions workflow on +# ubuntu-latest. No Docker. 
+# +# Layout (under BASE_DIR, default <repo>/multi-pg-installcheck): +# src/pg15..pg18 PG source clones (one branch each) +# bin/pg15..pg18 PG installs (configure --prefix) +# spock-build/pg15..18 per-version Spock source copy + build artefacts +# pgdata/n1..n4 PGDATA per node +# log/ per-instance log files: +# <node>.log summary (one line per phase) +# <node>-pg-clone.log git clone output +# <node>-pg-patch.log git apply of patches/<ver>/*.diff +# <node>-pg-configure.log ./configure output +# <node>-pg-build.log make + make install + regress install +# <node>-spock-build.log spock make + install +# <node>-initdb.log initdb output +# <node>-pg-start.log pg_ctl start output +# <node>-server.log postgres server log +# <node>-createdb.log createdb regression +# <node>-spock-bootstrap.log CREATE EXTENSION + node_create + sub_create's +# installcheck.log make installcheck output +# sync-event-check.log per-edge wait_for_sync_event output +# sock/ unix-socket dir shared by all nodes +# pid/ per-builder PID files +# +# Node mapping: +# n1 -> PG15, port 57515 +# n2 -> PG16, port 57516 +# n3 -> PG17, port 57517 +# n4 -> PG18, port 57518 +# +# Subscription naming: +# sub_<provider>_<subscriber>; e.g. sub_n1_n2 lives on n2 and pulls from n1. +# +# Usage: +# tests/run-multi-pg-installcheck.sh [--base-dir DIR] [--keep] \ +# [--force] [--jobs N] +# +# Existing PG installs (bin/postgres present) and Spock installs +# (extension/spock.control present) are reused by default to speed up +# re-runs. Pass --force to rebuild everything from scratch. +# +# Exit status: +# 0 every Spock subscription on every node is still enabled +# (sub_enabled = true) AND a spock.sync_event() fired on each +# provider was applied on every other node within +# SYNC_EVENT_TIMEOUT seconds. sync_event is the authoritative +# end-to-end check -- it proves bytes round-trip from provider +# to subscriber. 
#       status='replicating' is also polled (timeout
#       WAIT_REPLICATING_TIMEOUT_POST), but only as diagnostic info:
#       apply workers can briefly flicker 'down' through their
#       restart_delay window right after installcheck, and a sub that
#       recovers will still deliver sync_event correctly.
#       Regression-suite pass/fail is logged but does NOT influence
#       the exit code -- the installcheck workload is used as stress;
#       the only success signal is the surviving mesh.
#   2   one or more subscriptions ended up disabled, OR
#       sync_event failed to propagate on at least one edge within
#       SYNC_EVENT_TIMEOUT seconds.
#   >2  build / setup error.
#

# Deliberately NOT using `-E` (errtrace).  With -E the ERR trap is
# inherited into command substitutions and other subshells; combined
# with on_err's old cleanup logic, a single transient psql failure
# inside something like `n="$(psql_on ...)"` was enough to silently
# shut down every cluster.  Without -E, ERR fires only at the main
# shell level, subshell failures propagate normally via return codes,
# and the trap stays a single-shot abort handler.
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# SCRIPT_DIR is the absolute path of the directory holding this script;
# SPOCK_SRC is its parent directory, which the rest of the script uses as
# the root of the Spock source tree (patches/, rsync source).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SPOCK_SRC="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Default base dir lives inside the spock repo so all artefacts are
# contained next to the source.  This means the script can be launched
# from anywhere without scattering data across the filesystem; it also
# means re-runs from a different cwd reuse the same clones/builds/data.
# Overridden by --base-dir.
BASE_DIR="${SPOCK_SRC}/multi-pg-installcheck"
# 0 by default; the --keep option sets it to 1.
KEEP_RUNNING=0
# Reuse is the default; --force flips this on to rebuild everything.
FORCE_REBUILD=0

# Build parallelism budget shared by all builders.  Four builders run side
# by side, so each of them later receives a share of this number
# (JOBS_PER_BUILDER).  Falls back to 4 when getconf cannot report the
# number of online CPUs.
JOBS_TOTAL="$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)"

# Space-separated lists; walked with plain `for x in ${LIST}` loops.
PG_VERSIONS="15 16 17 18"
NODES="n1 n2 n3 n4"

DBNAME=regression
DBUSER=regression
# Node that `make installcheck` is pointed at.
TARGET_NODE=n1

# Candidate PostgreSQL git remotes, tried in order; the first one that
# answers is used.  The second entry is a fallback for runners that cannot
# reach the postgresql.org host (e.g. behind a strict outbound proxy).
PG_GIT_REMOTES="https://git.postgresql.org/git/postgresql.git"
PG_GIT_REMOTES="${PG_GIT_REMOTES} https://github.com/postgres/postgres.git"

# ---------------------------------------------------------------------------
# Bash-3-safe lookup helpers (no `declare -A`)
# ---------------------------------------------------------------------------
#
# The mapping is regular: PG major NN <-> node n(NN-14) <-> port 575NN.
# Each helper prints the mapped value for a known key and returns 1
# (printing nothing) for anything else.

# ver_to_node VER -- PG major version (15..18) to node name (n1..n4).
ver_to_node() {
    case "$1" in
        15|16|17|18) echo "n$(( $1 - 14 ))" ;;
        *)           return 1 ;;
    esac
}

# node_to_ver NODE -- node name (n1..n4) to PG major version (15..18).
node_to_ver() {
    case "$1" in
        n1|n2|n3|n4) echo "$(( ${1#n} + 14 ))" ;;
        *)           return 1 ;;
    esac
}

# ver_to_port VER -- PG major version (15..18) to its port (57515..57518).
ver_to_port() {
    case "$1" in
        15|16|17|18) echo "575$1" ;;
        *)           return 1 ;;
    esac
}

# node_to_port NODE -- node name to port, via the node's PG version.
node_to_port() {
    local pg_major
    pg_major="$(node_to_ver "$1")" || return 1
    ver_to_port "${pg_major}"
}

# ---------------------------------------------------------------------------
# Logging / error trap
# ---------------------------------------------------------------------------

# log() writes only to disk -- the terminal stays clean.  Output goes to
# the caller's $NODE_LOG when set (per-builder summary file), otherwise
# to $MAIN_LOG (the orchestrator's own log).  Both files live under
# $LOG_DIR; everything the user might want to read after the run lands
# in one of them.
# log MESSAGE...
#   Append a timestamped line to $NODE_LOG if it is set, else to $MAIN_LOG
#   if that is set, else drop it.  Never writes to the terminal.
log() {
    local msg
    msg="[$(date +%H:%M:%S)] $*"
    if [ -n "${NODE_LOG:-}" ]; then
        printf '%s\n' "${msg}" >>"${NODE_LOG}"
    elif [ -n "${MAIN_LOG:-}" ]; then
        printf '%s\n' "${msg}" >>"${MAIN_LOG}"
    fi
}

# say() is for the few things the user must see on the terminal: the
# final RESULT line and fatal errors.  Use sparingly.  Writes to stderr.
say() { printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2; }

# fail MESSAGE [EXIT_CODE]
#   Report MESSAGE on the terminal and in the log, then exit the current
#   (sub)shell with EXIT_CODE (default 3).
fail() { say "FATAL: $1"; log "FATAL: $1"; exit "${2:-3}"; }

# run_phase NODE PHASE CMD ARGS...
#   Runs CMD with stdout+stderr captured to ${LOG_DIR}/<node>-<phase>.log.
#   Records start/finish in the per-node summary log AND emits a single
#   end-of-phase ok/FAILED line on the terminal via say(), so the user
#   gets a per-step heartbeat without seeing any command output itself.
#   Returns CMD's exit status.
#
#   NOTE: CMD runs on the left-hand side of `||`.  Bash ignores `set -e`
#   for everything executed in that context, including the bodies of
#   shell functions, so a CMD that is a function must propagate the
#   failure of its own steps explicitly (`step || return`).
run_phase() {
    local node="$1" phase="$2"
    shift 2
    local logf="${LOG_DIR}/${node}-${phase}.log"
    local rc=0
    log "${node}: [${phase}] start -> ${logf}"
    # `|| rc=$?` captures the real exit code without tripping set -e here.
    "$@" >"${logf}" 2>&1 || rc=$?
    if [ "${rc}" -ne 0 ]; then
        log "${node}: [${phase}] FAILED rc=${rc} (see ${logf})"
        say "${node}: ${phase} FAILED rc=${rc} (see ${logf})"
        return "${rc}"
    fi
    log "${node}: [${phase}] ok"
    say "${node}: ${phase} ok"
}

trap 'on_err $? $LINENO' ERR

# on_err RC LINE
#   ERR-trap handler: record the abort, dump logs, terminate builders that
#   are still running, and exit with the failing command's status.
on_err() {
    local rc=$1 line=$2
    log "Aborted: exit ${rc} at line ${line}"
    # An abort is a fatal error, so it also belongs on the terminal;
    # otherwise the script would die without printing anything.
    say "Aborted: exit ${rc} at line ${line}"
    dump_logs_on_failure || true
    # Deliberately NO stop_all_nodes here.  It used to live in this
    # trap, but combined with set -E it would fire from inside command
    # substitutions and silently kill every cluster on a single
    # transient psql hiccup.  Teardown lives in main()'s normal flow
    # (where it has the correct context); kill_outstanding_builders is
    # still safe to call here because it operates on PIDs we know are
    # ours.
    kill_outstanding_builders || true
    exit "${rc}"
}

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------

# Print the entire leading comment block (everything from line 2 up to,
# but not including, the first non-comment line).  Resilient to future
# growth of the doc header.
usage() {
    awk 'NR>1 { if ($0 !~ /^#/) exit; print }' "$0"
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --base-dir|--jobs)
            # Check the arity ourselves: under `set -u` a bare "$2" would
            # abort with bash's "unbound variable" message instead of our
            # own diagnostic and exit status 4.
            [ "$#" -ge 2 ] || fail "option $1 requires a value" 4
            if [ "$1" = "--base-dir" ]; then
                BASE_DIR="$2"
            else
                JOBS_TOTAL="$2"
            fi
            shift 2
            ;;
        --keep)    KEEP_RUNNING=1; shift ;;
        --force)   FORCE_REBUILD=1; shift ;;
        -h|--help) usage; exit 0 ;;
        *)         fail "unknown argument: $1" 4 ;;
    esac
done

# An empty base dir would make the mkdir / rm -rf below operate on paths
# such as /src or ./log, so refuse it up front.
[ -n "${BASE_DIR:-}" ] || fail "--base-dir must not be empty" 4

# JOBS_TOTAL feeds shell arithmetic later on; accept decimal digits only
# and force base 10 so that a value like "08" is not parsed as octal.
case "${JOBS_TOTAL:-}" in
    ''|*[!0-9]*) fail "--jobs expects a non-negative integer, got '${JOBS_TOTAL:-}'" 4 ;;
esac
JOBS_TOTAL=$(( 10#${JOBS_TOTAL} ))

mkdir -p "${BASE_DIR}/src" \
         "${BASE_DIR}/bin" \
         "${BASE_DIR}/spock-build" \
         "${BASE_DIR}/pgdata" \
         "${BASE_DIR}/log" \
         "${BASE_DIR}/sock" \
         "${BASE_DIR}/pid"
# Canonicalise to an absolute path now that the directory exists.
BASE_DIR="$(cd "${BASE_DIR}" && pwd)"
SOCK_DIR="${BASE_DIR}/sock"
LOG_DIR="${BASE_DIR}/log"
PID_DIR="${BASE_DIR}/pid"

# Fresh log and pid directories per run: stale lines from a previous
# run would otherwise commingle with new output and make diagnosis
# painful.  Deliberately scoped to log/ and pid/ -- src/, bin/,
# spock-build/, and pgdata/ are preserved so reuse-on-rerun still works.
# `:?` aborts rather than expanding to an empty path.
rm -rf -- "${LOG_DIR:?}" "${PID_DIR:?}"
mkdir -p "${LOG_DIR}" "${PID_DIR}"

MAIN_LOG="${LOG_DIR}/main.log"
: >"${MAIN_LOG}"   # fresh orchestrator log per run

# Per-builder concurrency: divide total jobs by 4 builders, minimum 1.
JOBS_PER_BUILDER=$(( JOBS_TOTAL / 4 ))
if [ "${JOBS_PER_BUILDER}" -lt 1 ]; then
    JOBS_PER_BUILDER=1
fi

log "BASE_DIR = ${BASE_DIR}"
log "SPOCK_SRC = ${SPOCK_SRC}"
log "JOBS_TOTAL = ${JOBS_TOTAL}"
log "JOBS_PER_BUILDER = ${JOBS_PER_BUILDER}"
log "PG_VERSIONS = ${PG_VERSIONS}"

# ---------------------------------------------------------------------------
# Path helpers
# ---------------------------------------------------------------------------
#
# Each helper prints one path below BASE_DIR.  The *_for helpers keyed by
# version take a PG major (15..18); data_for takes a node name (n1..n4).

prefix_for()      { echo "${BASE_DIR}/bin/pg$1"; }
src_for()         { echo "${BASE_DIR}/src/pg$1"; }
spock_build_for() { echo "${BASE_DIR}/spock-build/pg$1"; }
data_for()        { echo "${BASE_DIR}/pgdata/$1"; }
pg_config_for()   { echo "$(prefix_for "$1")/bin/pg_config"; }

# dsn_for_node NODE
#   Print a libpq DSN that reaches NODE over the shared Unix socket dir.
dsn_for_node() {
    local node="$1"
    local port; port="$(node_to_port "${node}")"
    echo "host=${SOCK_DIR} port=${port} dbname=${DBNAME} user=${DBUSER}"
}

# psql_on NODE [PSQL_ARGS...]
#   Run psql against NODE using that node's own client binaries, with
#   psqlrc disabled (-X) and ON_ERROR_STOP set.  Extra arguments are
#   passed through to psql.
psql_on() {
    local node="$1"; shift
    local ver;    ver="$(node_to_ver "${node}")"
    local prefix; prefix="$(prefix_for "${ver}")"
    local port;   port="$(node_to_port "${node}")"
    PGPASSWORD="" "${prefix}/bin/psql" \
        -X -v ON_ERROR_STOP=1 \
        -h "${SOCK_DIR}" -p "${port}" \
        -U "${DBUSER}" -d "${DBNAME}" \
        "$@"
}

# ---------------------------------------------------------------------------
# Builder: clone + build PG + build Spock + initdb + start (one node)
# ---------------------------------------------------------------------------

# pick_pg_remote VER
#   Print the first remote in PG_GIT_REMOTES that serves REL_<VER>_STABLE;
#   return 1 if none does.
pick_pg_remote() {
    local r
    for r in ${PG_GIT_REMOTES}; do
        # `git ls-remote` is cheap and validates reachability for our branch.
        if git ls-remote --exit-code --heads "${r}" "REL_$1_STABLE" \
                >/dev/null 2>&1; then
            echo "${r}"
            return 0
        fi
    done
    return 1
}

# The _do_* helpers below are run through run_phase, which invokes them on
# the left-hand side of `||`.  Bash ignores `set -e` inside a function
# called that way, so every step that must not be skipped over carries an
# explicit `|| return`.

# _do_clone_pg VER REMOTE
#   Replace the source directory for VER with a shallow single-branch clone
#   of REL_<VER>_STABLE from REMOTE.
_do_clone_pg() {
    local ver="$1" remote="$2"
    local branch="REL_${ver}_STABLE"
    local src; src="$(src_for "${ver}")"
    rm -rf -- "${src:?}" || return
    git clone --depth=1 --single-branch --branch "${branch}" \
        "${remote}" "${src}"
}

# clone_pg VER
#   Ensure the PG source tree for VER exists.  An existing clone that
#   contains src/test/regress/parallel_schedule is kept as is.
clone_pg() {
    local ver="$1"
    local node; node="$(ver_to_node "${ver}")"
    local branch="REL_${ver}_STABLE"
    local src; src="$(src_for "${ver}")"

    if [ -d "${src}/.git" ] \
       && [ -f "${src}/src/test/regress/parallel_schedule" ]; then
        log "${node}: [pg-clone] PG${ver} source already present, skipping"
        return 0
    fi

    local remote
    remote="$(pick_pg_remote "${ver}")" \
        || fail "PG${ver}: no reachable git remote for ${branch}" 5

    log "${node}: [pg-clone] PG${ver} ${branch} from ${remote}"
    run_phase "${node}" pg-clone _do_clone_pg "${ver}" "${remote}"
}

# Spock needs Postgres with per-version patches applied (they add the
# symbols spock_apply.c uses, e.g. remoteTransactionStopTimestamp and
# SubTransactionIdSetCommitTsData).  Patches live in patches/<ver>/ in
# the spock tree and are applied in lexical order via `git apply`.
#
# git apply is used (not `patch`) because:
#   - it reads the same unified-diff format as the existing
#     patches/<ver>/*.diff
#   - it's atomic per patch: a failed apply leaves the tree untouched
#   - it understands rename/mode metadata correctly when we ever add such
#     patches
#
# A marker file makes the phase idempotent across re-runs.  The marker is
# written only after every patch applied cleanly.

# _do_patch_pg VER SRC PATCH_DIR
#   Apply PATCH_DIR/*.diff and PATCH_DIR/*.patch to the tree at SRC.
#   Returns 1 on the first patch that fails to apply; returns 0 (and
#   creates SRC/.spock-patches-applied) when all patches applied or there
#   were none.  VER is accepted for call compatibility and not used.
_do_patch_pg() {
    local src="$2" patch_dir="$3"
    if [ ! -d "${patch_dir}" ]; then
        echo "no patch directory ${patch_dir} -- nothing to do"
        return 0
    fi
    local p any=0
    # Pathname expansion yields the matches already sorted.  An unmatched
    # pattern stays literal and is filtered out by the -f test.
    for p in "${patch_dir}"/*.diff "${patch_dir}"/*.patch; do
        [ -f "${p}" ] || continue
        any=1
        echo "----- applying $(basename "${p}") -----"
        # git apply needs to run from within the PG source tree.  Stop at
        # the first failure: without this the loop would carry on, the
        # marker would be written and the phase would be reported as ok.
        if ! ( cd "${src}" && git apply --whitespace=nowarn -p1 "${p}" ); then
            echo "ERROR: failed to apply ${p}" >&2
            return 1
        fi
    done
    if [ "${any}" -eq 0 ]; then
        echo "no .diff/.patch files in ${patch_dir}"
    fi
    touch "${src}/.spock-patches-applied"
}

# patch_pg VER
#   Apply the Spock patches for VER unless the marker file says it has
#   been done already.
patch_pg() {
    local ver="$1"
    local node; node="$(ver_to_node "${ver}")"
    local src;  src="$(src_for "${ver}")"
    local patch_dir="${SPOCK_SRC}/patches/${ver}"

    if [ -f "${src}/.spock-patches-applied" ]; then
        log "${node}: [pg-patch] patches already applied (marker present), skipping"
        return 0
    fi
    run_phase "${node}" pg-patch _do_patch_pg "${ver}" "${src}" "${patch_dir}"
}

# _do_configure_pg SRC PREFIX
#   Run ./configure in SRC with the install prefix PREFIX.
_do_configure_pg() {
    local src="$1" prefix="$2"
    # Never run ./configure from whatever directory we happen to be in.
    cd "${src}" || return
    ./configure --prefix="${prefix}" --enable-debug --enable-cassert \
        --with-icu --with-openssl --with-readline --with-zstd --with-lz4
}

# _do_build_pg SRC JOBS
#   Build and install PG from SRC with JOBS parallel make jobs, then
#   install the regress directory.  Returns the status of the first step
#   that fails.
_do_build_pg() {
    local src="$1" jobs="$2"
    make -C "${src}" -s -j"${jobs}" || return
    make -C "${src}" -s -j"${jobs}" install || return
    # Install pg_regress so `make installcheck` later can find it via $bindir.
    make -C "${src}/src/test/regress" -s install
}

# build_pg VER
#   Configure, build and install PG for VER, or reuse an existing install.
build_pg() {
    local ver="$1"
    local node;   node="$(ver_to_node "${ver}")"
    local src;    src="$(src_for "${ver}")"
    local prefix; prefix="$(prefix_for "${ver}")"

    # Reuse an existing PG install unless --force was passed.
    if [ "${FORCE_REBUILD}" -eq 0 ] && [ -x "${prefix}/bin/postgres" ]; then
        log "${node}: [pg-build] reusing existing install at ${prefix}"
        return 0
    fi

    run_phase "${node}" pg-configure _do_configure_pg "${src}" "${prefix}"
    run_phase "${node}" pg-build     _do_build_pg     "${src}" "${JOBS_PER_BUILDER}"
}

# Each builder gets its own copy of the Spock source tree and builds in
# it independently.
# No shared state between builders => no mutex, no
# coordination -- all four spock builds run truly in parallel, same as
# the PG builds.
#
# rsync is used because its `--exclude='/X'` is anchored to the source
# root identically on GNU and BSD systems; the leading '/' is what keeps
# us from copying BASE_DIR (which lives inside SPOCK_SRC) into itself.

# _do_build_spock BUILD_DIR PG_CONFIG JOBS
#   Recreate BUILD_DIR as a copy of SPOCK_SRC, then build and install Spock
#   against PG_CONFIG.  Returns the status of the first step that fails.
#   Each step carries `|| return` because this function is run through
#   run_phase on the left-hand side of `||`, where bash ignores `set -e`;
#   without it a failed rsync would still be followed by make.
_do_build_spock() {
    local sb="$1" pgc="$2" jobs="$3"
    rm -rf -- "${sb:?}" || return
    mkdir -p "${sb}" || return
    rsync -a \
        --exclude='/multi-pg-installcheck' \
        --exclude='/.git' \
        --exclude='.DS_Store' \
        "${SPOCK_SRC}/" "${sb}/" || return
    make -C "${sb}" PG_CONFIG="${pgc}" -j"${jobs}" || return
    make -C "${sb}" PG_CONFIG="${pgc}" install
}

# build_spock VER
#   Build and install Spock for PG VER, or reuse an existing install.
build_spock() {
    local ver="$1"
    local node; node="$(ver_to_node "${ver}")"
    local pgc;  pgc="$(pg_config_for "${ver}")"
    local sb;   sb="$(spock_build_for "${ver}")"

    # Reuse an already-installed Spock unless --force was passed.
    if [ "${FORCE_REBUILD}" -eq 0 ] \
       && [ -f "$("${pgc}" --sharedir)/extension/spock.control" ]; then
        log "${node}: [spock-build] reusing existing install for PG${ver}"
        return 0
    fi

    run_phase "${node}" spock-build _do_build_spock \
        "${sb}" "${pgc}" "${JOBS_PER_BUILDER}"
}

# _do_initdb PREFIX DATA USER
#   Initialise a cluster in DATA owned by superuser USER (UTF8, C locale).
_do_initdb() {
    local prefix="$1" data="$2" user="$3"
    "${prefix}/bin/initdb" -D "${data}" -U "${user}" \
        --encoding=UTF8 --locale=C
}

# init_node NODE
#   Create a fresh PGDATA for NODE and append the test-rig settings to
#   postgresql.conf and pg_hba.conf.  Any existing data directory is
#   removed first.
init_node() {
    local node="$1"
    local ver;    ver="$(node_to_ver "${node}")"
    local prefix; prefix="$(prefix_for "${ver}")"
    local data;   data="$(data_for "${node}")"
    local port;   port="$(ver_to_port "${ver}")"

    if [ -d "${data}" ]; then
        # A previous run started with --keep may have left a postmaster
        # running out of this directory.  Stop it before deleting its
        # files; stop_node does nothing when there is no postmaster.pid.
        stop_node "${node}"
        log "${node}: [initdb] clearing existing data dir"
        rm -rf -- "${data:?}"
    fi
    run_phase "${node}" initdb _do_initdb "${prefix}" "${data}" "${DBUSER}"

    # Unquoted delimiter: ${SOCK_DIR} and ${port} are expanded here.
    cat >>"${data}/postgresql.conf" <<EOF
# --- multi-PG installcheck test rig ---
listen_addresses = ''
unix_socket_directories = '${SOCK_DIR}'
port = ${port}
max_connections = 200

wal_level = logical
track_commit_timestamp = on

max_worker_processes = 32
max_replication_slots = 32
max_wal_senders = 32

log_min_messages = 'log'
log_statement = 'none'
logging_collector = off

shared_preload_libraries = 'spock'
spock.conflict_resolution = 'last_update_wins'
spock.exception_behaviour = 'discard'
spock.save_resolutions = on

# Replicate DDL automatically across the mesh, and add any tables
# created by that DDL to the default replication set so they
# actually start flowing through the subscriptions.
spock.enable_ddl_replication = on
spock.include_ddl_repset = on
spock.allow_ddl_from_functions = on
EOF

    # Trust on the shared Unix socket; no TCP listener so this is local-only.
    cat >>"${data}/pg_hba.conf" <<EOF
local all all trust
local replication all trust
EOF
}

# _do_pg_ctl_start PREFIX DATA SERVER_LOG
#   Start the cluster in DATA, waiting up to 60 s; server output goes to
#   SERVER_LOG.
_do_pg_ctl_start() {
    local prefix="$1" data="$2" server_log="$3"
    "${prefix}/bin/pg_ctl" -D "${data}" -l "${server_log}" -w -t 60 start
}

# start_node NODE -- start NODE's server as the pg-start phase.
start_node() {
    local node="$1"
    local ver;    ver="$(node_to_ver "${node}")"
    local prefix; prefix="$(prefix_for "${ver}")"
    local data;   data="$(data_for "${node}")"
    run_phase "${node}" pg-start _do_pg_ctl_start \
        "${prefix}" "${data}" "${LOG_DIR}/${node}-server.log"
}

# stop_node NODE
#   Fast-stop NODE's server if its data dir holds a postmaster.pid.
#   Best effort: a failing pg_ctl is ignored on purpose.
stop_node() {
    local node="$1"
    local ver;    ver="$(node_to_ver "${node}")"
    local prefix; prefix="$(prefix_for "${ver}")"
    local data;   data="$(data_for "${node}")"
    if [ -f "${data}/postmaster.pid" ]; then
        log "${node}: pg_ctl stop"
        "${prefix}/bin/pg_ctl" -D "${data}" -m fast -w -t 60 stop || true
    fi
}

# stop_all_nodes -- stop_node for every entry in NODES.
stop_all_nodes() {
    local node
    for node in ${NODES}; do stop_node "${node}"; done
}

# The whole per-node pipeline -- runs as a backgrounded subprocess.
# Sets NODE_LOG so every log() call inside this subshell is written to
# ${LOG_DIR}/<node>.log (the per-instance summary).
# builder_for_node VER
#   Run the full pipeline for the node that hosts PG VER: clone, patch,
#   build PG, build Spock, initdb, start.  Meant to be called inside a
#   subshell (see launch_builders); it sets NODE_LOG and shell options for
#   that shell.
builder_for_node() {
    local ver="$1"
    local node; node="$(ver_to_node "${ver}")"
    NODE_LOG="${LOG_DIR}/${node}.log"
    : >"${NODE_LOG}"
    # `$$` keeps the PID of the top-level shell even inside a subshell, so
    # it would print the same number for every builder.  BASHPID is the
    # PID of the current shell process; it is missing from bash 3.2, hence
    # the fallback.
    log "${node}: builder starting for PG${ver} (pid ${BASHPID:-$$})"
    # Builder subshell: plain set -e is enough; we don't want -E for the
    # same reason main shell doesn't (subshell trap cascade).
    set -euo pipefail
    clone_pg    "${ver}"
    patch_pg    "${ver}"
    build_pg    "${ver}"
    build_spock "${ver}"
    init_node   "${node}"
    start_node  "${node}"
    log "${node}: builder finished OK"
}

# ---------------------------------------------------------------------------
# Launch all four builders in parallel and wait
# ---------------------------------------------------------------------------

# launch_builders
#   Start one background builder per entry in PG_VERSIONS and record each
#   PID in ${PID_DIR}/<node>.pid.
launch_builders() {
    local ver node pid
    rm -f "${PID_DIR}"/*.pid 2>/dev/null || true
    for ver in ${PG_VERSIONS}; do
        node="$(ver_to_node "${ver}")"
        log "${node}: launching builder for PG${ver} -> ${LOG_DIR}/${node}.log"
        # No outer redirect: each phase routes its own output to a
        # ${LOG_DIR}/<node>-<phase>.log file, and the script's own log()
        # calls write into ${LOG_DIR}/<node>.log via $NODE_LOG.  Anything
        # that escapes those falls through to the parent's stderr, which
        # is exactly what we want for live progress.
        (
            builder_for_node "${ver}"
        ) &
        pid=$!
        echo "${pid}" >"${PID_DIR}/${node}.pid"
        log "${node}: builder pid=${pid}"
    done
}

# wait_for_builders
#   Wait for every builder recorded by launch_builders and log one OK /
#   FAILED line each.  Returns 1 if any builder exited non-zero, else 0.
wait_for_builders() {
    local ver node pid rc overall=0
    for ver in ${PG_VERSIONS}; do
        node="$(ver_to_node "${ver}")"
        pid="$(cat "${PID_DIR}/${node}.pid")"
        # `|| rc=$?` keeps a dead builder from aborting the loop under
        # set -e; we want to summarise all of them.
        rc=0
        wait "${pid}" || rc=$?
        if [ "${rc}" -eq 0 ]; then
            log "${node}: builder OK"
        else
            overall=1
            log "${node}: builder FAILED rc=${rc} (see ${LOG_DIR}/${node}.log)"
        fi
    done
    return "${overall}"
}

# kill_outstanding_builders
#   Send SIGTERM to every PID in ${PID_DIR}/*.pid that is still alive.
#   NOTE(review): only the recorded builder subshell PID is signalled;
#   whether its running child (make, git, ...) also stops is not handled
#   here -- confirm this is acceptable for aborted runs.
kill_outstanding_builders() {
    local f pid
    for f in "${PID_DIR}"/*.pid; do
        [ -f "${f}" ] || continue
        pid="$(cat "${f}" 2>/dev/null || true)"
        [ -n "${pid}" ] || continue
        if kill -0 "${pid}" 2>/dev/null; then
            kill -TERM "${pid}" 2>/dev/null || true
        fi
    done
}

# ---------------------------------------------------------------------------
# pg_isready probe for each node
# ---------------------------------------------------------------------------

# wait_for_ready NODE [TIMEOUT]
#   Poll pg_isready once per second until NODE accepts connections.
#   TIMEOUT is in seconds and defaults to 60.  Returns 0 when ready,
#   1 on timeout.
wait_for_ready() {
    local node="$1"
    local timeout="${2:-60}"
    local ver;    ver="$(node_to_ver "${node}")"
    local prefix; prefix="$(prefix_for "${ver}")"
    local port;   port="$(ver_to_port "${ver}")"
    local deadline=$(( $(date +%s) + timeout ))
    while [ "$(date +%s)" -lt "${deadline}" ]; do
        if "${prefix}/bin/pg_isready" -q \
                -h "${SOCK_DIR}" -p "${port}" -d "${DBNAME}" -U "${DBUSER}"; then
            log "${node}: pg_isready OK"
            return 0
        fi
        sleep 1
    done
    log "${node}: pg_isready did not become ready within ${timeout}s"
    return 1
}

# wait_for_all_ready
#   Probe every node in NODES (all of them, even after a failure) and
#   return 1 if any did not become ready.
wait_for_all_ready() {
    local node rc=0
    for node in ${NODES}; do
        wait_for_ready "${node}" || rc=1
    done
    return "${rc}"
}

# ---------------------------------------------------------------------------
# DB + Spock bootstrap (after all servers are up)
# ---------------------------------------------------------------------------

# _do_createdb PREFIX PORT USER DBNAME SOCK
#   Create database DBNAME owned by USER on the server at SOCK/PORT.
_do_createdb() {
    local prefix="$1" port="$2" user="$3" dbname="$4" sock="$5"
    "${prefix}/bin/createdb" -h "${sock}" -p "${port}" \
        -U "${user}" -O "${user}" "${dbname}"
}

# create_db_for_node NODE -- create the test database on NODE (createdb phase).
create_db_for_node() {
    local node="$1"
    local ver;    ver="$(node_to_ver "${node}")"
    local prefix; prefix="$(prefix_for "${ver}")"
    local port;   port="$(ver_to_port "${ver}")"
    run_phase "${node}" createdb _do_createdb \
        "${prefix}" "${port}" "${DBUSER}" "${DBNAME}" "${SOCK_DIR}"
}

# setup_spock_node NODE
#   Create the spock extension on NODE and register the node under its own
#   name and DSN.  psql output is appended to
#   ${LOG_DIR}/<node>-spock-bootstrap.log.
setup_spock_node() {
    local node="$1"
    local logf="${LOG_DIR}/${node}-spock-bootstrap.log"
    log "${node}: [spock-bootstrap] CREATE EXTENSION + node_create -> ${logf}"
    {
        psql_on "${node}" -c "CREATE EXTENSION IF NOT EXISTS spock;"
        psql_on "${node}" <<SQL
SELECT spock.node_create(
    node_name := '${node}',
    dsn := '$(dsn_for_node "${node}")'
);
SQL
    } >>"${logf}" 2>&1
}

# create_subscription PROVIDER SUBSCRIBER
#   On SUBSCRIBER, create the enabled subscription sub_<provider>_<subscriber>
#   pulling from PROVIDER, without structure or data synchronisation and
#   with an empty forward_origins list.  psql output is appended to the
#   subscriber's spock-bootstrap log.
create_subscription() {
    local provider="$1" subscriber="$2"
    local subname="sub_${provider}_${subscriber}"
    local provider_dsn; provider_dsn="$(dsn_for_node "${provider}")"
    local logf="${LOG_DIR}/${subscriber}-spock-bootstrap.log"

    log "${subscriber}: [spock-bootstrap] sub_create ${subname} <- ${provider}"
    psql_on "${subscriber}" >>"${logf}" 2>&1 <<SQL
SELECT spock.sub_create(
    subscription_name := '${subname}',
    provider_dsn := '${provider_dsn}',
    synchronize_structure := false,
    synchronize_data := false,
    forward_origins := '{}'::text[],
    enabled := true
);
SQL
}

# wire_full_mesh
#   Create one subscription for every ordered (provider, subscriber) pair
#   of distinct nodes in NODES.
wire_full_mesh() {
    local subscriber provider
    for subscriber in ${NODES}; do
        for provider in ${NODES}; do
            if [ "${provider}" != "${subscriber}" ]; then
                create_subscription "${provider}" "${subscriber}"
            fi
        done
    done
}

# ---------------------------------------------------------------------------
# Run `make installcheck` against the target node
# ---------------------------------------------------------------------------

# run_installcheck
#   Run `make -k installcheck-parallel` from the regress directory of the
#   PG source tree that belongs to TARGET_NODE, against that node's
#   server.  All output goes to ${LOG_DIR}/installcheck.log.  Always
#   returns 0; the outcome is only logged.
run_installcheck() {
    local node="${TARGET_NODE}"
    local ver;    ver="$(node_to_ver "${node}")"
    local src;    src="$(src_for "${ver}")"
    local prefix; prefix="$(prefix_for "${ver}")"
    local port;   port="$(ver_to_port "${ver}")"
    local rc=0

    log "${node} (PG${ver}): make installcheck-parallel"

    # --use-existing is critical: without it, pg_regress starts the run by
    # issuing DROP DATABASE regression / CREATE DATABASE regression, and
    # the DROP fails the moment Spock has any live replication slot into
    # this database:
    #
    #   ERROR:  database "regression" is used by an active logical
    #           replication slot
    #
    # With --use-existing pg_regress reuses the database we prepared and
    # leaves it intact when the run finishes, which is also what
    # verify_subs_enabled needs to read the post-installcheck state.
    #
    # `|| rc=$?` records the status without aborting under set -e.
    (
        # Bail out instead of running make in whatever directory the
        # caller happens to be in.
        cd "${src}/src/test/regress" || exit 1
        PATH="${prefix}/bin:${PATH}" \
        PGHOST="${SOCK_DIR}" PGPORT="${port}" \
        PGUSER="${DBUSER}" PGDATABASE="${DBNAME}" \
            make -k installcheck-parallel \
                USE_INSTALLED=1 \
                EXTRA_REGRESS_OPTS="--use-existing --host=${SOCK_DIR} --port=${port} --user=${DBUSER}"
    ) >"${LOG_DIR}/installcheck.log" 2>&1 || rc=$?

    # Regression-suite failures are expected -- Spock's auto-DDL
    # replication will mangle the test fixtures.  We log the outcome
    # for diagnostics but always return success: the only failure
    # signal that matters is verify_subs_enabled().
    if [ "${rc}" -ne 0 ]; then
        log "installcheck completed with regression failures (exit ${rc})"
        log "  see ${LOG_DIR}/installcheck.log and ${src}/src/test/regress/regression.diffs"
    else
        log "installcheck completed cleanly (no regression diffs)"
    fi
    return 0
}

# ---------------------------------------------------------------------------
# Wait for every sub to report status='replicating'
# ---------------------------------------------------------------------------

# Default timeouts for the two callers of wait_for_mesh_replicating.
#   PRE  -- right after wire_full_mesh.  Generous: slot creation +
#           walreceiver startup may take a moment on a slow box.
#   POST -- right after installcheck.  Diagnostic only: apply workers
#           may go through one or two restart_delay cycles after a
#           heavy run, so this snapshot is informational; sync_event
#           is what actually decides the test.
WAIT_REPLICATING_TIMEOUT_PRE=60
WAIT_REPLICATING_TIMEOUT_POST=30

# Same idea as tests/regress/sql/alter_options.sql:wait_replicating(), but
# evaluated from the bash side so we can iterate across all four nodes.
# A sub's status comes from spock.sub_show_status(); the possible values
# are 'replicating', 'down', 'init', 'unknown' (see
# src/spock_apply.c:get_subscription_status).  Caller supplies the timeout
# in seconds; returns 0 if every sub on every node reaches 'replicating'
# within that window, non-zero on timeout.
#
# A probe that fails, or that answers with anything other than a plain
# non-negative integer, counts as "not replicating yet".  Without that
# guard an empty answer would make the numeric test error out, and the
# `if` would treat the error as "zero subs pending".
#
#   $1 - timeout in seconds (default: WAIT_REPLICATING_TIMEOUT_PRE)
wait_for_mesh_replicating() {
    local timeout="${1:-${WAIT_REPLICATING_TIMEOUT_PRE}}"
    local deadline node pending n
    local node_count=0 sub_total

    # The mesh has one subscription per ordered pair of distinct nodes,
    # so report N*(N-1) instead of a hard-coded figure.
    for node in ${NODES}; do
        node_count=$(( node_count + 1 ))
    done
    sub_total=$(( node_count * (node_count - 1) ))

    deadline=$(( $(date +%s) + timeout ))
    while [ "$(date +%s)" -lt "${deadline}" ]; do
        pending=0
        for node in ${NODES}; do
            n="$(psql_on "${node}" -At -c \
                "SELECT count(*) FROM spock.sub_show_status() \
                 WHERE status IS DISTINCT FROM 'replicating';" \
                2>/dev/null)" || n=999
            # Empty or non-numeric output is treated like a failed probe.
            case "${n}" in
                ''|*[!0-9]*) n=999 ;;
            esac
            if [ "${n}" -ne 0 ]; then
                pending=1
                break
            fi
        done
        if [ "${pending}" -eq 0 ]; then
            log "all ${sub_total} subscriptions reached status='replicating' (within ${timeout}s)"
            return 0
        fi
        sleep 1
    done
    log "timed out after ${timeout}s waiting for subs to reach 'replicating'"
    return 1
}

# Snapshot the full subscription state on every node into per-node logs.
# Called twice (pre- and post-installcheck) so we have a before/after
# diff if something regresses.  Best-effort: a psql failure is recorded
# in the node's log and otherwise ignored.
#   $1 - free-form label written into the snapshot header
print_subscription_state() {
    local label="$1"
    local node nlog
    for node in ${NODES}; do
        nlog="${LOG_DIR}/${node}.log"
        {
            printf '\n--- subscription state (%s) on %s ---\n' "${label}" "${node}"
            psql_on "${node}" -P null='(null)' \
                -c "SELECT sub_name, sub_enabled FROM spock.subscription
                    ORDER BY sub_name;" \
                -c "SELECT subscription_name, status, provider_node, slot_name
                    FROM spock.sub_show_status()
                    ORDER BY subscription_name;"
        } >>"${nlog}" 2>&1 || true
    done
}

# Print the connection parameters for every node so the user can attach
# from another terminal.  Useful primarily with --keep, but also handy
# as a historical record otherwise (re-launchable next time).
print_connection_params() {
    # Everything here goes to stderr.  For each node in NODES we print
    # the individual connection fields followed by a ready-to-paste psql
    # command line built from the same values.
    local node ver port prefix
    printf '\n=== Connection parameters ===\n' >&2
    if [ "${KEEP_RUNNING}" -eq 1 ]; then
        printf '(servers are left running -- you can attach right now)\n' >&2
    else
        printf '(servers will be stopped on script exit; re-run with --keep to keep them up)\n' >&2
    fi
    for node in ${NODES}; do
        ver="$(node_to_ver "${node}")"
        port="$(ver_to_port "${ver}")"
        prefix="$(prefix_for "${ver}")"
        # Two forms for convenience: one human-readable, one ready to paste.
        printf '\n %s (PG%s)\n' "${node}" "${ver}" >&2
        printf ' host = %s\n' "${SOCK_DIR}" >&2
        printf ' port = %s\n' "${port}" >&2
        printf ' user = %s\n' "${DBUSER}" >&2
        printf ' dbname = %s\n' "${DBNAME}" >&2
        printf ' psql = %s/bin/psql -h %s -p %s -U %s -d %s\n' \
            "${prefix}" "${SOCK_DIR}" "${port}" "${DBUSER}" "${DBNAME}" >&2
    done
    printf '\n' >&2
}

# Same two SELECTs as print_subscription_state(), but the aligned psql
# output goes straight to the terminal (stderr) so the user sees the
# unmassaged final state of every subscription on every node.  This is
# the "honest SELECT output" that lives next to the final RESULT line.
print_subscription_state_to_screen() {
    local node ver
    for node in ${NODES}; do
        ver="$(node_to_ver "${node}")"
        printf '\n=== %s (PG%s) ===\n' "${node}" "${ver}" >&2
        # Probe first with a cheap SELECT 1; if the server isn't
        # reachable, say so cleanly instead of letting the two SELECTs
        # below splat raw psql connection errors at the user.
        if ! psql_on "${node}" -At -c 'SELECT 1' >/dev/null 2>&1; then
            printf ' (not reachable -- server is stopped or socket is gone)\n' >&2
            continue
        fi
        # stdout is sent to stderr; stderr is already there.  Best-effort.
        psql_on "${node}" -P null='(null)' \
            -c "SELECT sub_name, sub_enabled
                FROM spock.subscription
                ORDER BY sub_name;" \
            -c "SELECT subscription_name, status, provider_node, slot_name
                FROM spock.sub_show_status()
                ORDER BY subscription_name;" \
            >&2 || true
    done
    printf '\n' >&2
}

# ---------------------------------------------------------------------------
# End-to-end propagation check using spock.sync_event / wait_for_sync_event
# ---------------------------------------------------------------------------
#
# status='replicating' only says the apply worker is *connected*.  It
# doesn't prove that bytes are actually moving end-to-end.  The
# sync_event primitive is the canonical way to test that:
#
#   1. On the provider:    SELECT spock.sync_event();
#        -- emits a marker into WAL, returns its LSN.
#
#   2. On the subscriber:  CALL spock.wait_for_sync_event(
#                              NULL, '<provider>', '<lsn>', <timeout>);
#        -- blocks until that marker has been applied.
#           Result is true on delivery, false on timeout.
#
# We fire one sync_event per provider (4 total) and fan out the wait to
# the other three nodes (12 directional checks).  Any timeout means
# replication is not actually flowing for that pair -- FAIL.

SYNC_EVENT_TIMEOUT=10

# Wait on one subscriber for one provider's sync event.
#   $1 - provider node name
#   $2 - subscriber node name
#   $3 - LSN returned by spock.sync_event() on the provider
# Returns psql's exit status: 0 when the event arrived, non-zero otherwise.
_wait_one_sync_event() {
    local provider="$1" subscriber="$2" lsn="$3"
    # Wrap the CALL in a DO block that raises on timeout, so psql's
    # exit code distinguishes success (rc=0) from "did not arrive"
    # (rc!=0).  Avoids parsing the OUT parameter from CALL output.
    psql_on "${subscriber}" -q -c "
        DO \$check\$
        DECLARE
            r bool;
        BEGIN
            CALL spock.wait_for_sync_event(
                r, '${provider}'::name, '${lsn}'::pg_lsn,
                ${SYNC_EVENT_TIMEOUT});
            IF NOT r THEN
                RAISE EXCEPTION
                    'sync_event from ${provider} did not arrive on ${subscriber} within ${SYNC_EVENT_TIMEOUT}s';
            END IF;
        END
        \$check\$;
    "
}

# Emit one sync event per node and wait for it on every other node.
# Per-edge psql output is collected in ${LOG_DIR}/sync-event-check.log
# (truncated at the start of each call).  Returns 0 only if every emit
# and every wait succeeded.
check_sync_event_propagation() {
    # `failed` rather than `fail`, so the flag is not confused with the
    # fail helper that main calls.
    local failed=0 provider subscriber lsn
    local logf="${LOG_DIR}/sync-event-check.log"
    : >"${logf}"

    for provider in ${NODES}; do
        # Emit a sync event on the provider and capture its LSN.
        lsn="$(psql_on "${provider}" -At \
            -c "SELECT spock.sync_event();" 2>/dev/null)" \
            || lsn=
        if [ -z "${lsn}" ]; then
            log "${provider}: spock.sync_event() emit failed"
            failed=1
            continue
        fi
        log "${provider}: emitted sync_event @ ${lsn}"

        for subscriber in ${NODES}; do
            [ "${subscriber}" = "${provider}" ] && continue
            if _wait_one_sync_event "${provider}" "${subscriber}" "${lsn}" \
                >>"${logf}" 2>&1; then
                log "${provider} -> ${subscriber}: sync_event delivered"
            else
                log "${provider} -> ${subscriber}: sync_event NOT delivered within ${SYNC_EVENT_TIMEOUT}s"
                failed=1
            fi
        done
    done
    return "${failed}"
}

# ---------------------------------------------------------------------------
# Verify every subscription is still enabled
# ---------------------------------------------------------------------------

# Ask every node for subscriptions with sub_enabled = false.  Returns 0
# when every node answered and none reported a disabled subscription;
# returns 1 if any node could not be queried or listed at least one.
verify_subs_enabled() {
    local node out any_bad=0
    for node in ${NODES}; do
        # Suppress psql stderr -- on a dead server we don't want raw
        # libpq complaints leaking to the terminal; the connection
        # state is reported by print_subscription_state_to_screen.
        out="$(psql_on "${node}" -At -c \
            "SELECT sub_name FROM spock.subscription WHERE NOT sub_enabled ORDER BY sub_name;" \
            2>/dev/null)" \
            || { any_bad=1; log "${node}: NOT reachable -- treating as failure"; continue; }
        if [ -n "${out}" ]; then
            any_bad=1
            # Route the list of disabled subs into main.log -- no
            # screen noise; the final RESULT line will direct the
            # user to the right per-node log file.  The names are
            # joined with spaces via a quoted printf, so they are never
            # subject to word splitting or pathname expansion.
            log "${node}: DISABLED subscriptions: $(printf '%s' "${out}" | tr '\n' ' ')"
        else
            log "${node}: all subscriptions still enabled"
        fi
    done
    return "${any_bad}"
}

# ---------------------------------------------------------------------------
# Failure diagnostics
# ---------------------------------------------------------------------------

dump_logs_on_failure() {
    # Quiet by design: per-phase logs already hold everything; just point
    # the user at the directory.  No stderr noise.
    say "see ${LOG_DIR}/ for per-instance log files"
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

# Orchestrate the whole run: build, bootstrap, wire the mesh, stress it
# with installcheck, then judge the mesh.  Returns 0 on PASS and 2 when
# a subscription ended up disabled or a sync event failed to propagate.
main() {
    launch_builders
    wait_for_builders || fail "one or more builders failed" 5

    wait_for_all_ready || fail "one or more nodes never became ready" 6

    local node
    for node in ${NODES}; do create_db_for_node "${node}"; done
    for node in ${NODES}; do setup_spock_node "${node}"; done

    wire_full_mesh

    # Wait for every sub to report status='replicating' instead of
    # guessing with sleep.  Same loop pattern as the regression test
    # helper in tests/regress/sql/alter_options.sql.
    local wait_rc=0
    wait_for_mesh_replicating "${WAIT_REPLICATING_TIMEOUT_PRE}" || wait_rc=$?

    # Snapshot the subscription state before hammering n1 with the
    # regression suite -- useful for comparing against the post-run
    # state if something regresses.
    print_subscription_state "before installcheck"

    if [ "${wait_rc}" -ne 0 ]; then
        # The snapshot above lands in one log per node; <node> is a
        # literal placeholder for the user to fill in.
        say "WARNING: not all subscriptions reached 'replicating' before installcheck (see ${LOG_DIR}/<node>.log)"
    fi

    # Always run installcheck.  Its own pass/fail is irrelevant to the
    # script's exit code -- regression failures are expected stress and
    # get logged for diagnostics only.
    run_installcheck

    # Re-snapshot the state -- if installcheck-induced traffic disabled
    # or de-synced a subscription, this is where we see it.
    print_subscription_state "after installcheck"

    # Diagnostic: snapshot whether every sub is back to 'replicating'.
    # This is just a snapshot -- apply workers can flicker through
    # 'down' for a few seconds after installcheck while their
    # restart_delay window elapses, so the result is logged but does
    # NOT decide the test.  sync_event below is what really matters.
    local post_wait_rc=0
    wait_for_mesh_replicating "${WAIT_REPLICATING_TIMEOUT_POST}" \
        || post_wait_rc=$?
    if [ "${post_wait_rc}" -ne 0 ]; then
        log "diagnostic: status!='replicating' on some subs after ${WAIT_REPLICATING_TIMEOUT_POST}s (sync_event below is the authority)"
    fi

    # Authoritative end-to-end check: sync_event proves bytes actually
    # round-trip from each provider to every other node.  Always run
    # this, regardless of the status snapshot above -- a sub that
    # briefly flickered 'down' but is now caught up will deliver
    # sync_event just fine, and the snapshot's verdict is a false
    # negative we should not honor.
    local sync_rc=0
    check_sync_event_propagation || sync_rc=$?

    local verify_rc=0
    verify_subs_enabled || verify_rc=$?

    # Show the user the actual SELECT output for every subscription on
    # every node BEFORE stopping the nodes (psql can't talk to a stopped
    # server).
    print_subscription_state_to_screen

    # And the connection params, so the user can attach if they like.
    # Printed before stop_all_nodes so the info is valid right now in
    # the --keep case.
    print_connection_params

    if [ "${KEEP_RUNNING}" -eq 0 ]; then
        stop_all_nodes
    else
        log "--keep set: leaving nodes running.  Sockets under ${SOCK_DIR}"
    fi

    # Two-pronged decision:
    #   (1) every sub_enabled = true               (admin flag survived)
    #   (2) sync_event round-trips on every edge   (bytes actually flow)
    # The status='replicating' snapshot is diagnostic only -- it can be
    # a false negative right after installcheck while apply workers
    # work through their restart_delay window.
    if [ "${verify_rc}" -ne 0 ] || [ "${sync_rc}" -ne 0 ]; then
        local reason=
        if [ "${verify_rc}" -ne 0 ]; then
            reason="some subscriptions disabled"
        fi
        if [ "${sync_rc}" -ne 0 ]; then
            reason="${reason:+${reason}; }sync_event did not propagate on some edges (see ${LOG_DIR}/sync-event-check.log)"
        fi
        log "RESULT: ${reason}"
        say "RESULT: FAIL -- ${reason} (see output above)"
        return 2
    fi
    log "RESULT: every sub enabled, sync_event round-trips on every edge"
    say "RESULT: PASS -- mesh healthy after installcheck (sub_enabled + sync_event)"
    return 0
}

main "$@"