leducp · leducp · Jun 3, 2026
diff --git a/tests/README.md b/tests/README.md
@@ -84,9 +84,55 @@ endurance is worth more than a long Release soak for finding ordering bugs.
 ```bash
 scripts/configure.sh build_tsan --with=unit_tests --with=tsan
 scripts/setup_build.sh build_tsan && cmake --build build_tsan -j
-TSAN_OPTIONS="suppressions=$PWD/tests/tsan.supp" \
+TSAN_OPTIONS="suppressions=$PWD/tests/tsan.supp:halt_on_error=1" \
   tests/endurance.sh build_tsan/kickmsg_stress_test 14400
 ```
+`halt_on_error=1` makes TSAN stop at the first race with the report intact;
+without it TSAN reports and *continues*, and the run still exits cleanly. The
+endurance harness also greps each run for `ThreadSanitizer`/`AddressSanitizer`/
+`runtime error:` and counts a hit as a failure (`san=` column), so a race is caught either way.
+
+Rungs 3-4 above are the **hands-on** soaks: pick one binary and a duration and
+hammer it (e.g. crash recovery for an hour before a PR, or TSAN stress for an
+afternoon). The rig below is the **unattended** counterpart -- it just cycles
+all of them for you. It is built *on top of* `endurance.sh`, not a replacement;
+keep using the direct form for targeted runs.
+
+### 5. Unattended long-horizon rig -- cycles all profiles
+`tests/soak_all.sh` loops weighted profiles until a wall-clock deadline, each a
+time-sliced `endurance.sh` pass: **TSAN takes half the slices** (rarest,
+highest-value signal), crash fuzz a quarter, plain stress the rest as periodic
+sanity at oversub 150/200. It records a per-slice verdict, persists failing
+runs, and survives any single slice failing. It self-detaches (survives
+logout), runs under `caffeinate` on macOS, and prints a watch/stop dashboard.
+```bash
+# build the plain + crash binaries (build/) and optionally build_tsan/
+tests/soak_all.sh 604800 1800            # 1 week, 30-min slices; returns at once
+# absent build_tsan/ is skipped, not fatal. SOAK_FOREGROUND=1 to run inline.
+```
+Each launch isolates its run under `soak_logs/run_<timestamp>/` (also
+`soak_logs/latest`) with the master log, per-slice logs, and `fails/`. Check
+progress at any time with:
+```bash
+tests/soak_status.sh                     # state, elapsed/remaining, per-profile tally
+```
+The launch dashboard also prints the exact `tail`/stop commands. Interrupting a
+run (Ctrl-C or `kill`) unlinks the active segment, so it leaves no stale
+`/dev/shm` entry behind (`kill -9` is the exception, covered by unlink-before-create).
+
+Slice length (arg 2) is operational, not a coverage knob: a profile's total
+iterations over the run depend only on its *share* of the cycle, not on how
+that time is chunked. Shorter slices just give finer failure-attribution
+checkpoints. To shift priority between profiles, edit the weighting in the
+`PROFILES` list -- don't lengthen slices.
+
+The **crash test fuzzes its kill timing** (seeded random per round) so a long
+soak explores new crash windows instead of re-hitting a fixed schedule:
+```bash
+build/kickmsg_crash_test                       # seed from clock -- logged at startup
+KICKMSG_CRASH_SEED=12345 build/kickmsg_crash_test   # replay an exact schedule
+KICKMSG_CRASH_ROUNDS=200 build/kickmsg_crash_test   # more kills per invocation
+```
 
 ## Contention scales to your machine
 
@@ -107,6 +153,8 @@ bound it explicitly.
 
 ## Detached long soak (survives logout; keeps the machine awake)
 
+Use this for a *single-profile* `endurance.sh` run (rungs 3-4); the rig in
+rung 5 self-detaches and handles `caffeinate` on its own.
 ```bash
 # macOS: caffeinate prevents idle sleep mid-soak
 nohup caffeinate -i tests/endurance.sh build/kickmsg_stress_test 43200 > soak.log 2>&1 &

diff --git a/tests/crash_test.cc b/tests/crash_test.cc
@@ -24,6 +24,7 @@
 #include <sys/wait.h>
 #include <unistd.h>
 
+#include "shm_cleanup.h"
 #include "kickmsg/os/Time.h"
 #include "kickmsg/Publisher.h"
 #include "kickmsg/Subscriber.h"
@@ -45,6 +46,48 @@ static uint32_t compute_checksum(CrashPayload const& p)
     return p.magic ^ p.seq ^ 0xBAADF00D;
 }
 
+// --- Seeded kill-timing fuzzer ---------------------------------------------
+// Each kill fires after a pseudo-random delay so a long soak explores new
+// crash windows instead of re-hitting a fixed schedule.  The seed is logged at
+// startup; set KICKMSG_CRASH_SEED to replay the same sequence of delays.
+namespace
+{
+    uint64_t g_rng_state = 0; // generator state; overwritten by seed_fuzzer()
+
+    uint64_t next_rand() // splitmix64 step: advance the state, return the mixed value
+    {
+        g_rng_state += 0x9E3779B97F4A7C15ull; // fixed odd increment per draw
+        uint64_t z = g_rng_state;
+        z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; // two xor-shift/multiply mixing rounds
+        z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+        return z ^ (z >> 31);
+    }
+
+    // Sleep a random duration in [lo_us, hi_us] microseconds, inclusive; requires lo_us <= hi_us.
+    void sleep_rand(uint64_t lo_us, uint64_t hi_us)
+    {
+        uint64_t span = hi_us - lo_us + 1; // unsigned arithmetic: wraps if lo_us > hi_us
+        kickmsg::sleep(microseconds{static_cast<int64_t>(lo_us + next_rand() % span)});
+    }
+
+    uint64_t seed_fuzzer() // seeds g_rng_state and returns the seed so the caller can log it
+    {
+        uint64_t seed;
+        char const* env = std::getenv("KICKMSG_CRASH_SEED");
+        if (env != nullptr)
+        {
+            seed = std::strtoull(env, nullptr, 0); // base 0: decimal, 0x hex or 0-prefixed octal; unparsable text yields 0
+        }
+        else
+        {
+            // No override: XOR the monotonic_ns() tick count with the pid shifted into the high 32 bits.
+            seed = static_cast<uint64_t>(monotonic_ns().count())
+                 ^ (static_cast<uint64_t>(::getpid()) << 32);
+        }
+        g_rng_state = seed;
+        return seed;
+    }
+}
+
 /// Child publisher: publishes as fast as possible using allocate() + publish()
 /// to maximize the window where a kill can orphan a slot.
 static void child_publisher_main(int /*round*/)
@@ -128,6 +171,7 @@ struct RoundResult
     bool recovered_entries;
     bool recovered_rings;
     bool recovered_slots;
+    bool repair_ok;
     bool subscriber_ok;
 };
 
@@ -156,8 +200,8 @@ static RoundResult run_one_round(int round)
         _exit(0); // never reached
     }
 
-    // Let publisher run for 20-50ms
-    kickmsg::sleep(milliseconds{20 + (round % 30)});
+    // Kill at a fuzzed instant in the publisher's lifecycle (0.2-50ms).
+    sleep_rand(200, 50000);
 
     // Kill publisher mid-flight
     kill(pub_pid, SIGKILL);
@@ -186,7 +230,8 @@ static RoundResult run_one_round(int round)
 
     // Verify clean after repair
     auto post = region.diagnose();
-    if (post.locked_entries > 0 or post.retired_rings > 0)
+    result.repair_ok = (post.locked_entries == 0 and post.retired_rings == 0);
+    if (not result.repair_ok)
     {
         std::fprintf(stderr, "  [FAIL] Round %d: repair incomplete "
                      "(locked=%u, retired=%u)\n",
@@ -382,7 +427,7 @@ static bool test_multi_publisher_crash()
         }
     }
 
-    kickmsg::sleep(30ms);
+    sleep_rand(2000, 50000);
 
     for (int i = 0; i < N_PUBS; ++i)
     {
@@ -452,7 +497,17 @@ static bool test_multi_publisher_crash()
 
 int main()
 {
-    std::printf("=== Kickmsg Multi-Process Crash Test ===\n\n");
+    std::printf("=== Kickmsg Multi-Process Crash Test ===\n");
+    // Clean up segments if interrupted (Ctrl-C / kill) before the test's own
+    // unlink runs.  Installed before forking so children inherit it too.
+    kickmsg_test::register_cleanup_shm("/kickmsg_crash_test");
+    kickmsg_test::register_cleanup_shm("/kickmsg_crash_test_sub");
+    kickmsg_test::register_cleanup_shm("/kickmsg_crash_test_multi");
+    kickmsg_test::install_signal_cleanup();
+
+    uint64_t const seed = seed_fuzzer();
+    std::printf("crash fuzz seed=%llu (set KICKMSG_CRASH_SEED to replay)\n\n",
+                static_cast<unsigned long long>(seed));
 
     kickmsg::SharedMemory::unlink(SHM_NAME);
 
@@ -490,7 +545,15 @@ int main()
     // Let subscriber attach
     kickmsg::sleep(50ms);
 
-    constexpr int NUM_ROUNDS = 10;
+    int NUM_ROUNDS = 30;
+    if (char const* r = std::getenv("KICKMSG_CRASH_ROUNDS"))
+    {
+        int v = std::atoi(r);
+        if (v > 0)
+        {
+            NUM_ROUNDS = v;
+        }
+    }
     int any_recovery = 0;
     bool all_ok = true;
 
@@ -501,6 +564,10 @@ int main()
         {
             ++any_recovery;
         }
+        if (not result.repair_ok)
+        {
+            all_ok = false;
+        }
     }
 
     // Signal subscriber to exit

diff --git a/tests/endurance.sh b/tests/endurance.sh
@@ -38,8 +38,10 @@ while [ "$(date +%s)" -lt "$END_TIME" ]; do
     fi
     SUMMARY=$(echo "$OUTPUT" | grep "Summary:" | tail -1 || true)
     if [ -n "$SUMMARY" ]; then
-        RUN_PASS=$(echo "$SUMMARY" | grep -oE '[0-9]+ passed' | grep -oE '[0-9]+')
-        RUN_FAIL=$(echo "$SUMMARY" | grep -oE '[0-9]+ failed' | grep -oE '[0-9]+')
+        # Guarded: a garbled/interleaved Summary line (possible under heavy
+        # sanitizer contention) must not let set -e kill the whole soak.
+        RUN_PASS=$(echo "$SUMMARY" | grep -oE '[0-9]+ passed' | grep -oE '[0-9]+' || true)
+        RUN_FAIL=$(echo "$SUMMARY" | grep -oE '[0-9]+ failed' | grep -oE '[0-9]+' || true)
     else
         # No summary line (e.g. crash test): tally by exit code.
         if [ "$RC" -eq 0 ]; then
@@ -53,15 +55,25 @@ while [ "$(date +%s)" -lt "$END_TIME" ]; do
     RUN_PASS=${RUN_PASS:-0}
     RUN_FAIL=${RUN_FAIL:-0}
     RUN_REORDER=$(echo "$OUTPUT" | { grep -c "REORDER" || true; })
+    # Sanitizer reports (TSAN/ASAN/UBSAN) go to stderr and do NOT bump the
+    # suite's "failed" count -- detect them explicitly or they get swallowed.
+    RUN_SANITIZER=$(echo "$OUTPUT" | { grep -c -E "ThreadSanitizer|AddressSanitizer|runtime error:" || true; })
+    if [ "$RUN_SANITIZER" -gt 0 ] && [ "$RUN_FAIL" -eq 0 ]; then
+        RUN_FAIL=1
+    fi
     PASS=$((PASS + RUN_PASS))
     FAIL=$((FAIL + RUN_FAIL))
     REORDERS=$((REORDERS + RUN_REORDER))
     ELAPSED=$(($(date +%s) - END_TIME + DURATION_SECS))
-    printf "\r[%ds/%ds] runs=%d pass=%d fail=%d reorders=%d" \
-           "$ELAPSED" "$DURATION_SECS" "$RUNS" "$PASS" "$FAIL" "$REORDERS"
-    if [ "$RUN_FAIL" -gt 0 ]; then
+    printf "\r[%ds/%ds] runs=%d pass=%d fail=%d reorders=%d san=%d" \
+           "$ELAPSED" "$DURATION_SECS" "$RUNS" "$PASS" "$FAIL" "$REORDERS" "$RUN_SANITIZER"
+    if [ "$RUN_FAIL" -gt 0 ] || [ "$RC" -ne 0 ]; then
+        # Persist the full run output -- the evidence is otherwise lost.
+        FAILDIR="${FAILDIR:-endurance_fails}"
+        mkdir -p "$FAILDIR"
+        printf '%s\n' "$OUTPUT" > "$FAILDIR/run_${RUNS}_rc${RC}.log"
         echo ""
-        echo "$OUTPUT" | grep -E "REORDER|FAIL|WARN" || true
+        echo "$OUTPUT" | grep -E "REORDER|FAIL|WARN|ThreadSanitizer|runtime error:" || true
     fi
 done
 echo ""

diff --git a/tests/shm_cleanup.h b/tests/shm_cleanup.h
@@ -0,0 +1,61 @@
+#ifndef KICKMSG_TESTS_SHM_CLEANUP_H
+#define KICKMSG_TESTS_SHM_CLEANUP_H
+
+// Best-effort shm cleanup for the test binaries: on SIGINT/SIGTERM, unlink the
+// registered segments so an interrupted run leaves no stale /dev/shm entry for
+// the next run to trip over.  Names must be string literals (static storage);
+// the handler only calls shm_unlink + _exit.  _exit is async-signal-safe;
+// shm_unlink is not on the POSIX list, hence "best-effort".  SIGKILL (-9)
+// cannot be caught; each scenario's unlink-before-create handles those leftovers.
+
+#ifndef _WIN32
+#include <csignal>
+#include <cstring>
+#include <sys/mman.h>
+#include <unistd.h>
+
+namespace kickmsg_test
+{
+    inline constexpr int MAX_CLEANUP = 32; // registry capacity; registrations beyond it are dropped
+    // volatile: both the name slots and the count are read from a signal handler.
+    inline char const* volatile g_cleanup_names[MAX_CLEANUP] = {};
+    inline volatile sig_atomic_t g_cleanup_count = 0;
+
+    inline void shm_cleanup_handler(int sig) // signal handler: unlink every registered name, then exit
+    {
+        for (sig_atomic_t i = 0; i < g_cleanup_count; ++i)
+        {
+            ::shm_unlink(g_cleanup_names[i]); // return value ignored
+        }
+        ::_exit(128 + sig); // exit status is 128 + signal number
+    }
+
+    inline void register_cleanup_shm(char const* name) // stores the pointer only (no copy); no-op once the table is full
+    {
+        if (g_cleanup_count < MAX_CLEANUP)
+        {
+            g_cleanup_names[g_cleanup_count] = name; // fill the slot first...
+            g_cleanup_count = g_cleanup_count + 1; // ...then make it visible through the count
+        }
+    }
+
+    inline void install_signal_cleanup() // routes SIGINT and SIGTERM to shm_cleanup_handler
+    {
+        struct sigaction sa;
+        std::memset(&sa, 0, sizeof(sa));
+        sa.sa_handler = shm_cleanup_handler;
+        sigemptyset(&sa.sa_mask); // no additional signals blocked while the handler runs
+        sa.sa_flags = 0;
+        ::sigaction(SIGINT, &sa, nullptr); // sigaction return values are not checked
+        ::sigaction(SIGTERM, &sa, nullptr);
+    }
+}
+#else
+namespace kickmsg_test
+{
+    inline void register_cleanup_shm(char const*) {} // no-op stub used when _WIN32 is defined
+    inline void install_signal_cleanup() {} // no-op stub used when _WIN32 is defined
+}
+#endif
+
+#endif