diff --git a/.gitignore b/.gitignore
index be008244..8279b2bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -73,4 +73,8 @@ htmlcov/
 
 # Ad-hoc plan/spec docs (kept out of repo to avoid accidental commits)
 docs/plans/
+docs/specs/
 docs/superpowers/
+
+# Internal benchmark scratch configs/logs/results
+.archie-bench/
diff --git a/CLAUDE.md b/CLAUDE.md
index 7a9bc7c6..2cda86dc 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -52,6 +52,18 @@ npx @bitraptors/archie /path/to/project
 python -m pytest tests/ -v
 ```
 
+### Benchmark Harness (internal)
+```bash
+# Measure Archie effectiveness: same task, control (no Archie) vs treatment (full Archie)
+python3 -m archie.benchmark auto /path/to/repo --prompt "..."   # prep + run from a plain repo
+python3 -m archie.benchmark run config.json                     # run on existing branches
+```
+Internal-only (not shipped via npm). Captures tool calls / tokens / cost / time +
+blind judge-Claude quality, writes to Supabase (`benchmark_runs`, `benchmark_samples`).
+Before benchmarking, copy `archie/benchmark/secrets.env.example` → `.archie-bench/secrets.env`
+and fill in the Supabase URL + service_role key (else runs fall back to offline mode).
+See `archie/benchmark/README.md`.
+
 ## Command Architecture
 
 - **`/archie-deep-scan`** — Comprehensive baseline (15-20 min). Full 2-wave AI analysis producing blueprint, per-folder CLAUDE.md, rules, and health metrics. Rerun to refresh the baseline; each run builds on prior findings.
diff --git a/archie/benchmark/README.md b/archie/benchmark/README.md
new file mode 100644
index 00000000..0ec7f187
--- /dev/null
+++ b/archie/benchmark/README.md
@@ -0,0 +1,74 @@
+# Archie Benchmark Harness (internal)
+
+Measures Archie's effectiveness: runs the **same** task headlessly on a control
+branch (no Archie) and a treatment branch (full Archie docs + hooks), capturing
+tool calls / tokens / cost / time + a blind judge-Claude quality score, and writes
+results to Supabase. **Not** shipped via npm.
+
+## Setup — do this before your first benchmark
+
+Provide Supabase credentials so results are stored (see [Supabase](#supabase) for
+detail). In short:
+
+```bash
+mkdir -p .archie-bench && cp archie/benchmark/secrets.env.example .archie-bench/secrets.env
+# edit .archie-bench/secrets.env: real SUPABASE_URL + service_role key
+set -a; source .archie-bench/secrets.env; set +a
+```
+
+This must be filled in **before** you benchmark if you want results in Supabase. If
+you skip it, runs still work but fall back to **offline mode** (a local
+`results.json`), and nothing is written to the database.
+
+## Usage
+
+```bash
+# Quickest path: from a plain repo, prep branches then run (no config file needed).
+# For more control, author a JSON config (see example below; zero-dep) and use run/prep.
+python3 -m archie.benchmark auto /path/to/repo --prompt "Add a sleep timer feature"
+
+# Or with a config file:
+python3 -m archie.benchmark run config.json     # branches must already exist
+python3 -m archie.benchmark prep config.json    # only create/refresh branches
+```
+
+If the repo has no Archie files yet, `auto`/`prep` create the branches, then pause
+so you can run `/archie-deep-scan` interactively on the treatment branch. That
+deep-scan is **never** counted in the measured metrics.
+
+## Config
+
+```json
+{
+  "name": "bedtime-add-sleep-timer",
+  "repo": "/Users/you/DEV/BedtimeApp",
+  "task_prompt": "Add a sleep timer feature ...",
+  "model": "claude-sonnet-4-6",
+  "repetitions": 3,
+  "branches": {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"},
+  "judge": {"model": "claude-opus-4-8", "rubric": ["correctness", "completeness", "follows_conventions", "no_regressions"]},
+  "timeout_seconds": 3600
+}
+```
+
+## Supabase
+
+Copy the credentials template and fill it in (the copy lives in gitignored
+`.archie-bench/`, so real keys are never committed):
+
+```bash
+mkdir -p .archie-bench && cp archie/benchmark/secrets.env.example .archie-bench/secrets.env
+# edit .archie-bench/secrets.env: real SUPABASE_URL + service_role key
+set -a; source .archie-bench/secrets.env; set +a
+```
+
+`store.py` reads `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` from the environment.
+Without them the harness falls back to offline mode and writes
+`.archie/benchmark/<name>/results.json` inside the benchmarked repo. Use the
+**service_role** key (not anon) so inserts bypass RLS, and apply
+`archie/benchmark/schema.sql` once in the Supabase SQL Editor.
+
+## Fairness invariants
+
+- Identical `task_prompt`, `model`, and harness flags on both arms.
+- Both branches must share a common ancestor commit (enforced via `git merge-base` before any run).
+- Deep-scan prep cost is separate (`prep_cost_usd`), never in sample metrics.
diff --git a/archie/benchmark/__init__.py b/archie/benchmark/__init__.py
new file mode 100644
index 00000000..e5cca8b8
--- /dev/null
+++ b/archie/benchmark/__init__.py
@@ -0,0 +1 @@
+"""Internal Archie effectiveness benchmark harness (not shipped via npm)."""
diff --git a/archie/benchmark/__main__.py b/archie/benchmark/__main__.py
new file mode 100644
index 00000000..9ae637f1
--- /dev/null
+++ b/archie/benchmark/__main__.py
@@ -0,0 +1,4 @@
+from .cli import main
+
+# Entry point for `python -m archie.benchmark`: delegate to the CLI.
+if __name__ == "__main__":
+    main()
diff --git a/archie/benchmark/aggregate.py b/archie/benchmark/aggregate.py
new file mode 100644
index 00000000..11186647
--- /dev/null
+++ b/archie/benchmark/aggregate.py
@@ -0,0 +1,47 @@
+# Per-sample numeric metrics that _arm_stats averages per arm; each one produces
+# a "<field>_mean" key in the arm's stats.
+NUMERIC_FIELDS = ["cost_usd", "tool_calls", "duration_ms", "input_tokens", "output_tokens"]
+
+
+def _mean(values):
+    return sum(values) / len(values) if values else None
+
+
+def _arm_stats(samples):
+    # A sample "attempted" the task if it produced a non-empty diff. Legacy
+    # samples without the flag are treated as attempted (back-compat).
+    stats = {
+        "n": len(samples),
+        "completed_n": sum(1 for s in samples if s.get("completed")),
+        "attempted_n": sum(1 for s in samples if s.get("attempted", True)),
+    }
+    for f in NUMERIC_FIELDS:
+        vals = [s[f] for s in samples if s.get(f) is not None]
+        stats[f + "_mean"] = _mean(vals)
+    # Quality only counts attempts: an empty-diff run that the judge scored low
+    # is "not attempted", not "poor quality" — exclude it from the mean.
+    qvals = [s["quality_score"] for s in samples
+             if s.get("attempted", True) and s.get("quality_score") is not None]
+    stats["quality_mean"] = _mean(qvals)
+    return stats
+
+
+def _pct_lower(treatment, control):
+    """Percent reduction of treatment relative to control (positive = treatment cheaper)."""
+    if treatment is None or control is None or control == 0:
+        return None
+    return round((control - treatment) / control * 100, 1)
+
+
+def aggregate_samples(samples):
+    treatment = [s for s in samples if s.get("arm") == "treatment"]
+    control = [s for s in samples if s.get("arm") == "control"]
+    t_stats = _arm_stats(treatment)
+    c_stats = _arm_stats(control)
+    return {
+        "treatment": t_stats,
+        "control": c_stats,
+        "savings": {
+            "cost_pct": _pct_lower(t_stats["cost_usd_mean"], c_stats["cost_usd_mean"]),
+            "tool_calls_pct": _pct_lower(t_stats["tool_calls_mean"], c_stats["tool_calls_mean"]),
+            "duration_pct": _pct_lower(t_stats["duration_ms_mean"], c_stats["duration_ms_mean"]),
+        },
+    }
diff --git a/archie/benchmark/cli.py b/archie/benchmark/cli.py
new file mode 100644
index 00000000..f9d6be6f
--- /dev/null
+++ b/archie/benchmark/cli.py
@@ -0,0 +1,102 @@
+# archie/benchmark/cli.py
+import argparse
+import sys
+from pathlib import Path
+
+from .config import load_config, parse_config
+from .orchestrator import run_benchmark, prepare_branches
+
+
+def _print_summary(result):
+    agg = result["aggregate"]
+    print("\n=== Benchmark summary ===")
+    for arm in ("treatment", "control"):
+        a = agg[arm]
+        print(f"[{arm}] n={a['n']} attempted={a['attempted_n']} completed={a['completed_n']} "
+              f"cost=${_fmt(a['cost_usd_mean'])} tools={_fmt(a['tool_calls_mean'])} "
+              f"dur={_fmt(a['duration_ms_mean'])}ms quality={_fmt(a['quality_mean'])}")
+    s = agg["savings"]
+    print(f"[savings] cost={_fmt(s['cost_pct'])}%  tools={_fmt(s['tool_calls_pct'])}%  "
+          f"time={_fmt(s['duration_pct'])}%")
+    print(f"[store] {result['store']}")
+
+
+def _fmt(v):
+    return "n/a" if v is None else (f"{v:.2f}" if isinstance(v, float) else str(v))
+
+
+def _cmd_run(args):
+    cfg = load_config(args.config)
+    result = run_benchmark(cfg)
+    _print_summary(result)
+
+
+def _cmd_prep(args):
+    cfg = load_config(args.config)
+    status = prepare_branches(cfg)
+    if status["needs_deep_scan"]:
+        _interactive_deep_scan(cfg)
+    print(f"Branches ready: {cfg.branches}")
+
+
+def _cmd_auto(args):
+    if args.config:
+        cfg = load_config(args.config)
+    else:
+        cfg = parse_config({"name": Path(args.repo).name, "repo": args.repo,
+                            "task_prompt": args.prompt, "model": args.model})
+    status = prepare_branches(cfg)
+    if status["needs_deep_scan"]:
+        _interactive_deep_scan(cfg)
+    result = run_benchmark(cfg)
+    _print_summary(result)
+
+
+def _interactive_deep_scan(cfg):
+    treatment = cfg.branches["treatment"]
+    print("\n" + "=" * 70)
+    print("Archie not found in this repo. Semi-automatic prep:")
+    print(f"  1. In a terminal: git checkout {treatment}")
+    print(f"  2. Install Archie:  npx @bitraptors/archie {cfg.repo}")
+    print("  3. In Claude Code on that branch, run:  /archie-deep-scan")
+    print("  4. Commit the generated files.")
+    print("This deep-scan is NOT counted in the benchmark metrics.")
+    print("=" * 70)
+    input("Press Enter once the treatment branch has committed Archie files... ")
+    # verify
+    from .orchestrator import _git_out, _archie_present  # local import to avoid cycle noise
+    current = _git_out(["rev-parse", "--abbrev-ref", "HEAD"], cfg.repo)
+    _git_out(["checkout", treatment], cfg.repo)
+    present = _archie_present(cfg.repo)
+    _git_out(["checkout", current], cfg.repo)
+    if not present:
+        print("ERROR: no Archie files found on the treatment branch. Aborting.", file=sys.stderr)
+        sys.exit(1)
+
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(prog="archie-benchmark",
+                                     description="Measure Archie effectiveness (control vs treatment).")
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    p_run = sub.add_parser("run", help="run benchmark on existing branches")
+    p_run.add_argument("config", help="path to benchmark config JSON")
+    p_run.set_defaults(func=_cmd_run)
+
+    p_prep = sub.add_parser("prep", help="create/refresh benchmark branches only")
+    p_prep.add_argument("config", help="path to benchmark config JSON")
+    p_prep.set_defaults(func=_cmd_prep)
+
+    p_auto = sub.add_parser("auto", help="prep branches then run, from a plain repo")
+    p_auto.add_argument("repo", nargs="?", help="repo path (when no --config)")
+    p_auto.add_argument("--config", help="path to benchmark config JSON")
+    p_auto.add_argument("--prompt", help="task prompt (when no --config)")
+    p_auto.add_argument("--model", default="claude-sonnet-4-6")
+    p_auto.set_defaults(func=_cmd_auto)
+
+    args = parser.parse_args(argv)
+    args.func(args)
+
+
+# Allow running this module directly as a script.
+if __name__ == "__main__":
+    main()
diff --git a/archie/benchmark/config.py b/archie/benchmark/config.py
new file mode 100644
index 00000000..3cdff169
--- /dev/null
+++ b/archie/benchmark/config.py
@@ -0,0 +1,64 @@
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+
+# Defaults applied by parse_config when the config omits the matching key.
+DEFAULT_JUDGE_MODEL = "claude-opus-4-8"
+DEFAULT_RUBRIC = ["correctness", "completeness", "follows_conventions", "no_regressions"]
+DEFAULT_BRANCHES = {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"}
+DEFAULT_TIMEOUT = 3600
+DEFAULT_REPETITIONS = 3
+# Keys that must be present and non-empty in every config.
+REQUIRED = ("name", "repo", "task_prompt", "model")
+
+
+@dataclass
+class JudgeConfig:
+    """Judge settings: which model scores the diffs and on which rubric axes."""
+    # Judge model name; defaults to DEFAULT_JUDGE_MODEL.
+    model: str = DEFAULT_JUDGE_MODEL
+    # Rubric axis names; each instance gets its own copy of DEFAULT_RUBRIC.
+    rubric: list = field(default_factory=lambda: list(DEFAULT_RUBRIC))
+
+
+@dataclass
+class BenchmarkConfig:
+    """Validated benchmark configuration, as produced by parse_config."""
+    # The four required fields (see REQUIRED).
+    name: str
+    repo: Path
+    task_prompt: str
+    model: str
+    # Maps "treatment" / "control" to git branch names; a fresh copy of DEFAULT_BRANCHES by default.
+    branches: dict = field(default_factory=lambda: dict(DEFAULT_BRANCHES))
+    # parse_config rejects values below 1.
+    repetitions: int = DEFAULT_REPETITIONS
+    judge: JudgeConfig = field(default_factory=JudgeConfig)
+    timeout_seconds: int = DEFAULT_TIMEOUT
+
+
+def parse_config(data):
+    missing = [k for k in REQUIRED if k not in data or data[k] in (None, "")]
+    if missing:
+        raise ValueError(f"config missing required fields: {', '.join(missing)}")
+
+    branches = data.get("branches", dict(DEFAULT_BRANCHES))
+    for arm in ("treatment", "control"):
+        if arm not in branches or not branches[arm]:
+            raise ValueError(f"config.branches missing '{arm}'")
+
+    reps = int(data.get("repetitions", DEFAULT_REPETITIONS))
+    if reps < 1:
+        raise ValueError("repetitions must be >= 1")
+
+    jd = data.get("judge", {}) or {}
+    judge = JudgeConfig(
+        model=jd.get("model") or DEFAULT_JUDGE_MODEL,
+        rubric=jd.get("rubric") or list(DEFAULT_RUBRIC),
+    )
+
+    return BenchmarkConfig(
+        name=data["name"],
+        repo=Path(data["repo"]).expanduser(),
+        task_prompt=data["task_prompt"],
+        model=data["model"],
+        branches={"treatment": branches["treatment"], "control": branches["control"]},
+        repetitions=reps,
+        judge=judge,
+        timeout_seconds=int(data.get("timeout_seconds", DEFAULT_TIMEOUT)),
+    )
+
+
+def load_config(path):
+    return parse_config(json.loads(Path(path).read_text()))
diff --git a/archie/benchmark/diff.py b/archie/benchmark/diff.py
new file mode 100644
index 00000000..03bf1619
--- /dev/null
+++ b/archie/benchmark/diff.py
@@ -0,0 +1,29 @@
+import subprocess
+
+# Build/cache artifacts that are never meaningful for code review. These are
+# excluded from the captured diff so the judge scores only real source changes,
+# even when the target repo lacks a .gitignore for them (the agent may create
+# them as a side effect of running tests). Patterns use git pathspec wildcards
+# (`*` matches across `/`), so `*X*` catches X at any depth.
+# capture_diff turns each entry into a `:(exclude)<glob>` pathspec.
+_NOISE_GLOBS = [
+    "*__pycache__*",
+    "*.pyc",
+    "*.pyo",
+    "*.DS_Store",
+    "*node_modules*",
+    "*.pytest_cache*",
+    "*.mypy_cache*",
+    "*.ruff_cache*",
+]
+
+
+def capture_diff(worktree_path):
+    """Stage everything (so untracked files show) and return the cached diff text,
+    excluding universal build/cache artifacts (see _NOISE_GLOBS)."""
+    subprocess.run(["git", "add", "-A"], cwd=str(worktree_path),
+                   check=True, capture_output=True, text=True)
+    excludes = [f":(exclude){glob}" for glob in _NOISE_GLOBS]
+    result = subprocess.run(
+        ["git", "diff", "--cached", "--", ".", *excludes],
+        cwd=str(worktree_path), check=True, capture_output=True, text=True)
+    return result.stdout
diff --git a/archie/benchmark/isolation.py b/archie/benchmark/isolation.py
new file mode 100644
index 00000000..aaea5d94
--- /dev/null
+++ b/archie/benchmark/isolation.py
@@ -0,0 +1,22 @@
+# archie/benchmark/isolation.py
+import subprocess
+from contextlib import contextmanager
+from pathlib import Path
+
+
+@contextmanager
+def worktree(repo_path, branch, dest):
+    """Create a git worktree for `branch` at `dest`, always removed on exit."""
+    dest = Path(dest)
+    subprocess.run(["git", "worktree", "add", "--force", str(dest), branch],
+                   cwd=str(repo_path), check=True, capture_output=True, text=True)
+    try:
+        yield dest
+    finally:
+        subprocess.run(["git", "worktree", "remove", "--force", str(dest)],
+                       cwd=str(repo_path), capture_output=True, text=True)
+
+
+def prune(repo_path):
+    subprocess.run(["git", "worktree", "prune"],
+                   cwd=str(repo_path), capture_output=True, text=True)
diff --git a/archie/benchmark/judge.py b/archie/benchmark/judge.py
new file mode 100644
index 00000000..210ed07d
--- /dev/null
+++ b/archie/benchmark/judge.py
@@ -0,0 +1,68 @@
+# archie/benchmark/judge.py
+import json
+import subprocess
+
+
+def assign_order(seed):
+    """Return (treatment_variant, control_variant) — blind A/B label assignment."""
+    return ("a", "b") if seed % 2 == 0 else ("b", "a")
+
+
+def build_judge_prompt(task_prompt, diff_a, diff_b, rubric):
+    axes = ", ".join(rubric)
+    schema = ('{"variant_a": {' + ", ".join(f'"{a}": int' for a in rubric)
+              + ', "overall": number, "justification": string}, "variant_b": {... same keys ...}}')
+    return (
+        "You are an impartial senior code reviewer. Two AI agents independently "
+        "attempted the SAME task. You are shown each agent's diff as an anonymous "
+        "variant. Judge purely on the code; you do not know anything about how each "
+        "was produced.\n\n"
+        f"TASK GIVEN TO BOTH AGENTS:\n{task_prompt}\n\n"
+        f"Score each variant on these axes (each 1-10): {axes}. Also give an "
+        "'overall' score (0-10) and a one-sentence 'justification'.\n\n"
+        f"Respond with ONLY a JSON object of this exact shape:\n{schema}\n\n"
+        f"=== VARIANT A DIFF ===\n{diff_a}\n\n"
+        f"=== VARIANT B DIFF ===\n{diff_b}\n"
+    )
+
+
+def parse_judge_output(text):
+    start = text.find("{")
+    end = text.rfind("}")
+    if start == -1 or end == -1 or end < start:
+        raise ValueError("no JSON object found in judge output")
+    return json.loads(text[start:end + 1])
+
+
+def _default_runner(prompt, model, timeout):
+    proc = subprocess.run(
+        ["claude", "-p", prompt, "--model", model, "--output-format", "text"],
+        capture_output=True, text=True, timeout=timeout,
+    )
+    return proc.stdout
+
+
+def run_judge(task_prompt, treatment_diff, control_diff, rubric, model, seed,
+              timeout_seconds=600, _runner=None):
+    t_variant, c_variant = assign_order(seed)
+    diff_a = treatment_diff if t_variant == "a" else control_diff
+    diff_b = treatment_diff if t_variant == "b" else control_diff
+    prompt = build_judge_prompt(task_prompt, diff_a, diff_b, rubric)
+
+    runner = _runner or _default_runner
+    parsed = None
+    last_err = None
+    for _ in range(2):
+        try:
+            parsed = parse_judge_output(runner(prompt, model, timeout_seconds))
+            break
+        except (ValueError, json.JSONDecodeError) as e:
+            last_err = e
+    if parsed is None:
+        raise ValueError(f"judge returned unparseable output twice: {last_err}")
+
+    return {
+        "treatment": parsed["variant_a"] if t_variant == "a" else parsed["variant_b"],
+        "control": parsed["variant_a"] if c_variant == "a" else parsed["variant_b"],
+        "seed": seed,
+    }
diff --git a/archie/benchmark/metrics.py b/archie/benchmark/metrics.py
new file mode 100644
index 00000000..3f9d6121
--- /dev/null
+++ b/archie/benchmark/metrics.py
@@ -0,0 +1,47 @@
+# archie/benchmark/metrics.py
+import json
+from dataclasses import dataclass, field
+
+
+@dataclass
+class SampleMetrics:
+    """Metrics for one headless run, filled in by parse_stream."""
+    # Number of tool_use blocks seen in assistant events, and the per-tool-name counts.
+    tool_calls: int = 0
+    tool_breakdown: dict = field(default_factory=dict)
+    # Token counts copied from the result event's `usage` object.
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cache_read_tokens: int = 0
+    cache_creation_tokens: int = 0
+    # Copied from the result event: total_cost_usd, duration_ms, num_turns.
+    cost_usd: float = 0.0
+    duration_ms: int = 0
+    num_turns: int = 0
+    # True only when the result event's subtype is "success".
+    completed: bool = False
+
+
+def parse_stream(lines):
+    m = SampleMetrics()
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            ev = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        etype = ev.get("type")
+        if etype == "assistant":
+            for block in ev.get("message", {}).get("content", []) or []:
+                if isinstance(block, dict) and block.get("type") == "tool_use":
+                    m.tool_calls += 1
+                    name = block.get("name", "unknown")
+                    m.tool_breakdown[name] = m.tool_breakdown.get(name, 0) + 1
+        elif etype == "result":
+            usage = ev.get("usage", {}) or {}
+            m.input_tokens = usage.get("input_tokens", 0)
+            m.output_tokens = usage.get("output_tokens", 0)
+            m.cache_read_tokens = usage.get("cache_read_input_tokens", 0)
+            m.cache_creation_tokens = usage.get("cache_creation_input_tokens", 0)
+            m.cost_usd = ev.get("total_cost_usd", 0.0)
+            m.duration_ms = ev.get("duration_ms", 0)
+            m.num_turns = ev.get("num_turns", 0)
+            m.completed = ev.get("subtype") == "success"
+    return m
diff --git a/archie/benchmark/orchestrator.py b/archie/benchmark/orchestrator.py
new file mode 100644
index 00000000..7a2285b6
--- /dev/null
+++ b/archie/benchmark/orchestrator.py
@@ -0,0 +1,224 @@
+# archie/benchmark/orchestrator.py
+import subprocess
+import hashlib
+from pathlib import Path
+
+from .isolation import worktree, prune
+from .diff import capture_diff
+from .runner import run_claude
+from .judge import run_judge
+from .store import store_results
+from .aggregate import aggregate_samples
+from .metrics import SampleMetrics
+
+
+def _git_out(args, cwd):
+    return subprocess.run(["git", *args], cwd=str(cwd), check=True,
+                          capture_output=True, text=True).stdout.strip()
+
+
+def _base_commit(repo):
+    return _git_out(["rev-parse", "HEAD"], repo)
+
+
+def _merge_base(repo, a, b):
+    """The common-ancestor commit of two branches.
+
+    Used to verify both arms descend from the same base. We compare the
+    merge-base (not branch tips): prep intentionally adds a strip/deep-scan
+    commit to an arm, so the tips legitimately differ while the shared base
+    does not.
+    """
+    return _git_out(["merge-base", a, b], repo)
+
+
+def _seed(name, repetition):
+    h = hashlib.sha256(f"{name}:{repetition}".encode("utf-8")).hexdigest()
+    return int(h[:8], 16)
+
+
+def _worktrees_root(repo):
+    root = Path(repo) / ".archie" / "benchmark" / "worktrees"
+    root.mkdir(parents=True, exist_ok=True)
+    return root
+
+
+def _run_one(cfg, branch, repetition, run_fn, diff_fn):
+    """Run a single (branch, repetition) sample; return (metrics, diff)."""
+    root = _worktrees_root(cfg.repo)
+    dest = root / f"{branch.replace('/', '_')}-{repetition}"
+    with worktree(cfg.repo, branch, dest) as wt:
+        try:
+            metrics, _raw = run_fn(cfg.task_prompt, cfg.model, wt, cfg.timeout_seconds)
+        except Exception:
+            return SampleMetrics(completed=False), ""
+        diff = diff_fn(wt)
+        return metrics, diff
+
+
+def _sample_row(arm, repetition, metrics, quality_score, quality_detail, seed, attempted):
+    return {
+        "arm": arm,
+        "repetition": repetition,
+        "tool_calls": metrics.tool_calls,
+        "tool_breakdown": metrics.tool_breakdown,
+        "input_tokens": metrics.input_tokens,
+        "output_tokens": metrics.output_tokens,
+        "cache_read_tokens": metrics.cache_read_tokens,
+        "cache_creation_tokens": metrics.cache_creation_tokens,
+        "cost_usd": metrics.cost_usd,
+        "duration_ms": metrics.duration_ms,
+        "num_turns": metrics.num_turns,
+        "completed": metrics.completed,
+        "attempted": attempted,
+        "quality_score": quality_score,
+        "quality_detail": quality_detail,
+        "judge_seed": seed,
+    }
+
+
+def run_benchmark(cfg, run_fn=run_claude, judge_fn=run_judge,
+                  store_fn=store_results, diff_fn=capture_diff):
+    # Fairness guard: both arms must descend from the same base commit. We
+    # compare the merge-base, not branch tips — prep adds a strip/deep-scan
+    # commit to an arm, so tips differ while the common base must not.
+    try:
+        base = _merge_base(cfg.repo, cfg.branches["treatment"], cfg.branches["control"])
+    except subprocess.CalledProcessError:
+        base = ""
+    if not base:
+        raise ValueError(
+            "benchmark arms have no common ancestor base commit; both branches "
+            "must branch from the same commit")
+
+    prune(cfg.repo)
+    samples = []
+    for rep in range(cfg.repetitions):
+        t_metrics, t_diff = _run_one(cfg, cfg.branches["treatment"], rep, run_fn, diff_fn)
+        c_metrics, c_diff = _run_one(cfg, cfg.branches["control"], rep, run_fn, diff_fn)
+
+        # "Attempted" = the agent actually produced a code change. An empty
+        # diff means the task was not attempted, regardless of how the judge
+        # scores it — tracked so it can be excluded from quality means.
+        t_attempted = bool((t_diff or "").strip())
+        c_attempted = bool((c_diff or "").strip())
+
+        seed = _seed(cfg.name, rep)
+        try:
+            verdict = judge_fn(cfg.task_prompt, t_diff, c_diff, cfg.judge.rubric,
+                               cfg.judge.model, seed)
+            t_q = verdict["treatment"]
+            c_q = verdict["control"]
+        except Exception:
+            # A judge failure must not discard the (expensive) completed runs;
+            # record the samples without a quality score instead of aborting.
+            t_q = c_q = None
+        samples.append(_sample_row("treatment", rep, t_metrics,
+                                    t_q.get("overall") if t_q else None, t_q, seed,
+                                    t_attempted))
+        samples.append(_sample_row("control", rep, c_metrics,
+                                    c_q.get("overall") if c_q else None, c_q, seed,
+                                    c_attempted))
+    prune(cfg.repo)
+
+    agg = aggregate_samples(samples)
+    run_row = {
+        "name": cfg.name,
+        "repo_name": Path(cfg.repo).name,
+        "task_prompt": cfg.task_prompt,
+        "model": cfg.model,
+        "judge_model": cfg.judge.model,
+        "repetitions": cfg.repetitions,
+        "git_base_commit": base,
+        "prep_cost_usd": None,
+        "archie_version": _archie_version(),
+    }
+    offline_path = Path(cfg.repo) / ".archie" / "benchmark" / cfg.name / "results.json"
+    store_result = store_fn(run_row, samples, offline_path)
+    return {"aggregate": agg, "samples": samples, "store": store_result, "run": run_row}
+
+
+def _archie_version():
+    try:
+        from archie import __version__
+        return __version__
+    except Exception:
+        return "unknown"
+
+
+# Repo-root paths treated as Archie artifacts: _archie_present checks for them
+# and _strip_archie_on_branch removes them from the control branch.
+ARCHIE_PATHS = ["CLAUDE.md", "AGENTS.md", ".claude", ".archie"]
+
+
+def _is_clean(repo):
+    out = _git_out(["status", "--porcelain"], repo)
+    return out == ""
+
+
+def _archie_present(repo):
+    return any((Path(repo) / p).exists() for p in ARCHIE_PATHS)
+
+
+def _branch_exists(repo, branch):
+    res = subprocess.run(["git", "rev-parse", "--verify", branch],
+                         cwd=str(repo), capture_output=True, text=True)
+    return res.returncode == 0
+
+
+def _create_branch(repo, branch, base):
+    if _branch_exists(repo, branch):
+        subprocess.run(["git", "branch", "-D", branch], cwd=str(repo),
+                       capture_output=True, text=True)
+    _git_out(["branch", branch, base], repo)
+
+
+def _strip_archie_on_branch(repo, branch):
+    """Check out branch, remove Archie artifacts (incl. per-folder CLAUDE.md), commit.
+
+    The previously checked-out ref is restored afterwards, even if a step fails.
+    A commit is made only when the removals left the working tree dirty.
+    """
+    # NOTE(review): on a detached HEAD `--abbrev-ref` prints "HEAD", so the final
+    # checkout would not leave `branch` — confirm callers never run detached.
+    current = _git_out(["rev-parse", "--abbrev-ref", "HEAD"], repo)
+    _git_out(["checkout", branch], repo)
+    try:
+        # Remove the root-level ARCHIE_PATHS entries. Best-effort: the exit
+        # status is ignored and --ignore-unmatch tolerates missing paths.
+        subprocess.run(["git", "rm", "-r", "--quiet", "--ignore-unmatch",
+                        *ARCHIE_PATHS], cwd=str(repo), capture_output=True, text=True)
+        # nested per-folder CLAUDE.md files. Use -z (NUL-delimited) so paths
+        # containing spaces (e.g. Xcode "Button icons/") are not fragmented —
+        # splitting on whitespace would break them and git rm would skip them,
+        # leaking Archie context onto the control arm.
+        out = subprocess.run(["git", "ls-files", "-z", "*/CLAUDE.md"], cwd=str(repo),
+                             capture_output=True, text=True).stdout
+        nested = [p for p in out.split("\0") if p]
+        if nested:
+            subprocess.run(["git", "rm", "--quiet", "--ignore-unmatch", *nested],
+                           cwd=str(repo), capture_output=True, text=True)
+        # Commit only if something was actually removed.
+        if not _is_clean(repo):
+            _git_out(["commit", "-m", "benchmark: strip Archie artifacts (control arm)"], repo)
+    finally:
+        _git_out(["checkout", current], repo)
+
+
+def prepare_branches(cfg):
+    """Create control (no Archie) and treatment (with Archie) branches from current HEAD.
+
+    Returns a status dict; if Archie is absent, `needs_deep_scan` is True and the
+    caller (cli) must run the interactive deep-scan on the treatment branch.
+    """
+    repo = cfg.repo
+    if not _is_clean(repo):
+        raise ValueError("working tree is not clean; commit or stash before benchmarking")
+
+    base = _base_commit(repo)
+    archie_present = _archie_present(repo)
+
+    _create_branch(repo, cfg.branches["treatment"], base)
+    _create_branch(repo, cfg.branches["control"], base)
+
+    if archie_present:
+        _strip_archie_on_branch(repo, cfg.branches["control"])
+    # if absent, control already has no Archie files; treatment will be populated
+    # by the interactive deep-scan (cli handles the pause).
+
+    return {
+        "archie_present": archie_present,
+        "needs_deep_scan": not archie_present,
+        "base": base,
+        "branches": cfg.branches,
+    }
diff --git a/archie/benchmark/runner.py b/archie/benchmark/runner.py
new file mode 100644
index 00000000..e4475f6f
--- /dev/null
+++ b/archie/benchmark/runner.py
@@ -0,0 +1,55 @@
+# archie/benchmark/runner.py
+import subprocess
+from .metrics import parse_stream
+
+# Headless `claude -p` runs with no human in the loop. Global/project agent rules
+# ("describe your approach and wait for approval", "ask clarifying questions",
+# "stop and split tasks > 3 files") otherwise make the agent plan-and-stop without
+# editing, yielding an empty diff on both arms. This preamble — applied identically
+# to both arms, so it stays fair — forces autonomous completion. The same on-disk
+# Archie context (or its absence) is still what differs between arms.
+# build_prompt prepends this text verbatim to the task prompt.
+AUTONOMY_PREAMBLE = (
+    "You are running fully autonomously in a non-interactive headless session. "
+    "There is NO human available to answer questions or approve anything; if you "
+    "stop to ask or to wait for approval, the task simply fails.\n"
+    "Implement the task below completely and immediately:\n"
+    "- Edit the files directly to a finished, working state.\n"
+    "- Do NOT ask clarifying questions — make reasonable assumptions and proceed.\n"
+    "- Do NOT stop to present a plan or wait for approval; just do the work.\n"
+    "- Do NOT merely analyze, summarize, or hand off to a plan — produce the "
+    "actual code changes.\n"
+    "- Ignore any rule that says to pause, seek confirmation, or split the work "
+    "across sessions; finish it now, in this session.\n\n"
+    "=== TASK ===\n"
+)
+
+
+def build_prompt(task_prompt):
+    """Wrap the raw task in the autonomy preamble (identical for both arms)."""
+    return AUTONOMY_PREAMBLE + task_prompt
+
+
+def run_claude(prompt, model, cwd, timeout_seconds):
+    """Run a headless Claude Code session in `cwd`; return (SampleMetrics, raw_stdout).
+
+    Both benchmark arms must call this with identical flags — the only difference
+    between arms is the on-disk files (CLAUDE.md / .claude hooks), never the flags.
+    """
+    cmd = [
+        "claude", "-p", build_prompt(prompt),
+        "--model", model,
+        "--output-format", "stream-json", "--verbose",
+        "--permission-mode", "acceptEdits",
+    ]
+    try:
+        proc = subprocess.run(cmd, cwd=str(cwd), capture_output=True,
+                              text=True, timeout=timeout_seconds)
+        metrics = parse_stream(proc.stdout.splitlines())
+        return metrics, proc.stdout
+    except subprocess.TimeoutExpired as e:
+        partial = e.output or ""
+        if isinstance(partial, bytes):
+            partial = partial.decode("utf-8", "replace")
+        metrics = parse_stream(partial.splitlines())
+        metrics.completed = False
+        return metrics, partial
diff --git a/archie/benchmark/schema.sql b/archie/benchmark/schema.sql
new file mode 100644
index 00000000..62f117e0
--- /dev/null
+++ b/archie/benchmark/schema.sql
@@ -0,0 +1,62 @@
+-- archie/benchmark/schema.sql
+-- Archie benchmark harness — Supabase schema (v1).
+-- Run manually against the project (or via CI). Safe to re-run, but IF NOT EXISTS only skips tables that already exist — it never alters them, so new columns need a manual migration.
+
+create table if not exists benchmark_runs (  -- one row per benchmark run
+    id              uuid primary key default gen_random_uuid(),
+    name            text not null,        -- benchmark case name
+    repo_name       text,                 -- basename only, never a full path
+    task_prompt     text,                 -- raw task text given to both arms
+    model           text,                 -- model used for the measured runs (same for both arms)
+    judge_model     text,                 -- model used for the blind quality judge
+    repetitions     int,                  -- repetitions per arm
+    git_base_commit text,                 -- base commit, kept for reproducibility
+    prep_cost_usd   numeric,              -- deep-scan prep cost, separate & best-effort (null if not metered)
+    archie_version  text,
+    created_at      timestamptz not null default now()
+);
+
+create table if not exists benchmark_samples (  -- one row per (arm x repetition)
+    id                    uuid primary key default gen_random_uuid(),
+    run_id                uuid not null references benchmark_runs(id) on delete cascade,  -- deleting a run deletes its samples
+    arm                   text not null,  -- 'control' | 'treatment'
+    repetition            int,
+    tool_calls            int,            -- total tool_use blocks in the session
+    tool_breakdown        jsonb,          -- per-tool counts, e.g. {"Edit":4,"Read":9,"Bash":2}
+    input_tokens          int,
+    output_tokens         int,
+    cache_read_tokens     int,
+    cache_creation_tokens int,
+    cost_usd              numeric,
+    duration_ms           int,
+    num_turns             int,
+    completed             boolean,        -- session ended with a success result (false on timeout/error)
+    attempted             boolean,        -- agent produced a non-empty diff
+    quality_score         numeric,        -- judge overall score; null when not judged
+    quality_detail        jsonb,          -- per-axis judge scores + justification
+    judge_seed            int,            -- seed behind the judge's randomized A/B ordering
+    created_at            timestamptz not null default now()
+);
+
+create index if not exists benchmark_samples_run_id_idx on benchmark_samples(run_id);  -- serves the run_id join in benchmark_summary
+
+-- Per-run, per-arm rollup the website reads (separate spec): one row per (run, arm).
+create or replace view benchmark_summary as
+select
+    r.id            as run_id,
+    r.name          as name,
+    r.repo_name     as repo_name,
+    r.model         as model,
+    s.arm           as arm,
+    count(*)                          as samples,
+    count(*) filter (where s.completed) as completed_samples,
+    count(*) filter (where s.attempted) as attempted_samples,
+    avg(s.tool_calls)                 as tool_calls_mean,   -- efficiency means cover ALL samples, incl. non-completed ones
+    avg(s.cost_usd)                   as cost_usd_mean,
+    avg(s.duration_ms)                as duration_ms_mean,
+    avg(s.input_tokens + s.output_tokens) as total_tokens_mean,  -- input + output only; cache tokens are not counted
+    -- quality only over real attempts (empty-diff runs excluded)
+    avg(s.quality_score) filter (where s.attempted) as quality_mean
+from benchmark_runs r
+join benchmark_samples s on s.run_id = r.id  -- inner join: a run with no samples does not appear
+group by r.id, r.name, r.repo_name, r.model, s.arm;
diff --git a/archie/benchmark/secrets.env.example b/archie/benchmark/secrets.env.example
new file mode 100644
index 00000000..13c722a5
--- /dev/null
+++ b/archie/benchmark/secrets.env.example
@@ -0,0 +1,23 @@
+# Archie benchmark — Supabase credentials TEMPLATE.
+#
+# This file is tracked so others know what to provide. Do NOT put real keys here.
+# Copy it to a gitignored location and fill in your values:
+#
+#   cp archie/benchmark/secrets.env.example .archie-bench/secrets.env
+#   # then edit .archie-bench/secrets.env with your real URL + service_role key
+#
+# Load it before a run (exports the vars into the environment):
+#
+#   set -a; source .archie-bench/secrets.env; set +a
+#
+# .archie-bench/ is gitignored — keep real keys there, never commit a filled copy.
+#
+# Notes:
+# - URL: no trailing slash, no /rest/v1 — store.py appends /rest/v1/<table>.
+# - KEY: must be the service_role key (NOT anon) so inserts bypass RLS.
+#        Supabase -> Project Settings -> API -> service_role.
+# - One-time: apply archie/benchmark/schema.sql in the Supabase SQL Editor.
+# - If either var is unset or empty, the harness runs OFFLINE (writes a local results.json).
+
+SUPABASE_URL=https://REPLACE-WITH-PROJECT-REF.supabase.co
+SUPABASE_SERVICE_KEY=REPLACE-WITH-SERVICE-ROLE-KEY
diff --git a/archie/benchmark/store.py b/archie/benchmark/store.py
new file mode 100644
index 00000000..3df6adae
--- /dev/null
+++ b/archie/benchmark/store.py
@@ -0,0 +1,43 @@
+# archie/benchmark/store.py
+import json
+import os
+import urllib.request
+from pathlib import Path
+
+
+def _env():  # -> (SUPABASE_URL, SUPABASE_SERVICE_KEY); each item is None when unset
+    return tuple(os.environ.get(name) for name in ("SUPABASE_URL", "SUPABASE_SERVICE_KEY"))
+
+
+def _post(url, key, table, rows):
+    """POST `rows` as JSON to `<url>/rest/v1/<table>` and return the decoded JSON reply."""
+    req = urllib.request.Request(
+        f"{url.rstrip('/')}/rest/v1/{table}",  # tolerate a trailing slash in SUPABASE_URL
+        data=json.dumps(rows).encode("utf-8"),
+        headers={
+            "apikey": key,
+            "Authorization": f"Bearer {key}",
+            "Content-Type": "application/json",
+            "Prefer": "return=representation",
+        },
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def store_results(run_row, sample_rows, offline_path, _poster=None):
+    """Persist one run: insert into Supabase, or dump JSON to `offline_path` without creds."""
+    url, key = _env()
+    if url and key:
+        send = _poster or _post
+        # The run row goes in first; the id it comes back with links each sample to it.
+        run_id = send(url, key, "benchmark_runs", [run_row])[0]["id"]
+        for row in sample_rows:
+            row["run_id"] = run_id  # note: stamps the caller's dicts in place
+        send(url, key, "benchmark_samples", sample_rows)
+        return {"mode": "online", "run_id": run_id}
+    target = Path(offline_path)  # offline fallback: URL or key is missing
+    target.parent.mkdir(parents=True, exist_ok=True)
+    target.write_text(json.dumps({"run": run_row, "samples": sample_rows}, indent=2))
+    return {"mode": "offline", "path": str(target)}
diff --git a/docs/specs/2026-06-02-archie-benchmark-harness-design.md b/docs/specs/2026-06-02-archie-benchmark-harness-design.md
deleted file mode 100644
index f131bfa5..00000000
--- a/docs/specs/2026-06-02-archie-benchmark-harness-design.md
+++ /dev/null
@@ -1,231 +0,0 @@
-# Archie Benchmark Harness — Design
-
-**Date:** 2026-06-02
-**Status:** Approved (design) — pending implementation plan
-**Scope:** Internal benchmarking tool that measures Archie's effectiveness by running the *same* task with and without Archie's generated context, capturing efficiency + quality metrics, and storing results in Supabase. Website display is a **separate follow-up spec**.
-
----
-
-## 1. Purpose
-
-Prove (or disprove) Archie's value with hard numbers. For a given repository and a given coding task, run an identical headless Claude Code session in two arms:
-
-- **control** — repo without any Archie artifacts.
-- **treatment** — repo with the full Archie experience: root `CLAUDE.md` / `AGENTS.md`, per-folder `CLAUDE.md` context files, rules, **and** the real-time enforcement hooks.
-
-For each arm we capture **tool calls, tokens, cost, wall-clock duration** (efficiency) **and a blind judge-Claude quality score** (correctness/completeness/conventions). Measuring cost alone is misleading — an agent that does nothing is cheapest — so quality is a first-class output.
-
-The benchmark is an **internal tool** (the team runs it on controlled repos to produce marketing numbers). The Supabase write key lives in a local `.env` / CI secret; results are written directly. No end-user consent/anonymization layer is in scope.
-
----
-
-## 2. Key Decisions (resolved during brainstorming)
-
-| Decision | Choice |
-|---|---|
-| What we measure | Efficiency (tool calls, tokens, cost, time) **+ quality** (judge score) |
-| Execution engine | **Claude Code headless** — `claude -p ... --output-format stream-json` |
-| Treatment contents | **Everything**: context docs + rules + enforcement hooks |
-| Arm source | **Branch-based**: a treatment branch (with Archie files) and a control branch (without). Tool can prep both from a plain repo. |
-| Quality measurement | **Blind judge-Claude** scored against a rubric (no pre-written tests required) |
-| Repetitions | **Configurable, default 3** per arm (average + spread) |
-| Tool scope | **Internal** — direct Supabase write via service key |
-| Website | **Out of scope** — separate follow-up spec |
-| Deep-scan prep (when repo has no Archie yet) | **Semi-automatic**: tool prepares branch + installs Archie, then pauses for the user to run `/archie-deep-scan` interactively, then resumes |
-| Config format | **JSON** (zero-dep; YAML/TOML rejected — Archie targets Python 3.9+, stdlib only) |
-
-**Critical invariant:** the deep-scan that *generates* the treatment artifacts is **never** counted in the measured metrics. It runs before measurement, on a separate branch, and its cost is logged separately as `prep_cost`.
-
----
-
-## 3. Architecture
-
-New **internal** Python package (zero-dep stdlib; **not** copied into the npm package, **not** a `standalone/` script):
-
-```
-archie/benchmark/
-  __init__.py
-  cli.py          # entry: `python3 -m archie.benchmark {auto,run,prep} <args>`
-  config.py       # JSON config read + validation
-  isolation.py    # git worktree lifecycle: add / cwd / cleanup / prune
-  runner.py       # one `claude -p` run in a worktree, stream-json
-  metrics.py      # stream-json parse -> {tool_calls, tool_breakdown, tokens, cost, duration, turns, completed}
-  judge.py        # blind judge-Claude call -> rubric scores (forced JSON)
-  diff.py         # `git diff` + untracked files for an arm
-  store.py        # Supabase PostgREST write (urllib, service key from .env) + offline fallback
-  orchestrator.py # full run: prep -> (arm x repetition) matrix -> aggregate -> store -> summary
-  schema.sql      # versioned Supabase DDL (tables + summary view)
-tests/benchmark/  # pytest; claude/supabase/git external calls mocked
-```
-
-Each file has a single responsibility and is independently testable. `runner`, `judge`, and `store` wrap the only external side effects (claude CLI, HTTP) so they mock cleanly. `cli.py` is a thin arg-parse + `orchestrator` call.
-
----
-
-## 4. Config format (JSON)
-
-One file describes one benchmark case:
-
-```json
-{
-  "name": "bedtime-add-sleep-timer",
-  "repo": "/Users/csacsi/DEV/BedtimeApp",
-  "task_prompt": "Add a sleep timer feature: a setting that stops audio playback after a chosen duration. Wire it into the existing player and settings UI.",
-  "model": "claude-sonnet-4-6",
-  "repetitions": 3,
-  "branches": {
-    "treatment": "archie-bench/with-archie",
-    "control":   "archie-bench/no-archie"
-  },
-  "judge": {
-    "model": "claude-opus-4-8",
-    "rubric": ["correctness", "completeness", "follows_conventions", "no_regressions"]
-  },
-  "timeout_seconds": 3600
-}
-```
-
-Rules:
-
-- **`task_prompt` is byte-for-byte identical** across both arms and **never mentions Archie**. The presence of context files is the only difference.
-- `model` is the same for both arms (fixed, so the model is not a confounding variable).
-- `branches`: if both exist → start from them. If missing, the CLI offers prep (§6).
-- `judge.rubric`: customizable; each axis scored 1–10 plus a short justification.
-- `timeout_seconds`: hard cap per `claude -p` run (default **3600**), overridable.
-
----
-
-## 5. Data flow — one sample (one arm, one repetition)
-
-1. **Worktree:** `git worktree add <repo>/.archie/benchmark/worktrees/<branch>-<rep> <branch>` → fresh isolated checkout. Every repetition gets its own worktree (Claude mutates files; worktrees are not shared).
-2. **Run** in the worktree (`cwd=<worktree>`):
-   ```
-   claude -p "<task_prompt>" \
-     --model <model> \
-     --output-format stream-json --verbose \
-     --permission-mode acceptEdits
-   ```
-   Both arms get **identical** harness flags. The treatment arm picks up Archie hooks from the repo's `.claude/settings` and auto-loads `CLAUDE.md`; the control arm has neither — that is the measured difference.
-3. **Metrics** (`metrics.py`) from the stream-json events:
-   - **tool_calls**: count of `tool_use` blocks in `assistant` messages, **also broken down by type** (Edit / Read / Bash / …).
-   - **tokens**: from the final `result` event `usage`: `input`, `output`, `cache_read`, `cache_creation`.
-   - **cost**: `result.total_cost_usd`.
-   - **duration**: `result.duration_ms` (plus our own wall-clock as a sanity check).
-   - **turns**: `result.num_turns`.
-   - **completed**: `result.subtype == "success"` (not timeout/error).
-4. **Diff:** in the worktree, `git add -A && git diff --cached` → full change-set text, stored for the judge (kept after worktree removal).
-5. **Cleanup:** `git worktree remove --force` (in `finally`).
-
-Raw stream-json optionally saved to `.archie/benchmark/<name>/<branch>-<rep>.jsonl` for debugging.
-
----
-
-## 6. `auto` command — from a plain repo to finished numbers
-
-Entry: `python3 -m archie.benchmark auto <repo-path> --prompt "..."` (or task from a config file). The tool drives the whole flow:
-
-1. **Check:** repo is a clean git working tree (else stop, so uncommitted state never contaminates measurement). Record the base commit/branch.
-2. **Control branch** (`archie-bench/no-archie`): branch off the base. If Archie files exist (`CLAUDE.md`, `AGENTS.md`, `.claude/`, `.archie/`, per-folder `CLAUDE.md`s), **delete and commit** them. If absent, leave untouched.
-3. **Treatment branch** (`archie-bench/with-archie`): branch off the base.
-   - If Archie files already exist → use as-is.
-   - If absent → **semi-automatic prep**: the tool runs `npx @bitraptors/archie <repo>` (installs scripts + commands), then **pauses** with instructions: *"Open Claude Code on this worktree, run `/archie-deep-scan`, commit the results, then press Enter."* The user runs it interactively (more robust than headless deep-scan), returns, presses Enter. The tool **verifies** the Archie files now exist (fails clearly if not) and commits anything uncommitted.
-   - The deep-scan cost is **excluded from measurement**. Because prep is interactive (semi-automatic), the tool cannot directly meter its token cost; `prep_cost_usd` is **best-effort and nullable** — if `.archie/telemetry/` from the deep-scan run is present, the tool reads duration/cost from it, otherwise the field stays null. The point is only that prep is never folded into sample metrics.
-4. **Benchmark:** the `run` flow on both branches (default 3 repetitions, blind judge).
-5. **Aggregate + Supabase write + console summary.**
-
-Idempotent branch prep: if a `archie-bench/*` branch already exists, the tool asks (reuse / regenerate / abort) — never silently overwrites.
-
----
-
-## 7. Blind judge-Claude
-
-- The judge is a **separate `claude -p` call** with fresh context (does not see the benchmark runs or Archie).
-- Input: the `task_prompt` + both arms' diffs labeled **"Variant A" / "Variant B" in a randomized order** (the tool records the mapping). The judge cannot tell which is the Archie arm → no bias.
-- Randomization uses a **fixed seed** derived from the sample id (no time/`random`-without-seed dependence), stored as `judge_seed` for reproducibility.
-- Output is **forced JSON**: per-rubric-axis score 1–10 + short justification + overall score. `judge.py` validates; on malformed JSON it retries **once**.
-- Scoring is **pairwise per repetition**: each (A, B) pair → one judge call (N calls, not N²). Per-arm judge scores are averaged.
-- `judge.model` defaults to Opus (stronger judge), overridable in config.
-
----
-
-## 8. Supabase schema
-
-Two tables in the existing project, written directly via PostgREST with the **service key** (`.env`: `SUPABASE_URL`, `SUPABASE_SERVICE_KEY`).
-
-**`benchmark_runs`** — one row per benchmark run:
-
-```
-id              uuid pk (default gen_random_uuid())
-name            text          -- config.name
-repo_name       text          -- repo basename only (not full path)
-task_prompt     text
-model           text
-judge_model     text
-repetitions     int
-git_base_commit text          -- base commit (reproducibility)
-prep_cost_usd   numeric null  -- deep-scan prep cost, SEPARATE & best-effort (null if not metered)
-archie_version  text
-created_at      timestamptz default now()
-```
-
-**`benchmark_samples`** — one row per (arm × repetition):
-
-```
-id                    uuid pk
-run_id                uuid fk -> benchmark_runs.id
-arm                   text          -- 'control' | 'treatment'
-repetition            int
-tool_calls            int
-tool_breakdown        jsonb         -- {"Edit":4,"Read":9,"Bash":2,...}
-input_tokens          int
-output_tokens         int
-cache_read_tokens     int
-cache_creation_tokens int
-cost_usd              numeric
-duration_ms           int
-num_turns             int
-completed             bool          -- result.subtype == success
-quality_score         numeric null  -- judge overall (0–10)
-quality_detail        jsonb null    -- per-axis breakdown + justification
-judge_seed            int
-created_at            timestamptz default now()
-```
-
-- Aggregates (per-arm mean/spread/savings-%) are **not** stored twice — a DB **view** `benchmark_summary` computes them from samples; the website (separate spec) reads the view.
-- DDL ships as versioned `archie/benchmark/schema.sql` (run against Supabase manually / in CI).
-- `store.py`: if `.env` keys are missing → **does not crash**; saves locally to `.archie/benchmark/<name>/results.json` and warns (offline mode).
-
----
-
-## 9. Error handling, isolation safety, cleanup
-
-- **Worktree-leak protection:** every worktree is created and removed in `try/finally` (`git worktree remove --force`). `git worktree prune` at run start and end. Temp root is a known location (`<repo>/.archie/benchmark/worktrees/`) so leftovers are cleanable on restart.
-- **One failed sample does not sink the run:** if a `claude -p` times out/errors, that sample is recorded with `completed=false` and partial metrics; others continue. The aggregate reports how many samples dropped.
-- **Fairness guards:** the tool verifies (a) `task_prompt` is byte-identical across arms, (b) `model` and harness flags are identical, (c) both branches descend from the same `git_base_commit`. Any violation → stop, do not write noisy data.
-- **Prep separation:** deep-scan prep happens entirely before measurement, on a separate branch; measured `claude -p` runs start from fresh worktrees where Archie files are already committed — prep tokens/time never leak into sample metrics.
-- **Secrets:** `.env` is never logged; only `repo_name` (basename) goes to the DB, not the full path.
-
----
-
-## 10. Testing
-
-`tests/benchmark/`, pytest, all external calls mocked (no real `claude`/Supabase in tests):
-
-- `metrics.py`: fixed stream-json fixtures (success, timeout, tool-heavy, zero-tool) → correct tool count, token sums, completed flag.
-- `config.py`: valid/invalid configs, missing fields, identical-prompt invariant.
-- `isolation.py`: worktree add/remove on a throwaway temp git repo (may run real git, fast).
-- `diff.py`: known change → expected diff text, untracked files included.
-- `judge.py`: mocked judge response parse, malformed JSON → 1 retry, seed determinism.
-- `store.py`: mocked HTTP → correct payload shape; missing `.env` → offline fallback file.
-- `orchestrator.py`: end-to-end with mocks (fake runner+judge+store) → matrix and aggregation correct; a failed sample does not sink the run.
-
-**Edge cases covered:** empty diff (Claude did nothing) → `completed=true` but low quality; mid-run timeout; both arms identical; missing Supabase key; pre-existing benchmark branch; non-clean working tree; missing Archie files after deep-scan prep (verification fails).
-
----
-
-## 11. Out of scope (explicit)
-
-- Website / dashboard display of results — **separate follow-up spec** (will read the `benchmark_summary` view).
-- End-user-facing shipped benchmark (consent gating, anonymization, edge-function ingest).
-- Anthropic Agent SDK / raw API execution paths (headless Claude Code only).
-- Automatic headless deep-scan (semi-automatic interactive prep chosen instead).
diff --git a/docs/specs/2026-06-02-archie-benchmark-harness-plan.md b/docs/specs/2026-06-02-archie-benchmark-harness-plan.md
deleted file mode 100644
index b7615de9..00000000
--- a/docs/specs/2026-06-02-archie-benchmark-harness-plan.md
+++ /dev/null
@@ -1,1899 +0,0 @@
-# Archie Benchmark Harness Implementation Plan
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Build an internal Python tool that runs an identical headless Claude Code task on a control branch (no Archie) and a treatment branch (full Archie docs+hooks), captures tool calls / tokens / cost / time + a blind judge-Claude quality score, and stores results in Supabase.
-
-**Architecture:** Zero-dep stdlib Python package `archie/benchmark/`. Each module has one responsibility: config parsing, git-worktree isolation, headless `claude -p` execution, stream-json metric extraction, diff capture, blind judge scoring, Supabase write, and an orchestrator that runs the (arm × repetition) matrix. External side effects (the `claude` CLI, git, Supabase HTTP) are isolated behind functions that accept injectable dependencies so tests mock them.
-
-**Tech Stack:** Python 3.9+ (stdlib only — `json`, `subprocess`, `urllib`, `dataclasses`, `pathlib`, `contextlib`, `hashlib`, `argparse`), pytest, Claude Code CLI (`claude -p`), Supabase PostgREST.
-
-**Spec:** `docs/specs/2026-06-02-archie-benchmark-harness-design.md`
-
----
-
-## File Structure
-
-```
-archie/benchmark/
-  __init__.py        # package marker, exports
-  config.py          # BenchmarkConfig + JudgeConfig dataclasses, load/parse/validate
-  metrics.py         # SampleMetrics dataclass + parse_stream(lines)
-  diff.py            # capture_diff(worktree_path) -> str
-  isolation.py       # worktree() contextmanager + prune()
-  runner.py          # run_claude(...) -> (SampleMetrics, raw_stdout)
-  judge.py           # run_judge(...) -> per-arm rubric scores (blind, seeded A/B)
-  store.py           # store_results(...) -> Supabase write or offline fallback
-  aggregate.py       # aggregate_samples(samples) -> per-arm means/spread
-  orchestrator.py    # run_benchmark(config, deps...) + prepare_branches(...)
-  cli.py             # argparse entry: run / auto / prep
-  schema.sql         # versioned Supabase DDL (tables + summary view)
-tests/benchmark/
-  __init__.py
-  test_config.py
-  test_metrics.py
-  test_diff.py
-  test_isolation.py
-  test_runner.py
-  test_judge.py
-  test_store.py
-  test_aggregate.py
-  test_orchestrator.py
-```
-
-**Conventions to follow (from existing `archie/standalone/`):** zero third-party imports, `subprocess.run(..., capture_output=True, text=True)`, defensive `.get()` on parsed JSON, no secrets in logs.
-
----
-
-## Shared Type Contracts (defined once, used everywhere)
-
-These exact shapes are used across tasks — keep names identical.
-
-- `BenchmarkConfig`: `name:str, repo:Path, task_prompt:str, model:str, branches:dict{"treatment":str,"control":str}, repetitions:int, judge:JudgeConfig, timeout_seconds:int`
-- `JudgeConfig`: `model:str, rubric:list[str]`
-- `SampleMetrics`: `tool_calls:int, tool_breakdown:dict[str,int], input_tokens:int, output_tokens:int, cache_read_tokens:int, cache_creation_tokens:int, cost_usd:float, duration_ms:int, num_turns:int, completed:bool`
-- Judge result dict: `{"treatment": {<axis>:int,..,"overall":float,"justification":str}, "control": {...}, "seed": int}`
-- Sample row dict (for store): `{arm, repetition, tool_calls, tool_breakdown, input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens, cost_usd, duration_ms, num_turns, completed, quality_score, quality_detail, judge_seed}`
-
----
-
-### Task 1: Package scaffold + config
-
-**Files:**
-- Create: `archie/benchmark/__init__.py`
-- Create: `archie/benchmark/config.py`
-- Create: `tests/benchmark/__init__.py`
-- Test: `tests/benchmark/test_config.py`
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_config.py
-import json
-import pytest
-from pathlib import Path
-from archie.benchmark.config import parse_config, load_config, BenchmarkConfig
-
-
-def _valid():
-    return {
-        "name": "demo",
-        "repo": "/tmp/repo",
-        "task_prompt": "Add a feature",
-        "model": "claude-sonnet-4-6",
-    }
-
-
-def test_parse_minimal_applies_defaults():
-    cfg = parse_config(_valid())
-    assert isinstance(cfg, BenchmarkConfig)
-    assert cfg.repetitions == 3
-    assert cfg.timeout_seconds == 3600
-    assert cfg.branches == {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"}
-    assert cfg.judge.model == "claude-opus-4-8"
-    assert "correctness" in cfg.judge.rubric
-    assert isinstance(cfg.repo, Path)
-
-
-def test_parse_overrides():
-    data = _valid()
-    data.update({
-        "repetitions": 5,
-        "timeout_seconds": 1200,
-        "branches": {"treatment": "t", "control": "c"},
-        "judge": {"model": "m", "rubric": ["x"]},
-    })
-    cfg = parse_config(data)
-    assert cfg.repetitions == 5
-    assert cfg.timeout_seconds == 1200
-    assert cfg.branches == {"treatment": "t", "control": "c"}
-    assert cfg.judge.model == "m"
-    assert cfg.judge.rubric == ["x"]
-
-
-@pytest.mark.parametrize("missing", ["name", "repo", "task_prompt", "model"])
-def test_missing_required_raises(missing):
-    data = _valid()
-    del data[missing]
-    with pytest.raises(ValueError, match="required"):
-        parse_config(data)
-
-
-def test_repetitions_must_be_positive():
-    data = _valid()
-    data["repetitions"] = 0
-    with pytest.raises(ValueError, match="repetitions"):
-        parse_config(data)
-
-
-def test_branches_missing_arm_raises():
-    data = _valid()
-    data["branches"] = {"treatment": "t"}
-    with pytest.raises(ValueError, match="control"):
-        parse_config(data)
-
-
-def test_load_config_reads_file(tmp_path):
-    p = tmp_path / "c.json"
-    p.write_text(json.dumps(_valid()))
-    cfg = load_config(p)
-    assert cfg.name == "demo"
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_config.py -v`
-Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.config'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-```python
-# archie/benchmark/__init__.py
-"""Internal Archie effectiveness benchmark harness (not shipped via npm)."""
-```
-
-```python
-# archie/benchmark/config.py
-import json
-from dataclasses import dataclass, field
-from pathlib import Path
-
-DEFAULT_JUDGE_MODEL = "claude-opus-4-8"
-DEFAULT_RUBRIC = ["correctness", "completeness", "follows_conventions", "no_regressions"]
-DEFAULT_BRANCHES = {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"}
-DEFAULT_TIMEOUT = 3600
-DEFAULT_REPETITIONS = 3
-REQUIRED = ("name", "repo", "task_prompt", "model")
-
-
-@dataclass
-class JudgeConfig:
-    model: str = DEFAULT_JUDGE_MODEL
-    rubric: list = field(default_factory=lambda: list(DEFAULT_RUBRIC))
-
-
-@dataclass
-class BenchmarkConfig:
-    name: str
-    repo: Path
-    task_prompt: str
-    model: str
-    branches: dict = field(default_factory=lambda: dict(DEFAULT_BRANCHES))
-    repetitions: int = DEFAULT_REPETITIONS
-    judge: JudgeConfig = field(default_factory=JudgeConfig)
-    timeout_seconds: int = DEFAULT_TIMEOUT
-
-
-def parse_config(data):
-    missing = [k for k in REQUIRED if k not in data or data[k] in (None, "")]
-    if missing:
-        raise ValueError(f"config missing required fields: {', '.join(missing)}")
-
-    branches = data.get("branches", dict(DEFAULT_BRANCHES))
-    for arm in ("treatment", "control"):
-        if arm not in branches or not branches[arm]:
-            raise ValueError(f"config.branches missing '{arm}'")
-
-    reps = int(data.get("repetitions", DEFAULT_REPETITIONS))
-    if reps < 1:
-        raise ValueError("repetitions must be >= 1")
-
-    jd = data.get("judge", {}) or {}
-    judge = JudgeConfig(
-        model=jd.get("model") or DEFAULT_JUDGE_MODEL,
-        rubric=jd.get("rubric") or list(DEFAULT_RUBRIC),
-    )
-
-    return BenchmarkConfig(
-        name=data["name"],
-        repo=Path(data["repo"]).expanduser(),
-        task_prompt=data["task_prompt"],
-        model=data["model"],
-        branches={"treatment": branches["treatment"], "control": branches["control"]},
-        repetitions=reps,
-        judge=judge,
-        timeout_seconds=int(data.get("timeout_seconds", DEFAULT_TIMEOUT)),
-    )
-
-
-def load_config(path):
-    return parse_config(json.loads(Path(path).read_text()))
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_config.py -v`
-Expected: PASS (all 9 cases)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/__init__.py archie/benchmark/config.py tests/benchmark/__init__.py tests/benchmark/test_config.py
-git commit -m "feat(benchmark): config dataclasses + JSON parsing/validation"
-```
-
----
-
-### Task 2: Stream-json metric extraction
-
-**Files:**
-- Create: `archie/benchmark/metrics.py`
-- Test: `tests/benchmark/test_metrics.py`
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_metrics.py
-import json
-from archie.benchmark.metrics import parse_stream, SampleMetrics
-
-
-def _assistant(blocks):
-    return json.dumps({"type": "assistant", "message": {"content": blocks}})
-
-
-def _tool_use(name):
-    return {"type": "tool_use", "name": name, "id": "x", "input": {}}
-
-
-def _result(subtype="success"):
-    return json.dumps({
-        "type": "result",
-        "subtype": subtype,
-        "total_cost_usd": 0.1234,
-        "duration_ms": 5000,
-        "num_turns": 7,
-        "usage": {
-            "input_tokens": 100,
-            "output_tokens": 200,
-            "cache_read_input_tokens": 50,
-            "cache_creation_input_tokens": 25,
-        },
-    })
-
-
-def test_counts_tools_and_breakdown():
-    lines = [
-        json.dumps({"type": "system", "subtype": "init"}),
-        _assistant([{"type": "text", "text": "hi"}, _tool_use("Read")]),
-        _assistant([_tool_use("Edit"), _tool_use("Edit")]),
-        _result(),
-    ]
-    m = parse_stream(lines)
-    assert m.tool_calls == 3
-    assert m.tool_breakdown == {"Read": 1, "Edit": 2}
-
-
-def test_extracts_result_fields():
-    m = parse_stream([_result()])
-    assert m.input_tokens == 100
-    assert m.output_tokens == 200
-    assert m.cache_read_tokens == 50
-    assert m.cache_creation_tokens == 25
-    assert m.cost_usd == 0.1234
-    assert m.duration_ms == 5000
-    assert m.num_turns == 7
-    assert m.completed is True
-
-
-def test_error_result_not_completed():
-    m = parse_stream([_result(subtype="error_max_turns")])
-    assert m.completed is False
-
-
-def test_zero_tool_run():
-    m = parse_stream([_assistant([{"type": "text", "text": "done"}]), _result()])
-    assert m.tool_calls == 0
-    assert m.tool_breakdown == {}
-
-
-def test_ignores_blank_and_malformed_lines():
-    m = parse_stream(["", "  ", "not json", _result()])
-    assert m.completed is True
-
-
-def test_no_result_event_defaults():
-    m = parse_stream([_assistant([_tool_use("Bash")])])
-    assert m.tool_calls == 1
-    assert m.completed is False
-    assert m.cost_usd == 0.0
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_metrics.py -v`
-Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.metrics'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-```python
-# archie/benchmark/metrics.py
-import json
-from dataclasses import dataclass, field
-
-
-@dataclass
-class SampleMetrics:
-    tool_calls: int = 0
-    tool_breakdown: dict = field(default_factory=dict)
-    input_tokens: int = 0
-    output_tokens: int = 0
-    cache_read_tokens: int = 0
-    cache_creation_tokens: int = 0
-    cost_usd: float = 0.0
-    duration_ms: int = 0
-    num_turns: int = 0
-    completed: bool = False
-
-
-def parse_stream(lines):
-    m = SampleMetrics()
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            ev = json.loads(line)
-        except json.JSONDecodeError:
-            continue
-        etype = ev.get("type")
-        if etype == "assistant":
-            for block in ev.get("message", {}).get("content", []) or []:
-                if isinstance(block, dict) and block.get("type") == "tool_use":
-                    m.tool_calls += 1
-                    name = block.get("name", "unknown")
-                    m.tool_breakdown[name] = m.tool_breakdown.get(name, 0) + 1
-        elif etype == "result":
-            usage = ev.get("usage", {}) or {}
-            m.input_tokens = usage.get("input_tokens", 0)
-            m.output_tokens = usage.get("output_tokens", 0)
-            m.cache_read_tokens = usage.get("cache_read_input_tokens", 0)
-            m.cache_creation_tokens = usage.get("cache_creation_input_tokens", 0)
-            m.cost_usd = ev.get("total_cost_usd", 0.0)
-            m.duration_ms = ev.get("duration_ms", 0)
-            m.num_turns = ev.get("num_turns", 0)
-            m.completed = ev.get("subtype") == "success"
-    return m
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_metrics.py -v`
-Expected: PASS (6 cases)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/metrics.py tests/benchmark/test_metrics.py
-git commit -m "feat(benchmark): stream-json metric extraction (tools, tokens, cost, completed)"
-```
-
----
-
-### Task 3: Diff capture
-
-**Files:**
-- Create: `archie/benchmark/diff.py`
-- Test: `tests/benchmark/test_diff.py`
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_diff.py
-import subprocess
-from archie.benchmark.diff import capture_diff
-
-
-def _git(args, cwd):
-    subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True)
-
-
-def _init_repo(path):
-    _git(["init"], path)
-    _git(["config", "user.email", "t@t.t"], path)
-    _git(["config", "user.name", "t"], path)
-    (path / "a.txt").write_text("one\n")
-    _git(["add", "-A"], path)
-    _git(["commit", "-m", "init"], path)
-
-
-def test_captures_modified_and_untracked(tmp_path):
-    _init_repo(tmp_path)
-    (tmp_path / "a.txt").write_text("one\ntwo\n")      # modified, tracked
-    (tmp_path / "b.txt").write_text("new file\n")       # untracked
-    diff = capture_diff(tmp_path)
-    assert "a.txt" in diff
-    assert "two" in diff
-    assert "b.txt" in diff
-    assert "new file" in diff
-
-
-def test_empty_diff_when_no_changes(tmp_path):
-    _init_repo(tmp_path)
-    diff = capture_diff(tmp_path)
-    assert diff.strip() == ""
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_diff.py -v`
-Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.diff'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-```python
-# archie/benchmark/diff.py
-import subprocess
-
-
-def capture_diff(worktree_path):
-    """Stage everything (so untracked files show) and return the cached diff text."""
-    subprocess.run(["git", "add", "-A"], cwd=str(worktree_path),
-                   check=True, capture_output=True, text=True)
-    result = subprocess.run(["git", "diff", "--cached"], cwd=str(worktree_path),
-                            check=True, capture_output=True, text=True)
-    return result.stdout
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_diff.py -v`
-Expected: PASS (2 cases)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/diff.py tests/benchmark/test_diff.py
-git commit -m "feat(benchmark): capture full diff (modified + untracked) from a worktree"
-```
-
----
-
-### Task 4: Git worktree isolation
-
-**Files:**
-- Create: `archie/benchmark/isolation.py`
-- Test: `tests/benchmark/test_isolation.py`
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_isolation.py
-import subprocess
-from pathlib import Path
-from archie.benchmark.isolation import worktree, prune
-
-
-def _git(args, cwd):
-    subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True)
-
-
-def _init_repo(path):
-    _git(["init"], path)
-    _git(["config", "user.email", "t@t.t"], path)
-    _git(["config", "user.name", "t"], path)
-    (path / "a.txt").write_text("one\n")
-    _git(["add", "-A"], path)
-    _git(["commit", "-m", "init"], path)
-    _git(["branch", "feature"], path)
-
-
-def test_worktree_created_and_removed(tmp_path):
-    repo = tmp_path / "repo"
-    repo.mkdir()
-    _init_repo(repo)
-    dest = tmp_path / "wt"
-    with worktree(repo, "feature", dest) as wt:
-        assert Path(wt).exists()
-        assert (Path(wt) / "a.txt").exists()
-    assert not Path(dest).exists()
-
-
-def test_worktree_removed_on_exception(tmp_path):
-    repo = tmp_path / "repo"
-    repo.mkdir()
-    _init_repo(repo)
-    dest = tmp_path / "wt"
-    try:
-        with worktree(repo, "feature", dest):
-            raise RuntimeError("boom")
-    except RuntimeError:
-        pass
-    assert not Path(dest).exists()
-
-
-def test_prune_runs_without_error(tmp_path):
-    repo = tmp_path / "repo"
-    repo.mkdir()
-    _init_repo(repo)
-    prune(repo)  # must not raise
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_isolation.py -v`
-Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.isolation'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-```python
-# archie/benchmark/isolation.py
-import subprocess
-from contextlib import contextmanager
-from pathlib import Path
-
-
-@contextmanager
-def worktree(repo_path, branch, dest):
-    """Create a git worktree for `branch` at `dest`, always removed on exit."""
-    dest = Path(dest)
-    subprocess.run(["git", "worktree", "add", "--force", str(dest), branch],
-                   cwd=str(repo_path), check=True, capture_output=True, text=True)
-    try:
-        yield dest
-    finally:
-        subprocess.run(["git", "worktree", "remove", "--force", str(dest)],
-                       cwd=str(repo_path), capture_output=True, text=True)
-
-
-def prune(repo_path):
-    subprocess.run(["git", "worktree", "prune"],
-                   cwd=str(repo_path), capture_output=True, text=True)
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_isolation.py -v`
-Expected: PASS (3 cases)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/isolation.py tests/benchmark/test_isolation.py
-git commit -m "feat(benchmark): git worktree contextmanager with guaranteed cleanup"
-```
-
----
-
-### Task 5: Headless claude runner
-
-**Files:**
-- Create: `archie/benchmark/runner.py`
-- Test: `tests/benchmark/test_runner.py`
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_runner.py
-import json
-import subprocess
-import pytest
-from archie.benchmark import runner
-
-
-def _stream():
-    return "\n".join([
-        json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}}),
-        json.dumps({"type": "result", "subtype": "success", "total_cost_usd": 0.5,
-                    "duration_ms": 1000, "num_turns": 2,
-                    "usage": {"input_tokens": 10, "output_tokens": 20}}),
-    ])
-
-
-def test_run_claude_parses_metrics(monkeypatch):
-    captured = {}
-
-    def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None):
-        captured["cmd"] = cmd
-        captured["cwd"] = cwd
-        return subprocess.CompletedProcess(cmd, 0, stdout=_stream(), stderr="")
-
-    monkeypatch.setattr(runner.subprocess, "run", fake_run)
-    metrics, raw = runner.run_claude("do it", "claude-sonnet-4-6", "/tmp/wt", 60)
-
-    assert metrics.tool_calls == 1
-    assert metrics.cost_usd == 0.5
-    assert metrics.completed is True
-    assert captured["cwd"] == "/tmp/wt"
-    # identical, fair harness flags must always be present
-    assert captured["cmd"][:2] == ["claude", "-p"]
-    assert "--permission-mode" in captured["cmd"]
-    assert "acceptEdits" in captured["cmd"]
-    assert "stream-json" in captured["cmd"]
-    assert "--model" in captured["cmd"] and "claude-sonnet-4-6" in captured["cmd"]
-
-
-def test_timeout_marks_incomplete(monkeypatch):
-    def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None):
-        raise subprocess.TimeoutExpired(cmd, timeout, output=_stream())
-
-    monkeypatch.setattr(runner.subprocess, "run", fake_run)
-    metrics, raw = runner.run_claude("do it", "m", "/tmp/wt", 1)
-    assert metrics.completed is False
-    assert metrics.tool_calls == 1  # partial stdout still parsed
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_runner.py -v`
-Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.runner'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-```python
-# archie/benchmark/runner.py
-import subprocess
-from .metrics import parse_stream
-
-
-def run_claude(prompt, model, cwd, timeout_seconds):
-    """Run a headless Claude Code session in `cwd`; return (SampleMetrics, raw_stdout).
-
-    Both benchmark arms must call this with identical flags — the only difference
-    between arms is the on-disk files (CLAUDE.md / .claude hooks), never the flags.
-    """
-    cmd = [
-        "claude", "-p", prompt,
-        "--model", model,
-        "--output-format", "stream-json", "--verbose",
-        "--permission-mode", "acceptEdits",
-    ]
-    try:
-        proc = subprocess.run(cmd, cwd=str(cwd), capture_output=True,
-                              text=True, timeout=timeout_seconds)
-        metrics = parse_stream(proc.stdout.splitlines())
-        return metrics, proc.stdout
-    except subprocess.TimeoutExpired as e:
-        partial = e.output or ""
-        if isinstance(partial, bytes):
-            partial = partial.decode("utf-8", "replace")
-        metrics = parse_stream(partial.splitlines())
-        metrics.completed = False
-        return metrics, partial
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_runner.py -v`
-Expected: PASS (2 cases)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/runner.py tests/benchmark/test_runner.py
-git commit -m "feat(benchmark): headless claude -p runner with timeout handling"
-```
-
----
-
-### Task 6: Blind judge
-
-**Files:**
-- Create: `archie/benchmark/judge.py`
-- Test: `tests/benchmark/test_judge.py`
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_judge.py
-import json
-import pytest
-from archie.benchmark import judge
-
-
-def test_assign_order_is_seed_deterministic():
-    assert judge.assign_order(0) == ("a", "b")
-    assert judge.assign_order(2) == ("a", "b")
-    assert judge.assign_order(1) == ("b", "a")
-    assert judge.assign_order(3) == ("b", "a")
-
-
-def test_parse_judge_output_extracts_embedded_json():
-    text = 'Here is my verdict:\n{"variant_a": {"overall": 8}, "variant_b": {"overall": 5}}\nThanks'
-    parsed = judge.parse_judge_output(text)
-    assert parsed["variant_a"]["overall"] == 8
-
-
-def test_parse_judge_output_raises_without_json():
-    with pytest.raises(ValueError, match="JSON"):
-        judge.parse_judge_output("no json here")
-
-
-def test_run_judge_maps_variants_to_arms_seed_even():
-    # seed even -> treatment is variant_a
-    payload = json.dumps({"variant_a": {"overall": 9}, "variant_b": {"overall": 4}})
-    calls = []
-
-    def fake_runner(prompt, model, timeout):
-        calls.append((prompt, model))
-        return payload
-
-    result = judge.run_judge("task", "TREAT_DIFF", "CTRL_DIFF",
-                             rubric=["correctness"], model="m", seed=0,
-                             _runner=fake_runner)
-    assert result["treatment"]["overall"] == 9
-    assert result["control"]["overall"] == 4
-    assert result["seed"] == 0
-    # variant A diff (shown first) must be the treatment diff for an even seed
-    assert calls[0][0].index("TREAT_DIFF") < calls[0][0].index("CTRL_DIFF")
-
-
-def test_run_judge_maps_variants_to_arms_seed_odd():
-    # seed odd -> treatment is variant_b
-    payload = json.dumps({"variant_a": {"overall": 3}, "variant_b": {"overall": 7}})
-    result = judge.run_judge("task", "TREAT_DIFF", "CTRL_DIFF",
-                             rubric=["correctness"], model="m", seed=1,
-                             _runner=lambda p, m, t: payload)
-    assert result["treatment"]["overall"] == 7
-    assert result["control"]["overall"] == 3
-
-
-def test_run_judge_retries_once_on_bad_json():
-    outputs = ["garbage", json.dumps({"variant_a": {"overall": 6}, "variant_b": {"overall": 6}})]
-
-    def flaky(prompt, model, timeout):
-        return outputs.pop(0)
-
-    result = judge.run_judge("task", "A", "B", rubric=["c"], model="m", seed=0, _runner=flaky)
-    assert result["treatment"]["overall"] == 6
-    assert outputs == []  # both outputs consumed -> retried exactly once
-
-
-def test_run_judge_raises_after_two_failures():
-    with pytest.raises(ValueError):
-        judge.run_judge("task", "A", "B", rubric=["c"], model="m", seed=0,
-                        _runner=lambda p, m, t: "still garbage")
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_judge.py -v`
-Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.judge'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-```python
-# archie/benchmark/judge.py
-import json
-import subprocess
-
-
-def assign_order(seed):
-    """Return (treatment_variant, control_variant) — blind A/B label assignment."""
-    return ("a", "b") if seed % 2 == 0 else ("b", "a")
-
-
-def build_judge_prompt(task_prompt, diff_a, diff_b, rubric):
-    axes = ", ".join(rubric)
-    schema = ('{"variant_a": {' + ", ".join(f'"{a}": int' for a in rubric)
-              + ', "overall": number, "justification": string}, "variant_b": {... same keys ...}}')
-    return (
-        "You are an impartial senior code reviewer. Two AI agents independently "
-        "attempted the SAME task. You are shown each agent's diff as an anonymous "
-        "variant. Judge purely on the code; you do not know anything about how each "
-        "was produced.\n\n"
-        f"TASK GIVEN TO BOTH AGENTS:\n{task_prompt}\n\n"
-        f"Score each variant on these axes (each 1-10): {axes}. Also give an "
-        "'overall' score (0-10) and a one-sentence 'justification'.\n\n"
-        f"Respond with ONLY a JSON object of this exact shape:\n{schema}\n\n"
-        f"=== VARIANT A DIFF ===\n{diff_a}\n\n"
-        f"=== VARIANT B DIFF ===\n{diff_b}\n"
-    )
-
-
-def parse_judge_output(text):
-    start = text.find("{")
-    end = text.rfind("}")
-    if start == -1 or end == -1 or end < start:
-        raise ValueError("no JSON object found in judge output")
-    return json.loads(text[start:end + 1])
-
-
-def _default_runner(prompt, model, timeout):
-    proc = subprocess.run(
-        ["claude", "-p", prompt, "--model", model, "--output-format", "text"],
-        capture_output=True, text=True, timeout=timeout,
-    )
-    return proc.stdout
-
-
-def run_judge(task_prompt, treatment_diff, control_diff, rubric, model, seed,
-              timeout_seconds=600, _runner=None):
-    t_variant, c_variant = assign_order(seed)
-    diff_a = treatment_diff if t_variant == "a" else control_diff
-    diff_b = treatment_diff if t_variant == "b" else control_diff
-    prompt = build_judge_prompt(task_prompt, diff_a, diff_b, rubric)
-
-    runner = _runner or _default_runner
-    parsed = None
-    last_err = None
-    for _ in range(2):
-        try:
-            parsed = parse_judge_output(runner(prompt, model, timeout_seconds))
-            break
-        except (ValueError, json.JSONDecodeError) as e:
-            last_err = e
-    if parsed is None:
-        raise ValueError(f"judge returned unparseable output twice: {last_err}")
-
-    return {
-        "treatment": parsed["variant_a"] if t_variant == "a" else parsed["variant_b"],
-        "control": parsed["variant_a"] if c_variant == "a" else parsed["variant_b"],
-        "seed": seed,
-    }
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_judge.py -v`
-Expected: PASS (7 cases)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/judge.py tests/benchmark/test_judge.py
-git commit -m "feat(benchmark): blind seeded judge with A/B randomization + retry"
-```
-
----
-
-### Task 7: Aggregation
-
-**Files:**
-- Create: `archie/benchmark/aggregate.py`
-- Test: `tests/benchmark/test_aggregate.py`
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_aggregate.py
-from archie.benchmark.aggregate import aggregate_samples
-
-
-def _s(arm, cost, tools, quality, completed=True):
-    return {"arm": arm, "cost_usd": cost, "tool_calls": tools,
-            "duration_ms": 1000, "input_tokens": 10, "output_tokens": 20,
-            "quality_score": quality, "completed": completed}
-
-
-def test_per_arm_means():
-    samples = [
-        _s("treatment", 1.0, 10, 8.0),
-        _s("treatment", 3.0, 20, 9.0),
-        _s("control", 2.0, 30, 6.0),
-        _s("control", 4.0, 40, 7.0),
-    ]
-    agg = aggregate_samples(samples)
-    assert agg["treatment"]["cost_usd_mean"] == 2.0
-    assert agg["treatment"]["tool_calls_mean"] == 15.0
-    assert agg["treatment"]["quality_mean"] == 8.5
-    assert agg["control"]["cost_usd_mean"] == 3.0
-    assert agg["treatment"]["n"] == 2
-    assert agg["treatment"]["completed_n"] == 2
-
-
-def test_savings_percentages():
-    samples = [_s("treatment", 1.0, 10, 8.0), _s("control", 2.0, 20, 8.0)]
-    agg = aggregate_samples(samples)
-    # treatment cost is 50% lower than control
-    assert agg["savings"]["cost_pct"] == 50.0
-    assert agg["savings"]["tool_calls_pct"] == 50.0
-
-
-def test_quality_ignores_none_scores():
-    samples = [
-        _s("treatment", 1.0, 10, None, completed=False),
-        _s("treatment", 1.0, 10, 8.0),
-        _s("control", 1.0, 10, 6.0),
-    ]
-    agg = aggregate_samples(samples)
-    assert agg["treatment"]["quality_mean"] == 8.0  # None excluded
-    assert agg["treatment"]["completed_n"] == 1
-
-
-def test_handles_empty_arm():
-    samples = [_s("treatment", 1.0, 10, 8.0)]
-    agg = aggregate_samples(samples)
-    assert agg["control"]["n"] == 0
-    assert agg["control"]["cost_usd_mean"] is None
-    assert agg["savings"]["cost_pct"] is None
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_aggregate.py -v`
-Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.aggregate'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-```python
-# archie/benchmark/aggregate.py
-NUMERIC_FIELDS = ["cost_usd", "tool_calls", "duration_ms", "input_tokens", "output_tokens"]
-
-
-def _mean(values):
-    return sum(values) / len(values) if values else None
-
-
-def _arm_stats(samples):
-    stats = {"n": len(samples), "completed_n": sum(1 for s in samples if s.get("completed"))}
-    for f in NUMERIC_FIELDS:
-        vals = [s[f] for s in samples if s.get(f) is not None]
-        stats[f + "_mean"] = _mean(vals)
-    qvals = [s["quality_score"] for s in samples if s.get("quality_score") is not None]
-    stats["quality_mean"] = _mean(qvals)
-    return stats
-
-
-def _pct_lower(treatment, control):
-    """Percent reduction of treatment relative to control (positive = treatment cheaper)."""
-    if treatment is None or control is None or control == 0:
-        return None
-    return round((control - treatment) / control * 100, 1)
-
-
-def aggregate_samples(samples):
-    treatment = [s for s in samples if s.get("arm") == "treatment"]
-    control = [s for s in samples if s.get("arm") == "control"]
-    t_stats = _arm_stats(treatment)
-    c_stats = _arm_stats(control)
-    return {
-        "treatment": t_stats,
-        "control": c_stats,
-        "savings": {
-            "cost_pct": _pct_lower(t_stats["cost_usd_mean"], c_stats["cost_usd_mean"]),
-            "tool_calls_pct": _pct_lower(t_stats["tool_calls_mean"], c_stats["tool_calls_mean"]),
-            "duration_pct": _pct_lower(t_stats["duration_ms_mean"], c_stats["duration_ms_mean"]),
-        },
-    }
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_aggregate.py -v`
-Expected: PASS (4 cases)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/aggregate.py tests/benchmark/test_aggregate.py
-git commit -m "feat(benchmark): per-arm aggregation + savings percentages"
-```
-
----
-
-### Task 8: Supabase store + offline fallback
-
-**Files:**
-- Create: `archie/benchmark/store.py`
-- Test: `tests/benchmark/test_store.py`
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_store.py
-import json
-from archie.benchmark import store
-
-
-def test_offline_fallback_when_env_missing(tmp_path, monkeypatch):
-    monkeypatch.delenv("SUPABASE_URL", raising=False)
-    monkeypatch.delenv("SUPABASE_SERVICE_KEY", raising=False)
-    out = tmp_path / "nested" / "results.json"
-    res = store.store_results({"name": "x"}, [{"arm": "treatment"}], out)
-    assert res["mode"] == "offline"
-    saved = json.loads(out.read_text())
-    assert saved["run"]["name"] == "x"
-    assert saved["samples"][0]["arm"] == "treatment"
-
-
-def test_online_write_posts_run_then_samples(tmp_path, monkeypatch):
-    monkeypatch.setenv("SUPABASE_URL", "https://x.supabase.co")
-    monkeypatch.setenv("SUPABASE_SERVICE_KEY", "secret")
-    calls = []
-
-    def fake_poster(url, key, table, rows):
-        calls.append((table, rows))
-        if table == "benchmark_runs":
-            return [{"id": "run-123"}]
-        return rows
-
-    res = store.store_results({"name": "x"}, [{"arm": "treatment"}, {"arm": "control"}],
-                              tmp_path / "r.json", _poster=fake_poster)
-    assert res["mode"] == "online"
-    assert res["run_id"] == "run-123"
-    assert calls[0][0] == "benchmark_runs"
-    assert calls[1][0] == "benchmark_samples"
-    # run_id stamped onto every sample row
-    assert all(r["run_id"] == "run-123" for r in calls[1][1])
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_store.py -v`
-Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.store'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-```python
-# archie/benchmark/store.py
-import json
-import os
-import urllib.request
-from pathlib import Path
-
-
-def _env():
-    return os.environ.get("SUPABASE_URL"), os.environ.get("SUPABASE_SERVICE_KEY")
-
-
-def _post(url, key, table, rows):
-    data = json.dumps(rows).encode("utf-8")
-    req = urllib.request.Request(
-        f"{url}/rest/v1/{table}",
-        data=data,
-        headers={
-            "apikey": key,
-            "Authorization": f"Bearer {key}",
-            "Content-Type": "application/json",
-            "Prefer": "return=representation",
-        },
-        method="POST",
-    )
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        return json.loads(resp.read().decode("utf-8"))
-
-
-def store_results(run_row, sample_rows, offline_path, _poster=None):
-    url, key = _env()
-    if not url or not key:
-        path = Path(offline_path)
-        path.parent.mkdir(parents=True, exist_ok=True)
-        path.write_text(json.dumps({"run": run_row, "samples": sample_rows}, indent=2))
-        return {"mode": "offline", "path": str(path)}
-
-    poster = _poster or _post
-    created = poster(url, key, "benchmark_runs", [run_row])
-    run_id = created[0]["id"]
-    for r in sample_rows:
-        r["run_id"] = run_id
-    poster(url, key, "benchmark_samples", sample_rows)
-    return {"mode": "online", "run_id": run_id}
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_store.py -v`
-Expected: PASS (2 cases)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/store.py tests/benchmark/test_store.py
-git commit -m "feat(benchmark): Supabase PostgREST write with offline fallback"
-```
-
----
-
-### Task 9: Supabase schema DDL
-
-**Files:**
-- Create: `archie/benchmark/schema.sql`
-- Test: `tests/benchmark/test_schema.py`
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_schema.py
-from pathlib import Path
-
-SQL = Path(__file__).parent.parent.parent / "archie" / "benchmark" / "schema.sql"
-
-
-def test_schema_defines_both_tables_and_view():
-    text = SQL.read_text()
-    assert "create table" in text.lower()
-    assert "benchmark_runs" in text
-    assert "benchmark_samples" in text
-    assert "benchmark_summary" in text
-    # key sample columns referenced by store.py / aggregate.py exist
-    for col in ["tool_calls", "tool_breakdown", "cost_usd", "quality_score",
-                "cache_read_tokens", "judge_seed", "completed", "arm"]:
-        assert col in text
-    # prep cost lives on the run, separate from measured samples
-    assert "prep_cost_usd" in text
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_schema.py -v`
-Expected: FAIL with `FileNotFoundError` (schema.sql does not exist)
-
-- [ ] **Step 3: Write minimal implementation**
-
-```sql
--- archie/benchmark/schema.sql
--- Archie benchmark harness — Supabase schema (v1).
--- Run manually against the project (or via CI). Idempotent-ish: uses IF NOT EXISTS.
-
-create table if not exists benchmark_runs (
-    id              uuid primary key default gen_random_uuid(),
-    name            text not null,
-    repo_name       text,                 -- basename only, never a full path
-    task_prompt     text,
-    model           text,
-    judge_model     text,
-    repetitions     int,
-    git_base_commit text,
-    prep_cost_usd   numeric,              -- deep-scan prep cost, separate & best-effort
-    archie_version  text,
-    created_at      timestamptz not null default now()
-);
-
-create table if not exists benchmark_samples (
-    id                    uuid primary key default gen_random_uuid(),
-    run_id                uuid not null references benchmark_runs(id) on delete cascade,
-    arm                   text not null,  -- 'control' | 'treatment'
-    repetition            int,
-    tool_calls            int,
-    tool_breakdown        jsonb,
-    input_tokens          int,
-    output_tokens         int,
-    cache_read_tokens     int,
-    cache_creation_tokens int,
-    cost_usd              numeric,
-    duration_ms           int,
-    num_turns             int,
-    completed             boolean,
-    quality_score         numeric,
-    quality_detail        jsonb,
-    judge_seed            int,
-    created_at            timestamptz not null default now()
-);
-
-create index if not exists benchmark_samples_run_id_idx on benchmark_samples(run_id);
-
--- Per-run, per-arm rollup the website reads (separate spec).
-create or replace view benchmark_summary as
-select
-    r.id            as run_id,
-    r.name          as name,
-    r.repo_name     as repo_name,
-    r.model         as model,
-    s.arm           as arm,
-    count(*)                          as samples,
-    count(*) filter (where s.completed) as completed_samples,
-    avg(s.tool_calls)                 as tool_calls_mean,
-    avg(s.cost_usd)                   as cost_usd_mean,
-    avg(s.duration_ms)                as duration_ms_mean,
-    avg(s.input_tokens + s.output_tokens) as total_tokens_mean,
-    avg(s.quality_score)              as quality_mean
-from benchmark_runs r
-join benchmark_samples s on s.run_id = r.id
-group by r.id, r.name, r.repo_name, r.model, s.arm;
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_schema.py -v`
-Expected: PASS (1 case)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/schema.sql tests/benchmark/test_schema.py
-git commit -m "feat(benchmark): Supabase schema (runs + samples tables, summary view)"
-```
-
----
-
-### Task 10: Orchestrator — measurement matrix + fairness guards
-
-**Files:**
-- Create: `archie/benchmark/orchestrator.py`
-- Test: `tests/benchmark/test_orchestrator.py`
-
-This task assumes both branches already exist (the `run` command). Branch prep (`auto`) is Task 11. The orchestrator accepts injectable `run_fn`, `judge_fn`, `store_fn`, and `diff_fn` so the matrix is testable without invoking real claude/git/Supabase.
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_orchestrator.py
-import pytest
-from archie.benchmark.config import BenchmarkConfig, JudgeConfig
-from archie.benchmark.metrics import SampleMetrics
-from archie.benchmark import orchestrator
-
-
-def _cfg(tmp_path, reps=2):
-    return BenchmarkConfig(
-        name="demo", repo=tmp_path, task_prompt="do it",
-        model="m", branches={"treatment": "t", "control": "c"},
-        repetitions=reps, judge=JudgeConfig(model="jm", rubric=["correctness"]),
-        timeout_seconds=60,
-    )
-
-
-def _fake_run(metrics_by_branch):
-    seen = {"calls": []}
-
-    def run_fn(prompt, model, cwd, timeout):
-        # branch name is encoded in the worktree path by the orchestrator
-        branch = "treatment" if "treatment" in str(cwd) else "control"
-        seen["calls"].append((branch, prompt, model))
-        return metrics_by_branch[branch], "raw"
-
-    return run_fn, seen
-
-
-def test_run_benchmark_builds_matrix_and_aggregates(tmp_path, monkeypatch):
-    # neutralize real worktree/diff/base-commit side effects
-    monkeypatch.setattr(orchestrator, "_base_commit", lambda repo: "abc123")
-    monkeypatch.setattr(orchestrator, "_branch_base", lambda repo, b: "abc123")
-
-    import contextlib
-    @contextlib.contextmanager
-    def fake_worktree(repo, branch, dest):
-        yield tmp_path / ("wt-" + branch)
-    monkeypatch.setattr(orchestrator, "worktree", fake_worktree)
-    monkeypatch.setattr(orchestrator, "prune", lambda repo: None)
-
-    t_metrics = SampleMetrics(tool_calls=5, cost_usd=1.0, duration_ms=100,
-                              input_tokens=10, output_tokens=20, completed=True)
-    c_metrics = SampleMetrics(tool_calls=12, cost_usd=3.0, duration_ms=300,
-                              input_tokens=30, output_tokens=40, completed=True)
-    run_fn, seen = _fake_run({"treatment": t_metrics, "control": c_metrics})
-
-    judged = {"calls": 0}
-    def judge_fn(task, t_diff, c_diff, rubric, model, seed):
-        judged["calls"] += 1
-        return {"treatment": {"overall": 9.0}, "control": {"overall": 5.0}, "seed": seed}
-
-    stored = {}
-    def store_fn(run_row, sample_rows, offline_path):
-        stored["run"] = run_row
-        stored["samples"] = sample_rows
-        return {"mode": "offline", "path": str(offline_path)}
-
-    result = orchestrator.run_benchmark(
-        _cfg(tmp_path, reps=2),
-        run_fn=run_fn, judge_fn=judge_fn, store_fn=store_fn,
-        diff_fn=lambda wt: f"diff:{wt}",
-    )
-
-    # 2 reps x 2 arms = 4 runs; 2 reps = 2 pairwise judge calls
-    assert len(seen["calls"]) == 4
-    assert judged["calls"] == 2
-    assert len(stored["samples"]) == 4
-    # quality assigned per arm
-    t_samples = [s for s in stored["samples"] if s["arm"] == "treatment"]
-    assert all(s["quality_score"] == 9.0 for s in t_samples)
-    # aggregate shows treatment cheaper
-    assert result["aggregate"]["savings"]["cost_pct"] > 0
-    # prompt identical across all runs
-    assert len({c[1] for c in seen["calls"]}) == 1
-
-
-def test_fairness_guard_rejects_divergent_base(tmp_path, monkeypatch):
-    monkeypatch.setattr(orchestrator, "_branch_base",
-                        lambda repo, b: "AAA" if b == "t" else "BBB")
-    with pytest.raises(ValueError, match="base commit"):
-        orchestrator.run_benchmark(_cfg(tmp_path), run_fn=lambda *a: None,
-                                   judge_fn=lambda *a, **k: None,
-                                   store_fn=lambda *a: None, diff_fn=lambda w: "")
-
-
-def test_failed_sample_does_not_sink_run(tmp_path, monkeypatch):
-    monkeypatch.setattr(orchestrator, "_branch_base", lambda repo, b: "same")
-    import contextlib
-    @contextlib.contextmanager
-    def fake_worktree(repo, branch, dest):
-        yield tmp_path / ("wt-" + branch)
-    monkeypatch.setattr(orchestrator, "worktree", fake_worktree)
-    monkeypatch.setattr(orchestrator, "prune", lambda repo: None)
-
-    def run_fn(prompt, model, cwd, timeout):
-        if "treatment" in str(cwd):
-            raise RuntimeError("treatment crashed")
-        return SampleMetrics(completed=True, cost_usd=2.0), "raw"
-
-    stored = {}
-    result = orchestrator.run_benchmark(
-        _cfg(tmp_path, reps=1),
-        run_fn=run_fn,
-        judge_fn=lambda *a, **k: {"treatment": {"overall": 0}, "control": {"overall": 5}, "seed": 0},
-        store_fn=lambda r, s, p: stored.update(samples=s) or {"mode": "offline"},
-        diff_fn=lambda wt: "",
-    )
-    # treatment sample recorded as not-completed; control still present
-    arms = {s["arm"]: s for s in stored["samples"]}
-    assert arms["treatment"]["completed"] is False
-    assert arms["control"]["completed"] is True
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_orchestrator.py -v`
-Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.orchestrator'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-```python
-# archie/benchmark/orchestrator.py
-import subprocess
-import hashlib
-from pathlib import Path
-
-from .isolation import worktree, prune
-from .diff import capture_diff
-from .runner import run_claude
-from .judge import run_judge
-from .store import store_results
-from .aggregate import aggregate_samples
-from .metrics import SampleMetrics
-
-
-def _git_out(args, cwd):
-    return subprocess.run(["git", *args], cwd=str(cwd), check=True,
-                          capture_output=True, text=True).stdout.strip()
-
-
-def _base_commit(repo):
-    return _git_out(["rev-parse", "HEAD"], repo)
-
-
-def _branch_base(repo, branch):
-    """The commit the branch resolves to (used to verify both arms share a base)."""
-    return _git_out(["rev-parse", branch], repo)
-
-
-def _seed(name, repetition):
-    h = hashlib.sha256(f"{name}:{repetition}".encode("utf-8")).hexdigest()
-    return int(h[:8], 16)
-
-
-def _worktrees_root(repo):
-    root = Path(repo) / ".archie" / "benchmark" / "worktrees"
-    root.mkdir(parents=True, exist_ok=True)
-    return root
-
-
-def _run_one(cfg, branch, repetition, run_fn, diff_fn):
-    """Run a single (branch, repetition) sample; return (metrics, diff)."""
-    root = _worktrees_root(cfg.repo)
-    dest = root / f"{branch.replace('/', '_')}-{repetition}"
-    with worktree(cfg.repo, branch, dest) as wt:
-        try:
-            metrics, _raw = run_fn(cfg.task_prompt, cfg.model, wt, cfg.timeout_seconds)
-        except Exception:
-            return SampleMetrics(completed=False), ""
-        diff = diff_fn(wt)
-        return metrics, diff
-
-
-def _sample_row(arm, repetition, metrics, quality_score, quality_detail, seed):
-    return {
-        "arm": arm,
-        "repetition": repetition,
-        "tool_calls": metrics.tool_calls,
-        "tool_breakdown": metrics.tool_breakdown,
-        "input_tokens": metrics.input_tokens,
-        "output_tokens": metrics.output_tokens,
-        "cache_read_tokens": metrics.cache_read_tokens,
-        "cache_creation_tokens": metrics.cache_creation_tokens,
-        "cost_usd": metrics.cost_usd,
-        "duration_ms": metrics.duration_ms,
-        "num_turns": metrics.num_turns,
-        "completed": metrics.completed,
-        "quality_score": quality_score,
-        "quality_detail": quality_detail,
-        "judge_seed": seed,
-    }
-
-
-def run_benchmark(cfg, run_fn=run_claude, judge_fn=run_judge,
-                  store_fn=store_results, diff_fn=capture_diff):
-    # Fairness guard: both arms must descend from the same base commit.
-    t_base = _branch_base(cfg.repo, cfg.branches["treatment"])
-    c_base = _branch_base(cfg.repo, cfg.branches["control"])
-    if t_base != c_base:
-        raise ValueError(
-            f"arms have divergent base commit (treatment={t_base}, control={c_base}); "
-            "both benchmark branches must branch from the same commit")
-
-    prune(cfg.repo)
-    samples = []
-    for rep in range(cfg.repetitions):
-        t_metrics, t_diff = _run_one(cfg, cfg.branches["treatment"], rep, run_fn, diff_fn)
-        c_metrics, c_diff = _run_one(cfg, cfg.branches["control"], rep, run_fn, diff_fn)
-
-        seed = _seed(cfg.name, rep)
-        verdict = judge_fn(cfg.task_prompt, t_diff, c_diff, cfg.judge.rubric,
-                           cfg.judge.model, seed)
-        t_q = verdict["treatment"]
-        c_q = verdict["control"]
-        samples.append(_sample_row("treatment", rep, t_metrics,
-                                    t_q.get("overall"), t_q, seed))
-        samples.append(_sample_row("control", rep, c_metrics,
-                                    c_q.get("overall"), c_q, seed))
-    prune(cfg.repo)
-
-    agg = aggregate_samples(samples)
-    run_row = {
-        "name": cfg.name,
-        "repo_name": Path(cfg.repo).name,
-        "task_prompt": cfg.task_prompt,
-        "model": cfg.model,
-        "judge_model": cfg.judge.model,
-        "repetitions": cfg.repetitions,
-        "git_base_commit": _base_commit(cfg.repo),
-        "prep_cost_usd": None,
-        "archie_version": _archie_version(),
-    }
-    offline_path = Path(cfg.repo) / ".archie" / "benchmark" / cfg.name / "results.json"
-    store_result = store_fn(run_row, samples, offline_path)
-    return {"aggregate": agg, "samples": samples, "store": store_result, "run": run_row}
-
-
-def _archie_version():
-    try:
-        from archie import __version__
-        return __version__
-    except Exception:
-        return "unknown"
-```
-
-> **Note on test `test_failed_sample_does_not_sink_run`:** `_run_one` catches the runner exception and returns `SampleMetrics(completed=False)`, so the control arm still runs and the run completes. The fairness-guard test patches `_branch_base` to return equal values via the same-string lambda; the divergent test returns different strings.
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_orchestrator.py -v`
-Expected: PASS (3 cases)
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/orchestrator.py tests/benchmark/test_orchestrator.py
-git commit -m "feat(benchmark): orchestrator matrix, fairness guard, per-sample rows"
-```
-
----
-
-### Task 11: Branch prep + CLI
-
-**Files:**
-- Modify: `archie/benchmark/orchestrator.py` (add `prepare_branches`)
-- Create: `archie/benchmark/cli.py`
-- Test: `tests/benchmark/test_prepare.py`
-
-`prepare_branches` does the pure, testable git work: verify clean tree, create control branch (stripping Archie files), create treatment branch. The interactive deep-scan pause lives in `cli.py` (calls `input()`), kept thin and out of unit tests.
-
-- [ ] **Step 1: Write the failing test**
-
-```python
-# tests/benchmark/test_prepare.py
-import subprocess
-import pytest
-from archie.benchmark.config import BenchmarkConfig, JudgeConfig
-from archie.benchmark import orchestrator as orch
-
-
-def _git(args, cwd):
-    subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True)
-
-
-def _repo(tmp_path, with_archie):
-    repo = tmp_path / "repo"
-    repo.mkdir()
-    _git(["init"], repo)
-    _git(["config", "user.email", "t@t.t"], repo)
-    _git(["config", "user.name", "t"], repo)
-    (repo / "src.py").write_text("print('hi')\n")
-    if with_archie:
-        (repo / "CLAUDE.md").write_text("# context\n")
-        (repo / ".claude").mkdir()
-        (repo / ".claude" / "settings.json").write_text("{}\n")
-    _git(["add", "-A"], repo)
-    _git(["commit", "-m", "init"], repo)
-    return repo
-
-
-def _cfg(repo):
-    return BenchmarkConfig(name="d", repo=repo, task_prompt="x", model="m",
-                           branches={"treatment": "archie-bench/with-archie",
-                                     "control": "archie-bench/no-archie"},
-                           repetitions=1, judge=JudgeConfig(), timeout_seconds=60)
-
-
-def test_clean_tree_required(tmp_path):
-    repo = _repo(tmp_path, with_archie=True)
-    (repo / "dirty.txt").write_text("uncommitted\n")
-    with pytest.raises(ValueError, match="clean"):
-        orch.prepare_branches(_cfg(repo))
-
-
-def test_control_branch_strips_archie_files(tmp_path):
-    repo = _repo(tmp_path, with_archie=True)
-    status = orch.prepare_branches(_cfg(repo))
-    # control branch checked out: Archie files gone
-    _git(["checkout", "archie-bench/no-archie"], repo)
-    assert not (repo / "CLAUDE.md").exists()
-    assert not (repo / ".claude").exists()
-    assert (repo / "src.py").exists()
-    assert status["archie_present"] is True
-    assert status["needs_deep_scan"] is False
-
-
-def test_treatment_keeps_archie_files(tmp_path):
-    repo = _repo(tmp_path, with_archie=True)
-    orch.prepare_branches(_cfg(repo))
-    _git(["checkout", "archie-bench/with-archie"], repo)
-    assert (repo / "CLAUDE.md").exists()
-
-
-def test_no_archie_flags_deep_scan_needed(tmp_path):
-    repo = _repo(tmp_path, with_archie=False)
-    status = orch.prepare_branches(_cfg(repo))
-    assert status["archie_present"] is False
-    assert status["needs_deep_scan"] is True
-```
-
-- [ ] **Step 2: Run test to verify it fails**
-
-Run: `python -m pytest tests/benchmark/test_prepare.py -v`
-Expected: FAIL with `AttributeError: module 'archie.benchmark.orchestrator' has no attribute 'prepare_branches'`
-
-- [ ] **Step 3: Write minimal implementation**
-
-Append to `archie/benchmark/orchestrator.py`:
-
-```python
-ARCHIE_PATHS = ["CLAUDE.md", "AGENTS.md", ".claude", ".archie"]
-
-
-def _is_clean(repo):
-    out = _git_out(["status", "--porcelain"], repo)
-    return out == ""
-
-
-def _archie_present(repo):
-    return any((Path(repo) / p).exists() for p in ARCHIE_PATHS)
-
-
-def _branch_exists(repo, branch):
-    res = subprocess.run(["git", "rev-parse", "--verify", branch],
-                         cwd=str(repo), capture_output=True, text=True)
-    return res.returncode == 0
-
-
-def _create_branch(repo, branch, base):
-    if _branch_exists(repo, branch):
-        subprocess.run(["git", "branch", "-D", branch], cwd=str(repo),
-                       capture_output=True, text=True)
-    _git_out(["branch", branch, base], repo)
-
-
-def _strip_archie_on_branch(repo, branch):
-    """Check out branch, remove Archie artifacts (incl. per-folder CLAUDE.md), commit."""
-    current = _git_out(["rev-parse", "--abbrev-ref", "HEAD"], repo)
-    _git_out(["checkout", branch], repo)
-    try:
-        # remove root-level + nested CLAUDE.md and known Archie dirs/files
-        subprocess.run(["git", "rm", "-r", "--quiet", "--ignore-unmatch",
-                        *ARCHIE_PATHS], cwd=str(repo), capture_output=True, text=True)
-        # nested per-folder CLAUDE.md files
-        nested = subprocess.run(["git", "ls-files", "*/CLAUDE.md"], cwd=str(repo),
-                                capture_output=True, text=True).stdout.split()
-        if nested:
-            subprocess.run(["git", "rm", "--quiet", "--ignore-unmatch", *nested],
-                           cwd=str(repo), capture_output=True, text=True)
-        if not _is_clean(repo):
-            _git_out(["commit", "-m", "benchmark: strip Archie artifacts (control arm)"], repo)
-    finally:
-        _git_out(["checkout", current], repo)
-
-
-def prepare_branches(cfg):
-    """Create control (no Archie) and treatment (with Archie) branches from current HEAD.
-
-    Returns a status dict; if Archie is absent, `needs_deep_scan` is True and the
-    caller (cli) must run the interactive deep-scan on the treatment branch.
-    """
-    repo = cfg.repo
-    if not _is_clean(repo):
-        raise ValueError("working tree is not clean; commit or stash before benchmarking")
-
-    base = _base_commit(repo)
-    archie_present = _archie_present(repo)
-
-    _create_branch(repo, cfg.branches["treatment"], base)
-    _create_branch(repo, cfg.branches["control"], base)
-
-    if archie_present:
-        _strip_archie_on_branch(repo, cfg.branches["control"])
-    # if absent, control already has no Archie files; treatment will be populated
-    # by the interactive deep-scan (cli handles the pause).
-
-    return {
-        "archie_present": archie_present,
-        "needs_deep_scan": not archie_present,
-        "base": base,
-        "branches": cfg.branches,
-    }
-```
-
-- [ ] **Step 4: Run test to verify it passes**
-
-Run: `python -m pytest tests/benchmark/test_prepare.py -v`
-Expected: PASS (4 cases)
-
-- [ ] **Step 5: Write the CLI**
-
-```python
-# archie/benchmark/cli.py
-import argparse
-import sys
-from pathlib import Path
-
-from .config import load_config, parse_config
-from .orchestrator import run_benchmark, prepare_branches
-
-
-def _print_summary(result):
-    agg = result["aggregate"]
-    print("\n=== Benchmark summary ===")
-    for arm in ("treatment", "control"):
-        a = agg[arm]
-        print(f"[{arm}] n={a['n']} completed={a['completed_n']} "
-              f"cost=${_fmt(a['cost_usd_mean'])} tools={_fmt(a['tool_calls_mean'])} "
-              f"dur={_fmt(a['duration_ms_mean'])}ms quality={_fmt(a['quality_mean'])}")
-    s = agg["savings"]
-    print(f"[savings] cost={_fmt(s['cost_pct'])}%  tools={_fmt(s['tool_calls_pct'])}%  "
-          f"time={_fmt(s['duration_pct'])}%")
-    print(f"[store] {result['store']}")
-
-
-def _fmt(v):
-    return "n/a" if v is None else (f"{v:.2f}" if isinstance(v, float) else str(v))
-
-
-def _cmd_run(args):
-    cfg = load_config(args.config)
-    result = run_benchmark(cfg)
-    _print_summary(result)
-
-
-def _cmd_prep(args):
-    cfg = load_config(args.config)
-    status = prepare_branches(cfg)
-    if status["needs_deep_scan"]:
-        _interactive_deep_scan(cfg)
-    print(f"Branches ready: {cfg.branches}")
-
-
-def _cmd_auto(args):
-    if args.config:
-        cfg = load_config(args.config)
-    else:
-        cfg = parse_config({"name": Path(args.repo).name, "repo": args.repo,
-                            "task_prompt": args.prompt, "model": args.model})
-    status = prepare_branches(cfg)
-    if status["needs_deep_scan"]:
-        _interactive_deep_scan(cfg)
-    result = run_benchmark(cfg)
-    _print_summary(result)
-
-
-def _interactive_deep_scan(cfg):
-    treatment = cfg.branches["treatment"]
-    print("\n" + "=" * 70)
-    print("Archie not found in this repo. Semi-automatic prep:")
-    print(f"  1. In a terminal: git checkout {treatment}")
-    print(f"  2. Install Archie:  npx @bitraptors/archie {cfg.repo}")
-    print("  3. In Claude Code on that branch, run:  /archie-deep-scan")
-    print("  4. Commit the generated files.")
-    print("This deep-scan is NOT counted in the benchmark metrics.")
-    print("=" * 70)
-    input("Press Enter once the treatment branch has committed Archie files... ")
-    # verify
-    from .orchestrator import _git_out, _archie_present  # local import to avoid cycle noise
-    current = _git_out(["rev-parse", "--abbrev-ref", "HEAD"], cfg.repo)
-    _git_out(["checkout", treatment], cfg.repo)
-    present = _archie_present(cfg.repo)
-    _git_out(["checkout", current], cfg.repo)
-    if not present:
-        print("ERROR: no Archie files found on the treatment branch. Aborting.", file=sys.stderr)
-        sys.exit(1)
-
-
-def main(argv=None):
-    parser = argparse.ArgumentParser(prog="archie-benchmark",
-                                     description="Measure Archie effectiveness (control vs treatment).")
-    sub = parser.add_subparsers(dest="command", required=True)
-
-    p_run = sub.add_parser("run", help="run benchmark on existing branches")
-    p_run.add_argument("config", help="path to benchmark config JSON")
-    p_run.set_defaults(func=_cmd_run)
-
-    p_prep = sub.add_parser("prep", help="create/refresh benchmark branches only")
-    p_prep.add_argument("config", help="path to benchmark config JSON")
-    p_prep.set_defaults(func=_cmd_prep)
-
-    p_auto = sub.add_parser("auto", help="prep branches then run, from a plain repo")
-    p_auto.add_argument("repo", nargs="?", help="repo path (when no --config)")
-    p_auto.add_argument("--config", help="path to benchmark config JSON")
-    p_auto.add_argument("--prompt", help="task prompt (when no --config)")
-    p_auto.add_argument("--model", default="claude-sonnet-4-6")
-    p_auto.set_defaults(func=_cmd_auto)
-
-    args = parser.parse_args(argv)
-    args.func(args)
-
-
-if __name__ == "__main__":
-    main()
-```
-
-- [ ] **Step 6: Add a CLI smoke test**
-
-```python
-# append to tests/benchmark/test_prepare.py
-def test_cli_run_invokes_benchmark(tmp_path, monkeypatch):
-    import json
-    from archie.benchmark import cli
-    repo = _repo(tmp_path, with_archie=True)
-    cfg_file = tmp_path / "c.json"
-    cfg_file.write_text(json.dumps({
-        "name": "d", "repo": str(repo), "task_prompt": "x", "model": "m",
-        "branches": {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"},
-    }))
-    called = {}
-    monkeypatch.setattr(cli, "run_benchmark",
-                        lambda cfg: called.setdefault("ran", True) or {
-                            "aggregate": {"treatment": {"n": 0, "completed_n": 0,
-                                "cost_usd_mean": None, "tool_calls_mean": None,
-                                "duration_ms_mean": None, "quality_mean": None},
-                                "control": {"n": 0, "completed_n": 0, "cost_usd_mean": None,
-                                "tool_calls_mean": None, "duration_ms_mean": None,
-                                "quality_mean": None},
-                                "savings": {"cost_pct": None, "tool_calls_pct": None,
-                                "duration_pct": None}},
-                            "store": {"mode": "offline"}})
-    cli.main(["run", str(cfg_file)])
-    assert called["ran"] is True
-```
-
-- [ ] **Step 7: Run all benchmark tests**
-
-Run: `python -m pytest tests/benchmark/ -v`
-Expected: PASS (all tasks' tests green)
-
-- [ ] **Step 8: Commit**
-
-```bash
-git add archie/benchmark/orchestrator.py archie/benchmark/cli.py tests/benchmark/test_prepare.py
-git commit -m "feat(benchmark): branch prep (strip Archie for control) + CLI (run/prep/auto)"
-```
-
----
-
-### Task 12: Docs + full suite
-
-**Files:**
-- Create: `archie/benchmark/README.md`
-- Modify: `CLAUDE.md` (add a short "Benchmark Harness" section under Commands)
-
-- [ ] **Step 1: Write `archie/benchmark/README.md`**
-
-```markdown
-# Archie Benchmark Harness (internal)
-
-Measures Archie's effectiveness: runs the **same** task headlessly on a control
-branch (no Archie) and a treatment branch (full Archie docs + hooks), capturing
-tool calls / tokens / cost / time + a blind judge-Claude quality score, and writes
-results to Supabase. **Not** shipped via npm.
-
-## Usage
-
-```bash
-# 1. Author a config (see example below) — JSON, zero-dep.
-# 2. From a plain repo, prep branches then run:
-python3 -m archie.benchmark auto /path/to/repo --prompt "Add a sleep timer feature"
-
-# Or with a config file:
-python3 -m archie.benchmark run config.json     # branches must already exist
-python3 -m archie.benchmark prep config.json    # only create/refresh branches
-```
-
-If the repo has no Archie files yet, `auto`/`prep` create the branches, then pause
-so you can run `/archie-deep-scan` interactively on the treatment branch. That
-deep-scan is **never** counted in the measured metrics.
-
-## Config
-
-```json
-{
-  "name": "bedtime-add-sleep-timer",
-  "repo": "/Users/you/DEV/BedtimeApp",
-  "task_prompt": "Add a sleep timer feature ...",
-  "model": "claude-sonnet-4-6",
-  "repetitions": 3,
-  "branches": {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"},
-  "judge": {"model": "claude-opus-4-8", "rubric": ["correctness", "completeness", "follows_conventions", "no_regressions"]},
-  "timeout_seconds": 3600
-}
-```
-
-## Supabase
-
-Set `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` in the environment. Without them the
-harness writes `.archie/benchmark/<name>/results.json` locally (offline mode).
-Apply `archie/benchmark/schema.sql` to the project once.
-
-## Fairness invariants
-
-- Identical `task_prompt`, `model`, and harness flags on both arms.
-- Both branches descend from the same base commit (enforced).
-- Deep-scan prep cost is separate (`prep_cost_usd`), never in sample metrics.
-```
-
-- [ ] **Step 2: Add a section to `CLAUDE.md`** (under "Commands", after "Tests")
-
-```markdown
-### Benchmark Harness (internal)
-```bash
-# Measure Archie effectiveness: same task, control (no Archie) vs treatment (full Archie)
-python3 -m archie.benchmark auto /path/to/repo --prompt "..."   # prep + run from a plain repo
-python3 -m archie.benchmark run config.json                     # run on existing branches
-```
-Internal-only (not shipped via npm). Captures tool calls / tokens / cost / time +
-blind judge-Claude quality, writes to Supabase (`benchmark_runs`, `benchmark_samples`).
-See `archie/benchmark/README.md`.
-```
-
-- [ ] **Step 3: Run the full project test suite**
-
-Run: `python -m pytest tests/ -v`
-Expected: PASS (existing tests + all `tests/benchmark/` tests)
-
-- [ ] **Step 4: Run the sync checker** (benchmark is internal, so it must NOT trip sync)
-
-Run: `python3 scripts/verify_sync.py`
-Expected: PASS — confirms no accidental npm-package coupling
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add archie/benchmark/README.md CLAUDE.md
-git commit -m "docs(benchmark): README + CLAUDE.md usage section"
-```
-
----
-
-## Self-Review Notes (completed by plan author)
-
-- **Spec coverage:** §1 purpose → Tasks 5/6/7/10; §4 config → Task 1; §5 data flow → Tasks 2/3/4/5/10; §6 auto/prep → Task 11; §7 judge → Task 6; §8 schema/store → Tasks 8/9; §9 error/fairness/cleanup → Tasks 4/10/11; §10 testing → every task is TDD. All covered.
-- **Type consistency:** `SampleMetrics` fields, judge result dict keys (`treatment`/`control`/`overall`), and sample-row keys match across `metrics.py`, `judge.py`, `orchestrator._sample_row`, `aggregate.py`, `store.py`, and `schema.sql`.
-- **Out of scope (per spec §11):** website display, end-user shipped mode, headless deep-scan, Agent SDK — none included.
-- **Open implementation note:** `prep_cost_usd` stays `None` in v1 (best-effort per spec); a follow-up can read `.archie/telemetry/` after the interactive deep-scan to populate it.
-```
diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/benchmark/test_aggregate.py b/tests/benchmark/test_aggregate.py
new file mode 100644
index 00000000..81f36b41
--- /dev/null
+++ b/tests/benchmark/test_aggregate.py
@@ -0,0 +1,74 @@
+from archie.benchmark.aggregate import aggregate_samples
+
+
+def _s(arm, cost, tools, quality, completed=True):
+    """Build one sample row; duration and token counts are fixed fillers."""
+    return dict(arm=arm, cost_usd=cost, tool_calls=tools, duration_ms=1000, input_tokens=10,
+                output_tokens=20, quality_score=quality, completed=completed)
+
+
+def test_per_arm_means():
+    """Each arm's means are computed from that arm's samples only."""
+    treatment_rows = [_s("treatment", 1.0, 10, 8.0), _s("treatment", 3.0, 20, 9.0)]
+    control_rows = [_s("control", 2.0, 30, 6.0), _s("control", 4.0, 40, 7.0)]
+    result = aggregate_samples(treatment_rows + control_rows)
+
+    treatment = result["treatment"]
+    # (1.0 + 3.0) / 2, (10 + 20) / 2, (8.0 + 9.0) / 2
+    assert treatment["cost_usd_mean"] == 2.0
+    assert treatment["tool_calls_mean"] == 15.0
+    assert treatment["quality_mean"] == 8.5
+    assert result["control"]["cost_usd_mean"] == 3.0
+    assert treatment["n"] == 2
+    assert treatment["completed_n"] == 2
+
+
+def test_savings_percentages():
+    """Savings are the treatment's percentage reduction relative to control."""
+    pair = [_s("treatment", 1.0, 10, 8.0), _s("control", 2.0, 20, 8.0)]
+    savings = aggregate_samples(pair)["savings"]
+    assert savings["cost_pct"] == 50.0        # cost 1.0 vs 2.0
+    assert savings["tool_calls_pct"] == 50.0  # tool calls 10 vs 20
+
+
+def test_quality_ignores_none_scores():
+    """A sample with no quality score is left out of the arm's quality mean."""
+    unscored = _s("treatment", 1.0, 10, None, completed=False)
+    scored = _s("treatment", 1.0, 10, 8.0)
+    control = _s("control", 1.0, 10, 6.0)
+    treatment = aggregate_samples([unscored, scored, control])["treatment"]
+    # only the scored sample contributes, so the mean is its score
+    assert treatment["quality_mean"] == 8.0
+    assert treatment["completed_n"] == 1
+
+
+def test_handles_empty_arm():
+    """An arm with no samples reports n == 0, a None cost mean and a None cost saving."""
+    result = aggregate_samples([_s("treatment", 1.0, 10, 8.0)])
+    empty_arm = result["control"]
+    assert empty_arm["n"] == 0 and empty_arm["cost_usd_mean"] is None
+    assert result["savings"]["cost_pct"] is None
+
+
+def test_attempted_n_and_quality_excludes_not_attempted():
+    """Samples flagged attempted=False count toward neither attempted_n nor quality."""
+    attempted = {"arm": "treatment", "cost_usd": 1.0, "tool_calls": 9,
+                 "duration_ms": 100, "input_tokens": 1, "output_tokens": 1,
+                 "quality_score": 8.0, "completed": True, "attempted": True}
+    not_attempted = {"arm": "control", "cost_usd": 0.5, "tool_calls": 28,
+                     "duration_ms": 100, "input_tokens": 1, "output_tokens": 1,
+                     "quality_score": 1.0, "completed": True, "attempted": False}
+    result = aggregate_samples([attempted, not_attempted])
+    treatment, control = result["treatment"], result["control"]
+    assert treatment["attempted_n"] == 1
+    assert control["attempted_n"] == 0
+    # control's score of 1.0 came from an empty diff, so it is dropped from the mean
+    assert treatment["quality_mean"] == 8.0
+    assert control["quality_mean"] is None
+
+
+def test_attempted_defaults_true_for_legacy_samples():
+    """A sample lacking the 'attempted' key is treated as attempted (back-compat)."""
+    treatment = aggregate_samples([_s("treatment", 1.0, 10, 8.0)])["treatment"]
+    assert treatment["attempted_n"] == 1
+    assert treatment["quality_mean"] == 8.0
diff --git a/tests/benchmark/test_config.py b/tests/benchmark/test_config.py
new file mode 100644
index 00000000..68bd72fa
--- /dev/null
+++ b/tests/benchmark/test_config.py
@@ -0,0 +1,70 @@
+# tests/benchmark/test_config.py
+import json
+import pytest
+from pathlib import Path
+from archie.benchmark.config import parse_config, load_config, BenchmarkConfig
+
+
+def _valid():
+    """Return a fresh config dict holding only the four required keys."""
+    return dict(
+        name="demo",
+        repo="/tmp/repo",
+        task_prompt="Add a feature",
+        model="claude-sonnet-4-6")
+
+
+def test_parse_minimal_applies_defaults():
+    """Optional settings get their defaults when only required keys are supplied."""
+    parsed = parse_config(_valid())
+    assert isinstance(parsed, BenchmarkConfig) and isinstance(parsed.repo, Path)
+    assert (parsed.repetitions, parsed.timeout_seconds) == (3, 3600)
+    default_branches = {"treatment": "archie-bench/with-archie",
+                        "control": "archie-bench/no-archie"}
+    assert parsed.branches == default_branches
+    assert parsed.judge.model == "claude-opus-4-8" and "correctness" in parsed.judge.rubric
+
+
+def test_parse_overrides():
+    """Explicit values for the optional settings replace the defaults."""
+    overrides = {
+        "repetitions": 5,
+        "timeout_seconds": 1200,
+        "branches": {"treatment": "t", "control": "c"},
+        "judge": {"model": "m", "rubric": ["x"]},
+    }
+    parsed = parse_config({**_valid(), **overrides})
+    assert parsed.repetitions == 5
+    assert parsed.timeout_seconds == 1200
+    assert parsed.branches == {"treatment": "t", "control": "c"}
+    assert parsed.judge.model == "m"
+    assert parsed.judge.rubric == ["x"]
+
+
+@pytest.mark.parametrize("missing", ["name", "repo", "task_prompt", "model"])
+def test_missing_required_raises(missing):
+    """Dropping any single required key makes parse_config raise ValueError."""
+    incomplete = {key: value for key, value in _valid().items() if key != missing}
+    with pytest.raises(ValueError, match="required"):
+        parse_config(incomplete)
+
+
+def test_repetitions_must_be_positive():
+    """A repetitions value of zero is rejected with a ValueError."""
+    zero_reps = {**_valid(), "repetitions": 0}
+    with pytest.raises(ValueError, match="repetitions"):
+        parse_config(zero_reps)
+
+
+def test_branches_missing_arm_raises():
+    """A branches mapping without a 'control' entry is rejected with a ValueError."""
+    one_armed = {**_valid(), "branches": {"treatment": "t"}}
+    with pytest.raises(ValueError, match="control"):
+        parse_config(one_armed)
+
+
+def test_load_config_reads_file(tmp_path):
+    """load_config parses a JSON config file from disk."""
+    config_file = tmp_path / "c.json"
+    config_file.write_text(json.dumps(_valid()))
+    assert load_config(config_file).name == "demo"
diff --git a/tests/benchmark/test_diff.py b/tests/benchmark/test_diff.py
new file mode 100644
index 00000000..7d0891c8
--- /dev/null
+++ b/tests/benchmark/test_diff.py
@@ -0,0 +1,61 @@
+import subprocess
+from archie.benchmark.diff import capture_diff
+
+
+def _git(args, cwd):  # run `git <args>` in cwd; check=True raises on a non-zero exit
+    subprocess.run(["git"] + list(args), cwd=cwd, text=True, capture_output=True, check=True)
+
+
+def _init_repo(path):
+    """Create a git repo at path with a throwaway identity and one committed file."""
+    for setup in (["init"], ["config", "user.email", "t@t.t"], ["config", "user.name", "t"]):
+        _git(setup, path)
+    (path / "a.txt").write_text("one\n")
+    for step in (["add", "-A"], ["commit", "-m", "init"]):
+        _git(step, path)
+
+
+def test_captures_modified_and_untracked(tmp_path):
+    """The diff covers both an edit to a tracked file and a new untracked file."""
+    _init_repo(tmp_path)
+    # a.txt is already committed by _init_repo; b.txt is new to the repo
+    changes = {"a.txt": "one\ntwo\n", "b.txt": "new file\n"}
+    for filename, content in changes.items():
+        (tmp_path / filename).write_text(content)
+    captured = capture_diff(tmp_path)
+    assert all(needle in captured for needle in ("a.txt", "two", "b.txt", "new file"))
+
+
+def test_empty_diff_when_no_changes(tmp_path):
+    """A committed repo with no further edits yields a diff that strips to empty."""
+    _init_repo(tmp_path)
+    assert capture_diff(tmp_path).strip() == ""
+
+
+def test_excludes_build_and_cache_noise(tmp_path):
+    """Bytecode caches, .DS_Store and node_modules stay out of the captured diff."""
+    _init_repo(tmp_path)
+    # the one real source change that must show up
+    (tmp_path / "calc.py").write_text("def f():\n    return 1\n")
+
+    # build/cache noise; this repo has no .gitignore to hide it
+    bytecode_dir = tmp_path / "__pycache__"
+    bytecode_dir.mkdir()
+    (bytecode_dir / "calc.cpython-311.pyc").write_text("BYTECODE")
+    (tmp_path / ".DS_Store").write_text("junk")
+    package_dir = tmp_path / "node_modules" / "left-pad"
+    package_dir.mkdir(parents=True)
+    (package_dir / "index.js").write_text("module.exports = 1\n")
+
+    captured = capture_diff(tmp_path)
+    assert "calc.py" in captured
+    noise_markers = ("__pycache__", "calc.cpython-311.pyc", ".DS_Store", "node_modules")
+    for marker in noise_markers:
+        assert marker not in captured
+
+
+def test_still_includes_plain_untracked(tmp_path):
+    """An ordinary untracked file still appears in the captured diff."""
+    _init_repo(tmp_path)
+    (tmp_path / "b.txt").write_text("new file\n")
+    assert "b.txt" in capture_diff(tmp_path)
diff --git a/tests/benchmark/test_isolation.py b/tests/benchmark/test_isolation.py
new file mode 100644
index 00000000..7fff32b7
--- /dev/null
+++ b/tests/benchmark/test_isolation.py
@@ -0,0 +1,49 @@
+# tests/benchmark/test_isolation.py
+import subprocess
+from pathlib import Path
+from archie.benchmark.isolation import worktree, prune
+
+
+def _git(args, cwd):  # run `git <args>` in cwd; check=True raises on a non-zero exit
+    subprocess.run(["git"] + list(args), cwd=cwd, text=True, capture_output=True, check=True)
+
+
+def _init_repo(path):
+    """Create a repo at path with one commit and an extra 'feature' branch."""
+    for setup in (["init"], ["config", "user.email", "t@t.t"], ["config", "user.name", "t"]):
+        _git(setup, path)
+    (path / "a.txt").write_text("one\n")
+    # commit first: the 'feature' branch is created from this initial commit
+    for step in (["add", "-A"], ["commit", "-m", "init"], ["branch", "feature"]):
+        _git(step, path)
+
+
+def test_worktree_created_and_removed(tmp_path):
+    """The worktree exists with the repo's files inside the block and is gone after."""
+    repo, dest = tmp_path / "repo", tmp_path / "wt"
+    repo.mkdir()
+    _init_repo(repo)
+    with worktree(repo, "feature", dest) as checkout:
+        checkout_dir = Path(checkout)
+        assert checkout_dir.exists() and (checkout_dir / "a.txt").exists()
+    assert not Path(dest).exists()
+
+
+def test_worktree_removed_on_exception(tmp_path):
+    """The worktree directory is cleaned up even when the with-body raises."""
+    repo, dest = tmp_path / "repo", tmp_path / "wt"
+    repo.mkdir()
+    _init_repo(repo)
+    try:
+        with worktree(repo, "feature", dest):
+            raise RuntimeError("boom")
+    except RuntimeError:
+        pass  # the error is expected; only the cleanup below is under test
+    assert not Path(dest).exists()
+
+
+def test_prune_runs_without_error(tmp_path):  # smoke test: passes if prune() returns normally
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    _init_repo(repo)
+    prune(repo)  # must not raise
diff --git a/tests/benchmark/test_judge.py b/tests/benchmark/test_judge.py
new file mode 100644
index 00000000..4351ee7f
--- /dev/null
+++ b/tests/benchmark/test_judge.py
@@ -0,0 +1,76 @@
+# tests/benchmark/test_judge.py
+"""Tests for the blind pairwise judge in ``archie.benchmark.judge``."""
+import json
+import pytest
+from archie.benchmark import judge
+
+
+def test_assign_order_is_seed_deterministic():
+    """Even seeds yield ("a", "b"); odd seeds yield ("b", "a")."""
+    expected_by_seed = {0: ("a", "b"), 2: ("a", "b"), 1: ("b", "a"), 3: ("b", "a")}
+    for seed, expected in expected_by_seed.items():
+        assert judge.assign_order(seed) == expected
+
+
+def test_parse_judge_output_extracts_embedded_json():
+    """A JSON object surrounded by chatter is still found and parsed."""
+    reply = 'Here is my verdict:\n{"variant_a": {"overall": 8}, "variant_b": {"overall": 5}}\nThanks'
+    verdict = judge.parse_judge_output(reply)
+    assert verdict["variant_a"]["overall"] == 8
+
+
+def test_parse_judge_output_raises_without_json():
+    """Output with no JSON object raises ValueError whose message matches "JSON"."""
+    with pytest.raises(ValueError, match="JSON"):
+        judge.parse_judge_output("no json here")
+
+
+def test_run_judge_maps_variants_to_arms_seed_even():
+    """With an even seed, variant_a's scores are reported as the treatment arm."""
+    # seed even -> treatment is variant_a
+    verdict_json = json.dumps({"variant_a": {"overall": 9}, "variant_b": {"overall": 4}})
+    seen = []
+
+    def recording_runner(prompt, model, timeout):
+        seen.append((prompt, model))
+        return verdict_json
+
+    result = judge.run_judge("task", "TREAT_DIFF", "CTRL_DIFF",
+                             rubric=["correctness"], model="m", seed=0,
+                             _runner=recording_runner)
+    assert result["treatment"]["overall"] == 9
+    assert result["control"]["overall"] == 4
+    assert result["seed"] == 0
+    # variant A diff (shown first) must be the treatment diff for an even seed
+    first_prompt = seen[0][0]
+    assert first_prompt.index("TREAT_DIFF") < first_prompt.index("CTRL_DIFF")
+
+
+def test_run_judge_maps_variants_to_arms_seed_odd():
+    """With an odd seed, variant_b's scores are reported as the treatment arm."""
+    # seed odd -> treatment is variant_b
+    verdict_json = json.dumps({"variant_a": {"overall": 3}, "variant_b": {"overall": 7}})
+    result = judge.run_judge("task", "TREAT_DIFF", "CTRL_DIFF",
+                             rubric=["correctness"], model="m", seed=1,
+                             _runner=lambda p, m, t: verdict_json)
+    assert result["treatment"]["overall"] == 7
+    assert result["control"]["overall"] == 3
+
+
+def test_run_judge_retries_once_on_bad_json():
+    """An unparseable first reply triggers exactly one retry, which then succeeds."""
+    replies = ["garbage", json.dumps({"variant_a": {"overall": 6}, "variant_b": {"overall": 6}})]
+
+    def flaky(prompt, model, timeout):
+        return replies.pop(0)
+
+    result = judge.run_judge("task", "A", "B", rubric=["c"], model="m", seed=0, _runner=flaky)
+    assert result["treatment"]["overall"] == 6
+    assert replies == []  # both replies consumed -> retried exactly once
+
+
+def test_run_judge_raises_after_two_failures():
+    """A runner that never returns JSON makes run_judge raise ValueError."""
+    with pytest.raises(ValueError):
+        judge.run_judge("task", "A", "B", rubric=["c"], model="m", seed=0,
+                        _runner=lambda p, m, t: "still garbage")
diff --git a/tests/benchmark/test_metrics.py b/tests/benchmark/test_metrics.py
new file mode 100644
index 00000000..5362be11
--- /dev/null
+++ b/tests/benchmark/test_metrics.py
@@ -0,0 +1,86 @@
+# tests/benchmark/test_metrics.py
+"""Tests for stream-json parsing in ``archie.benchmark.metrics``."""
+import json
+from archie.benchmark.metrics import parse_stream, SampleMetrics
+
+
+def _assistant(blocks):
+    """Serialize an ``assistant`` stream event whose message content is ``blocks``."""
+    event = {"type": "assistant", "message": {"content": blocks}}
+    return json.dumps(event)
+
+
+def _tool_use(name):
+    """Build a ``tool_use`` content block for the tool called ``name``."""
+    return {"type": "tool_use", "name": name, "id": "x", "input": {}}
+
+
+def _result(subtype="success"):
+    """Serialize a ``result`` stream event with fixed cost/duration/turn/usage values."""
+    usage = {
+        "input_tokens": 100,
+        "output_tokens": 200,
+        "cache_read_input_tokens": 50,
+        "cache_creation_input_tokens": 25,
+    }
+    return json.dumps({
+        "type": "result",
+        "subtype": subtype,
+        "total_cost_usd": 0.1234,
+        "duration_ms": 5000,
+        "num_turns": 7,
+        "usage": usage,
+    })
+
+
+def test_counts_tools_and_breakdown():
+    """Every tool_use block is counted, in total and per tool name."""
+    events = [
+        json.dumps({"type": "system", "subtype": "init"}),
+        _assistant([{"type": "text", "text": "hi"}, _tool_use("Read")]),
+        _assistant([_tool_use("Edit"), _tool_use("Edit")]),
+        _result(),
+    ]
+    m = parse_stream(events)
+    assert m.tool_calls == 3
+    assert m.tool_breakdown == {"Read": 1, "Edit": 2}
+
+
+def test_extracts_result_fields():
+    """Token, cost, duration and turn fields are taken from the result event."""
+    m = parse_stream([_result()])
+    assert m.input_tokens == 100
+    assert m.output_tokens == 200
+    assert m.cache_read_tokens == 50
+    assert m.cache_creation_tokens == 25
+    assert m.cost_usd == 0.1234
+    assert m.duration_ms == 5000
+    assert m.num_turns == 7
+    assert m.completed is True
+
+
+def test_error_result_not_completed():
+    """A result event with the ``error_max_turns`` subtype leaves ``completed`` False."""
+    m = parse_stream([_result(subtype="error_max_turns")])
+    assert m.completed is False
+
+
+def test_zero_tool_run():
+    """A run with only text blocks reports zero tool calls and an empty breakdown."""
+    m = parse_stream([_assistant([{"type": "text", "text": "done"}]), _result()])
+    assert m.tool_calls == 0
+    assert m.tool_breakdown == {}
+
+
+def test_ignores_blank_and_malformed_lines():
+    """Blank, whitespace-only and non-JSON lines are skipped rather than raising."""
+    m = parse_stream(["", "  ", "not json", _result()])
+    assert m.completed is True
+
+
+def test_no_result_event_defaults():
+    """Without a result event, tool calls are still counted; completed and cost stay at defaults."""
+    m = parse_stream([_assistant([_tool_use("Bash")])])
+    assert m.tool_calls == 1
+    assert m.completed is False
+    assert m.cost_usd == 0.0
diff --git a/tests/benchmark/test_orchestrator.py b/tests/benchmark/test_orchestrator.py
new file mode 100644
index 00000000..1beb3a12
--- /dev/null
+++ b/tests/benchmark/test_orchestrator.py
@@ -0,0 +1,217 @@
+# tests/benchmark/test_orchestrator.py
+"""Tests for ``archie.benchmark.orchestrator``: run matrix, fairness guard, failure handling."""
+import contextlib
+import subprocess
+import pytest
+from archie.benchmark.config import BenchmarkConfig, JudgeConfig
+from archie.benchmark.metrics import SampleMetrics
+from archie.benchmark import orchestrator
+
+
+def _cfg(tmp_path, reps=2):
+    """Build a minimal BenchmarkConfig whose repo is ``tmp_path``, with ``reps`` repetitions."""
+    return BenchmarkConfig(
+        name="demo", repo=tmp_path, task_prompt="do it",
+        model="m", branches={"treatment": "treatment", "control": "control"},
+        repetitions=reps, judge=JudgeConfig(model="jm", rubric=["correctness"]),
+        timeout_seconds=60,
+    )
+
+
+def _fake_run(metrics_by_branch):
+    """Return ``(run_fn, seen)``: a stub runner and the record of its calls.
+
+    ``run_fn`` returns ``(metrics_by_branch[branch], "raw")`` and appends
+    ``(branch, prompt, model)`` to ``seen["calls"]``.
+    """
+    seen = {"calls": []}
+
+    def run_fn(prompt, model, cwd, timeout):
+        # the stubbed worktree encodes the branch name in its path
+        branch = "treatment" if "treatment" in str(cwd) else "control"
+        seen["calls"].append((branch, prompt, model))
+        return metrics_by_branch[branch], "raw"
+
+    return run_fn, seen
+
+
+def _stub_git_layer(monkeypatch, tmp_path, merge_base="same", base_commit="base"):
+    """Replace the orchestrator's git touchpoints so tests need no real repo.
+
+    Patches ``_merge_base`` / ``_base_commit`` to return the given values,
+    ``prune`` to a no-op, and ``worktree`` to a context manager that yields
+    ``tmp_path / "wt-<branch>"`` so stubs can tell the arms apart by path.
+    """
+    @contextlib.contextmanager
+    def fake_worktree(repo, branch, dest):
+        yield tmp_path / ("wt-" + branch)
+
+    monkeypatch.setattr(orchestrator, "_merge_base", lambda repo, a, b: merge_base)
+    monkeypatch.setattr(orchestrator, "_base_commit", lambda repo: base_commit)
+    monkeypatch.setattr(orchestrator, "worktree", fake_worktree)
+    monkeypatch.setattr(orchestrator, "prune", lambda repo: None)
+
+
+def test_run_benchmark_builds_matrix_and_aggregates(tmp_path, monkeypatch):
+    """Each rep runs both arms, is judged once as a pair, and is stored and aggregated."""
+    # neutralize real worktree/diff/base-commit side effects
+    _stub_git_layer(monkeypatch, tmp_path, merge_base="abc123", base_commit="abc123")
+
+    t_metrics = SampleMetrics(tool_calls=5, cost_usd=1.0, duration_ms=100,
+                              input_tokens=10, output_tokens=20, completed=True)
+    c_metrics = SampleMetrics(tool_calls=12, cost_usd=3.0, duration_ms=300,
+                              input_tokens=30, output_tokens=40, completed=True)
+    run_fn, seen = _fake_run({"treatment": t_metrics, "control": c_metrics})
+
+    judged = {"calls": 0}
+    def judge_fn(task, t_diff, c_diff, rubric, model, seed):
+        judged["calls"] += 1
+        return {"treatment": {"overall": 9.0}, "control": {"overall": 5.0}, "seed": seed}
+
+    stored = {}
+    def store_fn(run_row, sample_rows, offline_path):
+        stored["run"] = run_row
+        stored["samples"] = sample_rows
+        return {"mode": "offline", "path": str(offline_path)}
+
+    result = orchestrator.run_benchmark(
+        _cfg(tmp_path, reps=2),
+        run_fn=run_fn, judge_fn=judge_fn, store_fn=store_fn,
+        diff_fn=lambda wt: f"diff:{wt}",
+    )
+
+    # 2 reps x 2 arms = 4 runs; 2 reps = 2 pairwise judge calls
+    assert len(seen["calls"]) == 4
+    assert judged["calls"] == 2
+    assert len(stored["samples"]) == 4
+    # quality assigned per arm
+    t_samples = [s for s in stored["samples"] if s["arm"] == "treatment"]
+    assert all(s["quality_score"] == 9.0 for s in t_samples)
+    # aggregate shows treatment cheaper
+    assert result["aggregate"]["savings"]["cost_pct"] > 0
+    # prompt identical across all runs
+    assert len({c[1] for c in seen["calls"]}) == 1
+
+
+def test_fairness_guard_rejects_unrelated_branches(tmp_path, monkeypatch):
+    """A failing merge-base lookup surfaces as ValueError matching "common ancestor"."""
+    # No common ancestor -> git merge-base exits non-zero.
+    def no_common_ancestor(repo, a, b):
+        raise subprocess.CalledProcessError(128, ["git", "merge-base", a, b])
+    monkeypatch.setattr(orchestrator, "_merge_base", no_common_ancestor)
+    with pytest.raises(ValueError, match="common ancestor"):
+        orchestrator.run_benchmark(_cfg(tmp_path), run_fn=lambda *a: None,
+                                   judge_fn=lambda *a, **k: None,
+                                   store_fn=lambda *a: None, diff_fn=lambda w: "")
+
+
+def test_failed_sample_does_not_sink_run(tmp_path, monkeypatch):
+    """A runner crash on one arm is stored as not-completed; the other arm is kept."""
+    _stub_git_layer(monkeypatch, tmp_path)
+
+    def run_fn(prompt, model, cwd, timeout):
+        if "treatment" in str(cwd):
+            raise RuntimeError("treatment crashed")
+        return SampleMetrics(completed=True, cost_usd=2.0), "raw"
+
+    stored = {}
+    orchestrator.run_benchmark(
+        _cfg(tmp_path, reps=1),
+        run_fn=run_fn,
+        judge_fn=lambda *a, **k: {"treatment": {"overall": 0}, "control": {"overall": 5}, "seed": 0},
+        store_fn=lambda r, s, p: stored.update(samples=s) or {"mode": "offline"},
+        diff_fn=lambda wt: "",
+    )
+    # treatment sample recorded as not-completed; control still present
+    arms = {s["arm"]: s for s in stored["samples"]}
+    assert arms["treatment"]["completed"] is False
+    assert arms["control"]["completed"] is True
+
+
+def test_judge_failure_does_not_sink_run(tmp_path, monkeypatch):
+    """A judge that raises leaves both samples stored, with no quality score or detail."""
+    _stub_git_layer(monkeypatch, tmp_path)
+
+    def boom(*a, **k):
+        raise ValueError("judge returned unparseable output twice")
+
+    stored = {}
+    result = orchestrator.run_benchmark(
+        _cfg(tmp_path, reps=1),
+        run_fn=lambda p, m, cwd, t: (SampleMetrics(completed=True, cost_usd=1.0), "raw"),
+        judge_fn=boom,
+        store_fn=lambda r, s, p: stored.update(samples=s) or {"mode": "offline"},
+        diff_fn=lambda wt: "",
+    )
+    # both samples still recorded and stored, with no quality score
+    assert len(stored["samples"]) == 2
+    assert all(s["quality_score"] is None for s in stored["samples"])
+    assert all(s["quality_detail"] is None for s in stored["samples"])
+    # the run still produced an aggregate
+    assert result["aggregate"]["treatment"]["n"] == 1
+
+
+def _git(args, cwd):
+    """Run ``git <args>`` in ``cwd``; raise CalledProcessError on a non-zero exit."""
+    subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True)
+
+
+def test_prepared_branches_pass_fairness_guard(tmp_path):
+    """Branches made by prepare_branches run end-to-end against a real git repo."""
+    # Regression: prepare_branches adds a strip commit to the control branch, so
+    # the two branch TIPS differ. The fairness guard must compare the merge-base
+    # (common ancestor), not the tips, and therefore must NOT raise here.
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    _git(["init"], repo)
+    _git(["config", "user.email", "t@t.t"], repo)
+    _git(["config", "user.name", "t"], repo)
+    (repo / "CLAUDE.md").write_text("# conventions\n")
+    (repo / "calc.py").write_text("def add(a, b):\n    return a + b\n")
+    _git(["add", "-A"], repo)
+    _git(["commit", "-m", "init"], repo)
+
+    cfg = BenchmarkConfig(
+        name="reg", repo=repo, task_prompt="do it", model="m",
+        branches={"treatment": "archie-bench/with-archie",
+                  "control": "archie-bench/no-archie"},
+        repetitions=1, judge=JudgeConfig(), timeout_seconds=60,
+    )
+    status = orchestrator.prepare_branches(cfg)
+    assert status["archie_present"] is True  # control got an extra strip commit
+
+    # Real worktree creation; only run_fn/judge_fn/store_fn/diff_fn are stubbed.
+    result = orchestrator.run_benchmark(
+        cfg,
+        run_fn=lambda p, m, cwd, t: (SampleMetrics(completed=True, cost_usd=1.0), "raw"),
+        judge_fn=lambda *a, **k: {"treatment": {"overall": 8},
+                                  "control": {"overall": 6}, "seed": 0},
+        store_fn=lambda r, s, p: {"mode": "offline", "run": r},
+        diff_fn=lambda wt: "diff",
+    )
+    assert result["aggregate"]["treatment"]["n"] == 1
+    assert result["aggregate"]["control"]["n"] == 1
+    # git_base_commit is the shared merge-base (a real 40-char sha)
+    assert len(result["run"]["git_base_commit"]) == 40
+
+
+def test_empty_diff_marks_not_attempted(tmp_path, monkeypatch):
+    """A whitespace-only diff marks that arm's sample as not attempted."""
+    _stub_git_layer(monkeypatch, tmp_path)
+
+    # treatment produces a diff; control produces an empty (whitespace-only) diff
+    def diff_fn(wt):
+        return "real change\n" if "treatment" in str(wt) else "   \n"
+
+    stored = {}
+    orchestrator.run_benchmark(
+        _cfg(tmp_path, reps=1),
+        run_fn=lambda p, m, cwd, t: (SampleMetrics(completed=True, cost_usd=1.0), "raw"),
+        judge_fn=lambda *a, **k: {"treatment": {"overall": 7},
+                                  "control": {"overall": 1}, "seed": 0},
+        store_fn=lambda r, s, p: stored.update(samples=s),
+        diff_fn=diff_fn,
+    )
+    arms = {s["arm"]: s for s in stored["samples"]}
+    assert arms["treatment"]["attempted"] is True
+    assert arms["control"]["attempted"] is False
diff --git a/tests/benchmark/test_prepare.py b/tests/benchmark/test_prepare.py
new file mode 100644
index 00000000..c6333ebe
--- /dev/null
+++ b/tests/benchmark/test_prepare.py
@@ -0,0 +1,131 @@
+# tests/benchmark/test_prepare.py
+"""Tests for ``orchestrator.prepare_branches`` and the ``run`` CLI subcommand."""
+import subprocess
+import pytest
+from archie.benchmark.config import BenchmarkConfig, JudgeConfig
+from archie.benchmark import orchestrator as orch
+
+
+def _git(args, cwd):
+    """Run ``git <args>`` in ``cwd``; raise CalledProcessError on a non-zero exit."""
+    subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True)
+
+
+def _repo(tmp_path, with_archie):
+    """Create and return a one-commit repo under ``tmp_path``.
+
+    When ``with_archie`` is true the commit also contains ``CLAUDE.md`` and
+    ``.claude/settings.json``.
+    """
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    _git(["init"], repo)
+    _git(["config", "user.email", "t@t.t"], repo)
+    _git(["config", "user.name", "t"], repo)
+    (repo / "src.py").write_text("print('hi')\n")
+    if with_archie:
+        (repo / "CLAUDE.md").write_text("# context\n")
+        (repo / ".claude").mkdir()
+        (repo / ".claude" / "settings.json").write_text("{}\n")
+    _git(["add", "-A"], repo)
+    _git(["commit", "-m", "init"], repo)
+    return repo
+
+
+def _cfg(repo):
+    """Build a single-repetition BenchmarkConfig for ``repo`` using the archie-bench/* branches."""
+    return BenchmarkConfig(name="d", repo=repo, task_prompt="x", model="m",
+                           branches={"treatment": "archie-bench/with-archie",
+                                     "control": "archie-bench/no-archie"},
+                           repetitions=1, judge=JudgeConfig(), timeout_seconds=60)
+
+
+def test_clean_tree_required(tmp_path):
+    """An untracked file in the working tree makes prepare_branches raise ValueError."""
+    repo = _repo(tmp_path, with_archie=True)
+    (repo / "dirty.txt").write_text("uncommitted\n")
+    with pytest.raises(ValueError, match="clean"):
+        orch.prepare_branches(_cfg(repo))
+
+
+def test_control_branch_strips_archie_files(tmp_path):
+    """The control branch loses CLAUDE.md and .claude/ but keeps source files."""
+    repo = _repo(tmp_path, with_archie=True)
+    status = orch.prepare_branches(_cfg(repo))
+    # control branch checked out: Archie files gone
+    _git(["checkout", "archie-bench/no-archie"], repo)
+    assert not (repo / "CLAUDE.md").exists()
+    assert not (repo / ".claude").exists()
+    assert (repo / "src.py").exists()
+    assert status["archie_present"] is True
+    assert status["needs_deep_scan"] is False
+
+
+def test_control_strips_nested_claude_md_with_spaces_in_path(tmp_path):
+    """Nested CLAUDE.md files under directories with spaces are stripped too."""
+    # Regression: directory names with spaces (e.g. Xcode asset catalogs like
+    # "Button icons/") must not survive the strip. The old impl split git
+    # ls-files output on all whitespace, fragmenting space-containing paths so
+    # `git rm` silently skipped them — leaking Archie context onto the control arm.
+    repo = _repo(tmp_path, with_archie=True)
+    nested_dir = repo / "Assets" / "Button icons"
+    nested_dir.mkdir(parents=True)
+    (nested_dir / "CLAUDE.md").write_text("# Button icons\n<!-- archie:ai-start -->\n")
+    (nested_dir / "icon.txt").write_text("keep me\n")  # non-Archie file must survive
+    _git(["add", "-A"], repo)
+    _git(["commit", "-m", "add nested archie context in spaced dir"], repo)
+
+    orch.prepare_branches(_cfg(repo))
+
+    tree = subprocess.run(
+        ["git", "ls-tree", "-r", "--name-only", "archie-bench/no-archie"],
+        cwd=repo, check=True, capture_output=True, text=True)
+    leftover = tree.stdout.splitlines()
+    stray = [p for p in leftover if p.endswith("CLAUDE.md")]
+    assert not stray, f"control arm still has CLAUDE.md files: {stray}"
+    assert "Assets/Button icons/icon.txt" in leftover  # real content preserved
+
+
+def test_treatment_keeps_archie_files(tmp_path):
+    """The treatment branch still contains CLAUDE.md."""
+    repo = _repo(tmp_path, with_archie=True)
+    orch.prepare_branches(_cfg(repo))
+    _git(["checkout", "archie-bench/with-archie"], repo)
+    assert (repo / "CLAUDE.md").exists()
+
+
+def test_no_archie_flags_deep_scan_needed(tmp_path):
+    """A repo without Archie files reports archie_present False and needs_deep_scan True."""
+    repo = _repo(tmp_path, with_archie=False)
+    status = orch.prepare_branches(_cfg(repo))
+    assert status["archie_present"] is False
+    assert status["needs_deep_scan"] is True
+
+
+def test_cli_run_invokes_benchmark(tmp_path, monkeypatch):
+    """``cli.main(["run", <config path>])`` calls ``cli.run_benchmark``."""
+    import json
+    from archie.benchmark import cli
+    repo = _repo(tmp_path, with_archie=True)
+    cfg_file = tmp_path / "c.json"
+    cfg_file.write_text(json.dumps({
+        "name": "d", "repo": str(repo), "task_prompt": "x", "model": "m",
+        "branches": {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"},
+    }))
+    called = {}
+
+    def fake_run_benchmark(cfg):
+        called.update(ran=True)
+        # Canned zero-sample result: every mean and percentage is None.
+        empty_arm = {"n": 0, "completed_n": 0, "attempted_n": 0,
+                     "cost_usd_mean": None, "tool_calls_mean": None,
+                     "duration_ms_mean": None, "quality_mean": None}
+        savings = {"cost_pct": None, "tool_calls_pct": None, "duration_pct": None}
+        return {"aggregate": {"treatment": dict(empty_arm),
+                              "control": dict(empty_arm),
+                              "savings": savings},
+                "store": {"mode": "offline"}}
+
+    monkeypatch.setattr(cli, "run_benchmark", fake_run_benchmark)
+    cli.main(["run", str(cfg_file)])
+    assert called["ran"] is True
diff --git a/tests/benchmark/test_runner.py b/tests/benchmark/test_runner.py
new file mode 100644
index 00000000..f48f339c
--- /dev/null
+++ b/tests/benchmark/test_runner.py
@@ -0,0 +1,75 @@
+# tests/benchmark/test_runner.py
+"""Tests for the headless ``claude -p`` wrapper in ``archie.benchmark.runner``."""
+import json
+import subprocess
+import pytest
+from archie.benchmark import runner
+
+
+def _stream():
+    """Return a two-event stream-json transcript: one Edit tool call, then a success result."""
+    assistant_event = {"type": "assistant",
+                       "message": {"content": [{"type": "tool_use", "name": "Edit"}]}}
+    result_event = {"type": "result", "subtype": "success", "total_cost_usd": 0.5,
+                    "duration_ms": 1000, "num_turns": 2,
+                    "usage": {"input_tokens": 10, "output_tokens": 20}}
+    return "\n".join(json.dumps(event) for event in (assistant_event, result_event))
+
+
+def test_run_claude_parses_metrics(monkeypatch):
+    """run_claude parses the subprocess output and passes the expected CLI flags and cwd."""
+    captured = {}
+
+    def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None):
+        captured["cmd"] = cmd
+        captured["cwd"] = cwd
+        return subprocess.CompletedProcess(cmd, 0, stdout=_stream(), stderr="")
+
+    monkeypatch.setattr(runner.subprocess, "run", fake_run)
+    metrics, _raw = runner.run_claude("do it", "claude-sonnet-4-6", "/tmp/wt", 60)
+
+    assert metrics.tool_calls == 1
+    assert metrics.cost_usd == 0.5
+    assert metrics.completed is True
+    assert captured["cwd"] == "/tmp/wt"
+    # identical, fair harness flags must always be present
+    cmd = captured["cmd"]
+    assert cmd[:2] == ["claude", "-p"]
+    assert "--permission-mode" in cmd
+    assert "acceptEdits" in cmd
+    assert "stream-json" in cmd
+    assert "--model" in cmd and "claude-sonnet-4-6" in cmd
+
+
+def test_prompt_wraps_task_with_autonomy_framing(monkeypatch):
+    """The prompt sent to the CLI keeps the task text and adds autonomy directives."""
+    # Headless `claude -p` has no human to approve/answer. Without an explicit
+    # autonomy directive the agent obeys global "describe approach and wait for
+    # approval / ask clarifying questions" rules and stops without editing —
+    # producing an empty diff on both arms. The wrapper must override that.
+    captured = {}
+
+    def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None):
+        captured["cmd"] = cmd
+        return subprocess.CompletedProcess(cmd, 0, stdout=_stream(), stderr="")
+
+    monkeypatch.setattr(runner.subprocess, "run", fake_run)
+    runner.run_claude("Add a sleep timer feature", "m", "/tmp/wt", 60)
+
+    sent = captured["cmd"][2]
+    assert "Add a sleep timer feature" in sent  # original task preserved verbatim
+    low = sent.lower()
+    assert "autonomous" in low                  # framed as autonomous
+    assert "do not ask" in low                  # overrides clarifying-questions rule
+    assert "do not stop" in low                 # overrides wait-for-approval rule
+
+
+def test_timeout_marks_incomplete(monkeypatch):
+    """A subprocess timeout yields completed=False while partial stdout is still parsed."""
+    def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None):
+        raise subprocess.TimeoutExpired(cmd, timeout, output=_stream())
+
+    monkeypatch.setattr(runner.subprocess, "run", fake_run)
+    metrics, _raw = runner.run_claude("do it", "m", "/tmp/wt", 1)
+    assert metrics.completed is False
+    assert metrics.tool_calls == 1  # partial stdout still parsed
diff --git a/tests/benchmark/test_schema.py b/tests/benchmark/test_schema.py
new file mode 100644
index 00000000..c109f105
--- /dev/null
+++ b/tests/benchmark/test_schema.py
@@ -0,0 +1,22 @@
+# tests/benchmark/test_schema.py
+"""Smoke test: ``archie/benchmark/schema.sql`` names the tables, view and columns the harness uses."""
+from pathlib import Path
+
+# archie/benchmark/schema.sql, resolved three directories up from this test file.
+SQL = Path(__file__).parent.parent.parent / "archie" / "benchmark" / "schema.sql"
+
+
+def test_schema_defines_both_tables_and_view():
+    """The schema text mentions both tables, the summary view and the key columns."""
+    # Decode explicitly: the platform default encoding is not UTF-8 everywhere.
+    text = SQL.read_text(encoding="utf-8")
+    assert "create table" in text.lower()
+    assert "benchmark_runs" in text
+    assert "benchmark_samples" in text
+    assert "benchmark_summary" in text
+    # key sample columns referenced by store.py / aggregate.py exist
+    for col in ["tool_calls", "tool_breakdown", "cost_usd", "quality_score",
+                "cache_read_tokens", "judge_seed", "completed", "attempted", "arm"]:
+        assert col in text
+    # prep cost lives on the run, separate from measured samples
+    assert "prep_cost_usd" in text
diff --git a/tests/benchmark/test_store.py b/tests/benchmark/test_store.py
new file mode 100644
index 00000000..e266fd49
--- /dev/null
+++ b/tests/benchmark/test_store.py
@@ -0,0 +1,39 @@
+# tests/benchmark/test_store.py
+"""Tests for ``archie.benchmark.store.store_results`` (offline fallback and online write)."""
+import json
+from archie.benchmark import store
+
+
+def test_offline_fallback_when_env_missing(tmp_path, monkeypatch):
+    """Without the Supabase env vars the results are written as JSON to the offline path."""
+    for var in ("SUPABASE_URL", "SUPABASE_SERVICE_KEY"):
+        monkeypatch.delenv(var, raising=False)
+    out = tmp_path / "nested" / "results.json"  # parent directory does not exist yet
+    res = store.store_results({"name": "x"}, [{"arm": "treatment"}], out)
+    assert res["mode"] == "offline"
+    saved = json.loads(out.read_text())
+    assert saved["run"]["name"] == "x"
+    assert saved["samples"][0]["arm"] == "treatment"
+
+
+def test_online_write_posts_run_then_samples(tmp_path, monkeypatch):
+    """With the env vars set, the run row is posted first and its id is stamped on the samples."""
+    monkeypatch.setenv("SUPABASE_URL", "https://x.supabase.co")
+    monkeypatch.setenv("SUPABASE_SERVICE_KEY", "secret")
+    posted = []
+
+    def fake_poster(url, key, table, rows):
+        posted.append((table, rows))
+        if table == "benchmark_runs":
+            return [{"id": "run-123"}]
+        return rows
+
+    sample_rows = [{"arm": "treatment"}, {"arm": "control"}]
+    res = store.store_results({"name": "x"}, sample_rows,
+                              tmp_path / "r.json", _poster=fake_poster)
+    assert res["mode"] == "online"
+    assert res["run_id"] == "run-123"
+    assert posted[0][0] == "benchmark_runs"
+    assert posted[1][0] == "benchmark_samples"
+    # run_id stamped onto every sample row
+    assert all(r["run_id"] == "run-123" for r in posted[1][1])