BitRaptors · gbrbks · Jun 10, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/.gitignore b/.gitignore
@@ -73,4 +73,8 @@ htmlcov/
 
 # Ad-hoc plan/spec docs (kept out of repo to avoid accidental commits)
 docs/plans/
+docs/specs/
 docs/superpowers/
+
+# Internal benchmark scratch configs/logs/results
+.archie-bench/
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -52,6 +52,18 @@ npx @bitraptors/archie /path/to/project
 python -m pytest tests/ -v
 ```
 
+### Benchmark Harness (internal)
+```bash
+# Measure Archie effectiveness: same task, control (no Archie) vs treatment (full Archie)
+python3 -m archie.benchmark auto /path/to/repo --prompt "..."   # prep + run from a plain repo
+python3 -m archie.benchmark run config.json                     # run on existing branches
+```
+Internal-only (not shipped via npm). Captures tool calls / tokens / cost / time +
+blind judge-Claude quality, writes to Supabase (`benchmark_runs`, `benchmark_samples`).
+Before benchmarking, copy `archie/benchmark/secrets.env.example` → `.archie-bench/secrets.env`
+and fill in the Supabase URL + service_role key (else runs fall back to offline mode).
+See `archie/benchmark/README.md`.
+
 ## Command Architecture
 
 - **`/archie-deep-scan`** — Comprehensive baseline (15-20 min). Full 2-wave AI analysis producing blueprint, per-folder CLAUDE.md, rules, and health metrics. Rerun to refresh the baseline; each run builds on prior findings.

diff --git a/archie/benchmark/README.md b/archie/benchmark/README.md
@@ -0,0 +1,74 @@
+# Archie Benchmark Harness (internal)
+
+Measures Archie's effectiveness: runs the **same** task headlessly on a control
+branch (no Archie) and a treatment branch (full Archie docs + hooks), capturing
+tool calls / tokens / cost / time + a blind judge-Claude quality score, and writes
+results to Supabase. **Not** shipped via npm.
+
+## Setup — do this before your first benchmark
+
+Provide Supabase credentials so results are stored (see [Supabase](#supabase) for
+detail). In short:
+
+```bash
+mkdir -p .archie-bench   # gitignored scratch dir; does not exist in a fresh clone
+cp archie/benchmark/secrets.env.example .archie-bench/secrets.env
+# edit .archie-bench/secrets.env: real SUPABASE_URL + service_role key
+set -a; source .archie-bench/secrets.env; set +a
+```
+
+This must be filled in **before** you benchmark if you want results in Supabase. If
+you skip it, runs still work but fall back to **offline mode** (a local
+`results.json`), and nothing is written to the database.
+
+## Usage
+
+```bash
+# Quick start from a plain repo — no config file needed (prep branches, then run):
+python3 -m archie.benchmark auto /path/to/repo --prompt "Add a sleep timer feature"
+
+# Or author a JSON config first (see the example below; zero-dep) and pass it in:
+python3 -m archie.benchmark run config.json     # branches must already exist
+python3 -m archie.benchmark prep config.json    # only create/refresh branches
+```
+
+If the repo has no Archie files yet, `auto`/`prep` create the branches, then pause
+so you can run `/archie-deep-scan` interactively on the treatment branch. That
+deep-scan is **never** counted in the measured metrics.
+
+## Config
+
+```json
+{
+  "name": "bedtime-add-sleep-timer",
+  "repo": "/Users/you/DEV/BedtimeApp",
+  "task_prompt": "Add a sleep timer feature ...",
+  "model": "claude-sonnet-4-6",
+  "repetitions": 3,
+  "branches": {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"},
+  "judge": {"model": "claude-opus-4-8", "rubric": ["correctness", "completeness", "follows_conventions", "no_regressions"]},
+  "timeout_seconds": 3600
+}
+```
+
+## Supabase
+
+Copy the credentials template and fill it in (the copy lives in gitignored
+`.archie-bench/`, so real keys are never committed):
+
+```bash
+mkdir -p .archie-bench   # gitignored scratch dir; does not exist in a fresh clone
+cp archie/benchmark/secrets.env.example .archie-bench/secrets.env
+# edit .archie-bench/secrets.env: real SUPABASE_URL + service_role key
+set -a; source .archie-bench/secrets.env; set +a
+```
+
+`store.py` reads `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` from the environment.
+Without them the harness writes `.archie/benchmark/<name>/results.json` locally
+(offline mode). Use the **service_role** key (not anon) so inserts bypass RLS, and
+apply `archie/benchmark/schema.sql` in the Supabase SQL Editor once.
+
+## Fairness invariants
+
+- Identical `task_prompt`, `model`, and harness flags on both arms.
+- Both branches descend from the same base commit (enforced).
+- Deep-scan prep cost is separate (`prep_cost_usd`), never in sample metrics.
diff --git a/archie/benchmark/__init__.py b/archie/benchmark/__init__.py
@@ -0,0 +1 @@
+"""Internal Archie effectiveness benchmark harness (not shipped via npm)."""
diff --git a/archie/benchmark/__main__.py b/archie/benchmark/__main__.py
@@ -0,0 +1,4 @@
+from .cli import main
+
+if __name__ == "__main__":
+    main()
diff --git a/archie/benchmark/aggregate.py b/archie/benchmark/aggregate.py
@@ -0,0 +1,47 @@
+# Per-sample numeric metrics; _arm_stats averages each one into "<field>_mean".
+NUMERIC_FIELDS = ["cost_usd", "tool_calls", "duration_ms", "input_tokens", "output_tokens"]
+
+
+def _mean(values):
+    """Return the arithmetic mean of `values`, or None when it is empty."""
+    if not values:
+        return None
+    return sum(values) / len(values)
+
+
+def _arm_stats(samples):
+    """Summarize one arm: sample counts, per-metric means, and mean quality.
+
+    Returns a dict with "n", "completed_n", "attempted_n", one "<field>_mean"
+    per entry of NUMERIC_FIELDS, and "quality_mean". A mean is None when no
+    sample contributes a value to it.
+    """
+    # A sample "attempted" the task if it produced a non-empty diff. Legacy
+    # samples that lack the flag are counted as attempted (back-compat).
+    attempted = [s for s in samples if s.get("attempted", True)]
+    stats = {
+        "n": len(samples),
+        "completed_n": len([s for s in samples if s.get("completed")]),
+        "attempted_n": len(attempted),
+    }
+    for name in NUMERIC_FIELDS:
+        present = [s[name] for s in samples if s.get(name) is not None]
+        stats[name + "_mean"] = _mean(present)
+    # Quality is averaged over attempts only: an empty-diff run that the judge
+    # scored low is "not attempted", not "poor quality".
+    scores = [s["quality_score"] for s in attempted
+              if s.get("quality_score") is not None]
+    stats["quality_mean"] = _mean(scores)
+    return stats
+
+
+def _pct_lower(treatment, control):
+    """Return how many percent lower `treatment` is than `control`.
+
+    Positive means treatment is cheaper. The result is rounded to one decimal.
+    Returns None when either value is None or `control` is zero.
+    """
+    if treatment is None or control is None:
+        return None
+    if control == 0:
+        return None
+    saved_fraction = (control - treatment) / control
+    return round(saved_fraction * 100, 1)
+
+
+def aggregate_samples(samples):
+    """Split samples by their "arm" key and compare treatment against control.
+
+    Returns a dict with per-arm stats under "treatment" and "control", plus
+    "savings": the percent reduction of treatment relative to control for
+    cost, tool calls and duration. Samples with any other "arm" are ignored.
+    """
+    per_arm = {
+        arm: _arm_stats([s for s in samples if s.get("arm") == arm])
+        for arm in ("treatment", "control")
+    }
+    t_stats = per_arm["treatment"]
+    c_stats = per_arm["control"]
+    savings = {
+        "cost_pct": _pct_lower(t_stats["cost_usd_mean"], c_stats["cost_usd_mean"]),
+        "tool_calls_pct": _pct_lower(t_stats["tool_calls_mean"], c_stats["tool_calls_mean"]),
+        "duration_pct": _pct_lower(t_stats["duration_ms_mean"], c_stats["duration_ms_mean"]),
+    }
+    return {"treatment": t_stats, "control": c_stats, "savings": savings}
diff --git a/archie/benchmark/cli.py b/archie/benchmark/cli.py
@@ -0,0 +1,102 @@
+# archie/benchmark/cli.py
+import argparse
+import sys
+from pathlib import Path
+
+from .config import load_config, parse_config
+from .orchestrator import run_benchmark, prepare_branches
+
+
+def _print_summary(result):
+    """Print per-arm means, treatment-vs-control savings, and the store status.
+
+    Reads result["aggregate"] (as produced by aggregate_samples) and
+    result["store"].
+    """
+    agg = result["aggregate"]
+    print("\n=== Benchmark summary ===")
+    for arm in ("treatment", "control"):
+        stats = agg[arm]
+        counts = (f"n={stats['n']} attempted={stats['attempted_n']} "
+                  f"completed={stats['completed_n']}")
+        means = (f"cost=${_fmt(stats['cost_usd_mean'])} tools={_fmt(stats['tool_calls_mean'])} "
+                 f"dur={_fmt(stats['duration_ms_mean'])}ms quality={_fmt(stats['quality_mean'])}")
+        print(f"[{arm}] {counts} {means}")
+    savings = agg["savings"]
+    cost = _fmt(savings["cost_pct"])
+    tools = _fmt(savings["tool_calls_pct"])
+    time_saved = _fmt(savings["duration_pct"])
+    print(f"[savings] cost={cost}%  tools={tools}%  time={time_saved}%")
+    print(f"[store] {result['store']}")
+
+
+def _fmt(v):
+    """Render a metric for display: "n/a" for None, two decimals for floats, str() otherwise."""
+    if v is None:
+        return "n/a"
+    if isinstance(v, float):
+        return f"{v:.2f}"
+    return str(v)
+
+
+def _cmd_run(args):
+    """Handle `run`: load the config at args.config, run the benchmark, print the summary."""
+    _print_summary(run_benchmark(load_config(args.config)))
+
+
+def _cmd_prep(args):
+    """Handle `prep`: prepare the benchmark branches without running anything.
+
+    When prepare_branches reports that a deep scan is needed, the user is
+    walked through it interactively before the branches are declared ready.
+    """
+    cfg = load_config(args.config)
+    if prepare_branches(cfg)["needs_deep_scan"]:
+        _interactive_deep_scan(cfg)
+    print(f"Branches ready: {cfg.branches}")
+
+
+def _cmd_auto(args):
+    """Handle `auto`: build a config, prepare branches, run, and print the summary.
+
+    With --config the file is loaded as-is. Otherwise a config is synthesized
+    from the positional repo path, --prompt and --model, and named after the
+    repo directory. Exits with status 1 when neither --config nor both a repo
+    path and --prompt are given.
+    """
+    if args.config:
+        cfg = load_config(args.config)
+    else:
+        # argparse leaves repo and --prompt optional (they are unused with
+        # --config), so check them here instead of crashing in Path(None).
+        if not args.repo or not args.prompt:
+            print("ERROR: auto needs either --config, or a repo path and --prompt.",
+                  file=sys.stderr)
+            sys.exit(1)
+        # Resolve before taking the name: Path(".").name is "" (rejected as a
+        # missing name) and Path("..").name is "..", not the directory's name.
+        repo_name = Path(args.repo).expanduser().resolve().name
+        cfg = parse_config({"name": repo_name, "repo": args.repo,
+                            "task_prompt": args.prompt, "model": args.model})
+    status = prepare_branches(cfg)
+    if status["needs_deep_scan"]:
+        _interactive_deep_scan(cfg)
+    result = run_benchmark(cfg)
+    _print_summary(result)
+
+
+def _interactive_deep_scan(cfg):
+    """Walk the user through the manual deep-scan prep, then verify it happened.
+
+    Prints the steps, blocks on Enter, then checks out the treatment branch in
+    cfg.repo to look for Archie files and switches back afterwards. Exits with
+    status 1 when no Archie files are found on the treatment branch.
+    """
+    treatment = cfg.branches["treatment"]
+    print("\n" + "=" * 70)
+    print("Archie not found in this repo. Semi-automatic prep:")
+    print(f"  1. In a terminal: git checkout {treatment}")
+    print(f"  2. Install Archie:  npx @bitraptors/archie {cfg.repo}")
+    print("  3. In Claude Code on that branch, run:  /archie-deep-scan")
+    print("  4. Commit the generated files.")
+    print("This deep-scan is NOT counted in the benchmark metrics.")
+    print("=" * 70)
+    input("Press Enter once the treatment branch has committed Archie files... ")
+    # verify
+    from .orchestrator import _git_out, _archie_present  # local import to avoid cycle noise
+    # NOTE(review): on a detached HEAD this yields the literal "HEAD", so the
+    # restoring checkout below would not leave the treatment branch — confirm
+    # whether that case needs handling.
+    current = _git_out(["rev-parse", "--abbrev-ref", "HEAD"], cfg.repo)
+    _git_out(["checkout", treatment], cfg.repo)
+    try:
+        present = _archie_present(cfg.repo)
+    finally:
+        # Always switch back, even if the presence check raises, so the user's
+        # repo is never left sitting on the treatment branch.
+        _git_out(["checkout", current], cfg.repo)
+    if not present:
+        print("ERROR: no Archie files found on the treatment branch. Aborting.", file=sys.stderr)
+        sys.exit(1)
+
+
+def main(argv=None):
+    """Parse the command line and dispatch to the chosen subcommand handler.
+
+    Args:
+        argv: Argument list to parse; None lets argparse read sys.argv[1:].
+    """
+    parser = argparse.ArgumentParser(
+        prog="archie-benchmark",
+        description="Measure Archie effectiveness (control vs treatment).")
+    subcommands = parser.add_subparsers(dest="command", required=True)
+
+    # `run` and `prep` share one shape: a single positional config path.
+    config_commands = (
+        ("run", "run benchmark on existing branches", _cmd_run),
+        ("prep", "create/refresh benchmark branches only", _cmd_prep),
+    )
+    for name, help_text, handler in config_commands:
+        sub_parser = subcommands.add_parser(name, help=help_text)
+        sub_parser.add_argument("config", help="path to benchmark config JSON")
+        sub_parser.set_defaults(func=handler)
+
+    auto_parser = subcommands.add_parser("auto", help="prep branches then run, from a plain repo")
+    auto_parser.add_argument("repo", nargs="?", help="repo path (when no --config)")
+    auto_parser.add_argument("--config", help="path to benchmark config JSON")
+    auto_parser.add_argument("--prompt", help="task prompt (when no --config)")
+    auto_parser.add_argument("--model", default="claude-sonnet-4-6")
+    auto_parser.set_defaults(func=_cmd_auto)
+
+    parsed = parser.parse_args(argv)
+    parsed.func(parsed)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/archie/benchmark/config.py b/archie/benchmark/config.py
@@ -0,0 +1,64 @@
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+
+# Defaults applied by parse_config / the dataclasses below when a config omits
+# the corresponding key.
+DEFAULT_JUDGE_MODEL = "claude-opus-4-8"
+DEFAULT_RUBRIC = ["correctness", "completeness", "follows_conventions", "no_regressions"]
+DEFAULT_BRANCHES = {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"}
+DEFAULT_TIMEOUT = 3600
+DEFAULT_REPETITIONS = 3
+# Keys parse_config rejects when absent, None, or the empty string.
+REQUIRED = ("name", "repo", "task_prompt", "model")
+
+
+@dataclass
+class JudgeConfig:
+    """Settings for the judge: which model to use and the rubric criteria."""
+
+    # Judge model name; defaults to DEFAULT_JUDGE_MODEL.
+    model: str = DEFAULT_JUDGE_MODEL
+    # Rubric criterion names; each instance gets its own copy of DEFAULT_RUBRIC.
+    rubric: list = field(default_factory=lambda: list(DEFAULT_RUBRIC))
+
+
+@dataclass
+class BenchmarkConfig:
+    """Validated benchmark configuration, as built by parse_config."""
+
+    # Required fields (see REQUIRED).
+    name: str
+    repo: Path
+    task_prompt: str
+    model: str
+    # Maps the arm names "treatment" and "control" to git branch names.
+    branches: dict = field(default_factory=lambda: dict(DEFAULT_BRANCHES))
+    # parse_config rejects values below 1.
+    repetitions: int = DEFAULT_REPETITIONS
+    judge: JudgeConfig = field(default_factory=JudgeConfig)
+    # Seconds; how the timeout is enforced is outside this module.
+    timeout_seconds: int = DEFAULT_TIMEOUT
+
+
+def parse_config(data):
+    """Validate a raw config mapping and build a BenchmarkConfig.
+
+    Args:
+        data: Mapping holding the REQUIRED keys plus the optional "branches",
+            "repetitions", "judge" and "timeout_seconds".
+
+    Returns:
+        A BenchmarkConfig. Optional keys that are omitted or explicitly null
+        take the module defaults.
+
+    Raises:
+        ValueError: if a required field is missing or empty, "branches" is not
+            a mapping naming both arms, or repetitions is below 1.
+    """
+    missing = [k for k in REQUIRED if k not in data or data[k] in (None, "")]
+    if missing:
+        raise ValueError(f"config missing required fields: {', '.join(missing)}")
+
+    # `or` rather than a .get() default: a JSON null must fall back to the
+    # defaults too, the same way "judge" is handled below.
+    branches = data.get("branches") or dict(DEFAULT_BRANCHES)
+    if not isinstance(branches, dict):
+        raise ValueError("config.branches must map 'treatment' and 'control' to branch names")
+    for arm in ("treatment", "control"):
+        if arm not in branches or not branches[arm]:
+            raise ValueError(f"config.branches missing '{arm}'")
+
+    # `is None` rather than `or`, so an explicit 0 still reaches the >= 1 check.
+    raw_reps = data.get("repetitions")
+    reps = DEFAULT_REPETITIONS if raw_reps is None else int(raw_reps)
+    if reps < 1:
+        raise ValueError("repetitions must be >= 1")
+
+    raw_timeout = data.get("timeout_seconds")
+    timeout = DEFAULT_TIMEOUT if raw_timeout is None else int(raw_timeout)
+
+    jd = data.get("judge", {}) or {}
+    judge = JudgeConfig(
+        model=jd.get("model") or DEFAULT_JUDGE_MODEL,
+        rubric=jd.get("rubric") or list(DEFAULT_RUBRIC),
+    )
+
+    return BenchmarkConfig(
+        name=data["name"],
+        repo=Path(data["repo"]).expanduser(),
+        task_prompt=data["task_prompt"],
+        model=data["model"],
+        # Keep only the two known arms; extra keys in the input are dropped.
+        branches={"treatment": branches["treatment"], "control": branches["control"]},
+        repetitions=reps,
+        judge=judge,
+        timeout_seconds=timeout,
+    )
+
+
+def load_config(path):
+    """Read the JSON config file at `path` and return the parsed BenchmarkConfig.
+
+    Raises whatever reading the file, json.loads or parse_config raise.
+    """
+    # Read as UTF-8 explicitly: read_text() otherwise uses the locale encoding,
+    # which mis-decodes non-ASCII prompts on non-UTF-8 systems.
+    return parse_config(json.loads(Path(path).read_text(encoding="utf-8")))
diff --git a/archie/benchmark/diff.py b/archie/benchmark/diff.py
@@ -0,0 +1,29 @@
+import subprocess
+
+# Build/cache artifacts that are never meaningful for code review. These are
+# excluded from the captured diff so the judge scores only real source changes,
+# even when the target repo lacks a .gitignore for them (the agent may create
+# them as a side effect of running tests). Patterns use git pathspec wildcards
+# (`*` matches across `/`), so `*X*` catches X at any depth.
+_NOISE_GLOBS = [
+    "*__pycache__*",
+    "*.pyc",
+    "*.pyo",
+    "*.DS_Store",
+    "*node_modules*",
+    "*.pytest_cache*",
+    "*.mypy_cache*",
+    "*.ruff_cache*",
+]
+
+
+def capture_diff(worktree_path):
+    """Stage everything (so untracked files show) and return the cached diff text,
+    excluding universal build/cache artifacts (see _NOISE_GLOBS).
+
+    Note that this mutates the worktree's index: `git add -A` stages all
+    changes, including files the diff then filters out.
+
+    Raises:
+        subprocess.CalledProcessError: if either git command exits non-zero.
+    """
+    subprocess.run(["git", "add", "-A"], cwd=str(worktree_path),
+                   check=True, capture_output=True, text=True)
+    excludes = [f":(exclude){glob}" for glob in _NOISE_GLOBS]
+    # Decode as UTF-8 with replacement: the diff embeds raw file bytes, and a
+    # strict locale decode would raise UnicodeDecodeError on a changed file
+    # that is not valid text in that encoding.
+    result = subprocess.run(
+        ["git", "diff", "--cached", "--", ".", *excludes],
+        cwd=str(worktree_path), check=True, capture_output=True,
+        encoding="utf-8", errors="replace")
+    return result.stdout
diff --git a/archie/benchmark/isolation.py b/archie/benchmark/isolation.py
@@ -0,0 +1,22 @@
+# archie/benchmark/isolation.py
+import subprocess
+from contextlib import contextmanager
+from pathlib import Path
+
+
+@contextmanager
+def worktree(repo_path, branch, dest):
+    """Check out `branch` as a git worktree at `dest` for the life of the context.
+
+    Yields `dest` as a Path. Creation failure raises CalledProcessError. On
+    exit a forced removal is attempted; a failing removal is not raised.
+    """
+    dest = Path(dest)
+    repo_dir = str(repo_path)
+    add_cmd = ["git", "worktree", "add", "--force", str(dest), branch]
+    remove_cmd = ["git", "worktree", "remove", "--force", str(dest)]
+    subprocess.run(add_cmd, cwd=repo_dir, check=True, capture_output=True, text=True)
+    try:
+        yield dest
+    finally:
+        # No check=True here: cleanup is best-effort.
+        subprocess.run(remove_cmd, cwd=repo_dir, capture_output=True, text=True)
+
+
+def prune(repo_path):
+    """Run `git worktree prune` in `repo_path`; the command's exit status is ignored."""
+    prune_cmd = ["git", "worktree", "prune"]
+    subprocess.run(prune_cmd, cwd=str(repo_path), capture_output=True, text=True)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Internal Archie effectiveness benchmark harness (not shipped via npm)."""