diff --git a/.gitignore b/.gitignore index be008244..8279b2bd 100644 --- a/.gitignore +++ b/.gitignore @@ -73,4 +73,8 @@ htmlcov/ # Ad-hoc plan/spec docs (kept out of repo to avoid accidental commits) docs/plans/ +docs/specs/ docs/superpowers/ + +# Internal benchmark scratch configs/logs/results +.archie-bench/ diff --git a/CLAUDE.md b/CLAUDE.md index 7a9bc7c6..2cda86dc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -52,6 +52,18 @@ npx @bitraptors/archie /path/to/project python -m pytest tests/ -v ``` +### Benchmark Harness (internal) +```bash +# Measure Archie effectiveness: same task, control (no Archie) vs treatment (full Archie) +python3 -m archie.benchmark auto /path/to/repo --prompt "..." # prep + run from a plain repo +python3 -m archie.benchmark run config.json # run on existing branches +``` +Internal-only (not shipped via npm). Captures tool calls / tokens / cost / time + +blind judge-Claude quality, writes to Supabase (`benchmark_runs`, `benchmark_samples`). +Before benchmarking, copy `archie/benchmark/secrets.env.example` → `.archie-bench/secrets.env` +and fill in the Supabase URL + service_role key (else runs fall back to offline mode). +See `archie/benchmark/README.md`. + ## Command Architecture - **`/archie-deep-scan`** — Comprehensive baseline (15-20 min). Full 2-wave AI analysis producing blueprint, per-folder CLAUDE.md, rules, and health metrics. Rerun to refresh the baseline; each run builds on prior findings. diff --git a/archie/benchmark/README.md b/archie/benchmark/README.md new file mode 100644 index 00000000..0ec7f187 --- /dev/null +++ b/archie/benchmark/README.md @@ -0,0 +1,74 @@ +# Archie Benchmark Harness (internal) + +Measures Archie's effectiveness: runs the **same** task headlessly on a control +branch (no Archie) and a treatment branch (full Archie docs + hooks), capturing +tool calls / tokens / cost / time + a blind judge-Claude quality score, and writes +results to Supabase. **Not** shipped via npm. 
+ +## Setup — do this before your first benchmark + +Provide Supabase credentials so results are stored (see [Supabase](#supabase) for +detail). In short: + +```bash +cp archie/benchmark/secrets.env.example .archie-bench/secrets.env +# edit .archie-bench/secrets.env: real SUPABASE_URL + service_role key +set -a; source .archie-bench/secrets.env; set +a +``` + +This must be filled in **before** you benchmark if you want results in Supabase. If +you skip it, runs still work but fall back to **offline mode** (a local +`results.json`), and nothing is written to the database. + +## Usage + +```bash +# 1. Author a config (see example below) — JSON, zero-dep. +# 2. From a plain repo, prep branches then run: +python3 -m archie.benchmark auto /path/to/repo --prompt "Add a sleep timer feature" + +# Or with a config file: +python3 -m archie.benchmark run config.json # branches must already exist +python3 -m archie.benchmark prep config.json # only create/refresh branches +``` + +If the repo has no Archie files yet, `auto`/`prep` create the branches, then pause +so you can run `/archie-deep-scan` interactively on the treatment branch. That +deep-scan is **never** counted in the measured metrics. 
+ +## Config + +```json +{ + "name": "bedtime-add-sleep-timer", + "repo": "/Users/you/DEV/BedtimeApp", + "task_prompt": "Add a sleep timer feature ...", + "model": "claude-sonnet-4-6", + "repetitions": 3, + "branches": {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"}, + "judge": {"model": "claude-opus-4-8", "rubric": ["correctness", "completeness", "follows_conventions", "no_regressions"]}, + "timeout_seconds": 3600 +} +``` + +## Supabase + +Copy the credentials template and fill it in (the copy lives in gitignored +`.archie-bench/`, so real keys are never committed): + +```bash +cp archie/benchmark/secrets.env.example .archie-bench/secrets.env +# edit .archie-bench/secrets.env: real SUPABASE_URL + service_role key +set -a; source .archie-bench/secrets.env; set +a +``` + +`store.py` reads `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` from the environment. +Without them the harness writes `.archie/benchmark/<name>/results.json` inside the +benchmarked repo (offline mode), where `<name>` is the config's `name`. Use the +**service_role** key (not anon) so inserts bypass RLS, and +apply `archie/benchmark/schema.sql` in the Supabase SQL Editor once. + +## Fairness invariants + +- Identical `task_prompt`, `model`, and harness flags on both arms. +- Both branches descend from the same base commit (enforced). +- Deep-scan prep cost is separate (`prep_cost_usd`), never in sample metrics. 
diff --git a/archie/benchmark/__init__.py b/archie/benchmark/__init__.py new file mode 100644 index 00000000..e5cca8b8 --- /dev/null +++ b/archie/benchmark/__init__.py @@ -0,0 +1 @@ +"""Internal Archie effectiveness benchmark harness (not shipped via npm).""" diff --git a/archie/benchmark/__main__.py b/archie/benchmark/__main__.py new file mode 100644 index 00000000..9ae637f1 --- /dev/null +++ b/archie/benchmark/__main__.py @@ -0,0 +1,4 @@ +from .cli import main + +if __name__ == "__main__": + main() diff --git a/archie/benchmark/aggregate.py b/archie/benchmark/aggregate.py new file mode 100644 index 00000000..11186647 --- /dev/null +++ b/archie/benchmark/aggregate.py @@ -0,0 +1,47 @@ +NUMERIC_FIELDS = ["cost_usd", "tool_calls", "duration_ms", "input_tokens", "output_tokens"] + + +def _mean(values): + return sum(values) / len(values) if values else None + + +def _arm_stats(samples): + # A sample "attempted" the task if it produced a non-empty diff. Legacy + # samples without the flag are treated as attempted (back-compat). + stats = { + "n": len(samples), + "completed_n": sum(1 for s in samples if s.get("completed")), + "attempted_n": sum(1 for s in samples if s.get("attempted", True)), + } + for f in NUMERIC_FIELDS: + vals = [s[f] for s in samples if s.get(f) is not None] + stats[f + "_mean"] = _mean(vals) + # Quality only counts attempts: an empty-diff run that the judge scored low + # is "not attempted", not "poor quality" — exclude it from the mean. 
+ qvals = [s["quality_score"] for s in samples + if s.get("attempted", True) and s.get("quality_score") is not None] + stats["quality_mean"] = _mean(qvals) + return stats + + +def _pct_lower(treatment, control): + """Percent reduction of treatment relative to control (positive = treatment cheaper).""" + if treatment is None or control is None or control == 0: + return None + return round((control - treatment) / control * 100, 1) + + +def aggregate_samples(samples): + treatment = [s for s in samples if s.get("arm") == "treatment"] + control = [s for s in samples if s.get("arm") == "control"] + t_stats = _arm_stats(treatment) + c_stats = _arm_stats(control) + return { + "treatment": t_stats, + "control": c_stats, + "savings": { + "cost_pct": _pct_lower(t_stats["cost_usd_mean"], c_stats["cost_usd_mean"]), + "tool_calls_pct": _pct_lower(t_stats["tool_calls_mean"], c_stats["tool_calls_mean"]), + "duration_pct": _pct_lower(t_stats["duration_ms_mean"], c_stats["duration_ms_mean"]), + }, + } diff --git a/archie/benchmark/cli.py b/archie/benchmark/cli.py new file mode 100644 index 00000000..f9d6be6f --- /dev/null +++ b/archie/benchmark/cli.py @@ -0,0 +1,102 @@ +# archie/benchmark/cli.py +import argparse +import sys +from pathlib import Path + +from .config import load_config, parse_config +from .orchestrator import run_benchmark, prepare_branches + + +def _print_summary(result): + agg = result["aggregate"] + print("\n=== Benchmark summary ===") + for arm in ("treatment", "control"): + a = agg[arm] + print(f"[{arm}] n={a['n']} attempted={a['attempted_n']} completed={a['completed_n']} " + f"cost=${_fmt(a['cost_usd_mean'])} tools={_fmt(a['tool_calls_mean'])} " + f"dur={_fmt(a['duration_ms_mean'])}ms quality={_fmt(a['quality_mean'])}") + s = agg["savings"] + print(f"[savings] cost={_fmt(s['cost_pct'])}% tools={_fmt(s['tool_calls_pct'])}% " + f"time={_fmt(s['duration_pct'])}%") + print(f"[store] {result['store']}") + + +def _fmt(v): + return "n/a" if v is None else (f"{v:.2f}" 
if isinstance(v, float) else str(v)) + + +def _cmd_run(args): + cfg = load_config(args.config) + result = run_benchmark(cfg) + _print_summary(result) + + +def _cmd_prep(args): + cfg = load_config(args.config) + status = prepare_branches(cfg) + if status["needs_deep_scan"]: + _interactive_deep_scan(cfg) + print(f"Branches ready: {cfg.branches}") + + +def _cmd_auto(args): + if args.config: + cfg = load_config(args.config) + else: + cfg = parse_config({"name": Path(args.repo).name, "repo": args.repo, + "task_prompt": args.prompt, "model": args.model}) + status = prepare_branches(cfg) + if status["needs_deep_scan"]: + _interactive_deep_scan(cfg) + result = run_benchmark(cfg) + _print_summary(result) + + +def _interactive_deep_scan(cfg): + treatment = cfg.branches["treatment"] + print("\n" + "=" * 70) + print("Archie not found in this repo. Semi-automatic prep:") + print(f" 1. In a terminal: git checkout {treatment}") + print(f" 2. Install Archie: npx @bitraptors/archie {cfg.repo}") + print(" 3. In Claude Code on that branch, run: /archie-deep-scan") + print(" 4. Commit the generated files.") + print("This deep-scan is NOT counted in the benchmark metrics.") + print("=" * 70) + input("Press Enter once the treatment branch has committed Archie files... ") + # verify + from .orchestrator import _git_out, _archie_present # local import to avoid cycle noise + current = _git_out(["rev-parse", "--abbrev-ref", "HEAD"], cfg.repo) + _git_out(["checkout", treatment], cfg.repo) + present = _archie_present(cfg.repo) + _git_out(["checkout", current], cfg.repo) + if not present: + print("ERROR: no Archie files found on the treatment branch. 
DEFAULT_JUDGE_MODEL = "claude-opus-4-8"
DEFAULT_RUBRIC = ["correctness", "completeness", "follows_conventions", "no_regressions"]
DEFAULT_BRANCHES = {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"}
DEFAULT_TIMEOUT = 3600
DEFAULT_REPETITIONS = 3
REQUIRED = ("name", "repo", "task_prompt", "model")


@dataclass
class JudgeConfig:
    """Settings for the judge: which model scores the diffs and on which axes."""

    model: str = DEFAULT_JUDGE_MODEL
    rubric: list = field(default_factory=lambda: list(DEFAULT_RUBRIC))


@dataclass
class BenchmarkConfig:
    """A fully validated benchmark configuration (see `parse_config`)."""

    name: str
    repo: Path
    task_prompt: str
    model: str
    branches: dict = field(default_factory=lambda: dict(DEFAULT_BRANCHES))
    repetitions: int = DEFAULT_REPETITIONS
    judge: JudgeConfig = field(default_factory=JudgeConfig)
    timeout_seconds: int = DEFAULT_TIMEOUT


def _int_or_default(data, key, default):
    """Read an integer option, treating an absent key or explicit null as `default`."""
    value = data.get(key)
    return default if value is None else int(value)


def parse_config(data):
    """Validate a raw config mapping and return a `BenchmarkConfig`.

    Optional keys (`branches`, `repetitions`, `judge`, `timeout_seconds`) fall
    back to the module defaults when absent or null.

    Raises:
        ValueError: a required field is missing or empty, `branches` lacks an
            arm or is not an object, `judge` is not an object, or
            `repetitions` / `timeout_seconds` is below 1.
    """
    missing = [k for k in REQUIRED if k not in data or data[k] in (None, "")]
    if missing:
        raise ValueError(f"config missing required fields: {', '.join(missing)}")

    # `or` rather than a .get() default: JSON `"branches": null` must also
    # fall back to the defaults instead of failing on `arm not in None`.
    branches = data.get("branches") or dict(DEFAULT_BRANCHES)
    if not isinstance(branches, dict):
        raise ValueError("config.branches must be an object with 'treatment' and 'control'")
    for arm in ("treatment", "control"):
        if arm not in branches or not branches[arm]:
            raise ValueError(f"config.branches missing '{arm}'")

    reps = _int_or_default(data, "repetitions", DEFAULT_REPETITIONS)
    if reps < 1:
        raise ValueError("repetitions must be >= 1")

    # A zero or negative timeout would make every run time out immediately.
    timeout = _int_or_default(data, "timeout_seconds", DEFAULT_TIMEOUT)
    if timeout < 1:
        raise ValueError("timeout_seconds must be >= 1")

    jd = data.get("judge") or {}
    if not isinstance(jd, dict):
        raise ValueError("config.judge must be an object")
    judge = JudgeConfig(
        model=jd.get("model") or DEFAULT_JUDGE_MODEL,
        rubric=jd.get("rubric") or list(DEFAULT_RUBRIC),
    )

    return BenchmarkConfig(
        name=data["name"],
        repo=Path(data["repo"]).expanduser(),
        task_prompt=data["task_prompt"],
        model=data["model"],
        branches={"treatment": branches["treatment"], "control": branches["control"]},
        repetitions=reps,
        judge=judge,
        timeout_seconds=timeout,
    )


def load_config(path):
    """Read the JSON file at `path` and return the parsed `BenchmarkConfig`."""
    return parse_config(json.loads(Path(path).read_text()))
# Git pathspec patterns for build/cache artifacts that should never reach the
# judge. `*` matches across `/` in a pathspec, so `*X*` catches X at any depth.
_NOISE_GLOBS = [
    "*__pycache__*",
    "*.pyc",
    "*.pyo",
    "*.DS_Store",
    "*node_modules*",
    "*.pytest_cache*",
    "*.mypy_cache*",
    "*.ruff_cache*",
]


def capture_diff(worktree_path):
    """Return the staged diff of `worktree_path` as text, minus noise artifacts.

    Everything is staged first (`git add -A`) so that untracked files created
    by the agent appear in the diff; paths matching `_NOISE_GLOBS` are then
    excluded through `:(exclude)` pathspecs. A failing git command raises
    `subprocess.CalledProcessError`.
    """
    git_opts = {"cwd": str(worktree_path), "check": True,
                "capture_output": True, "text": True}
    subprocess.run(["git", "add", "-A"], **git_opts)

    pathspecs = ["."]
    for pattern in _NOISE_GLOBS:
        pathspecs.append(":(exclude)" + pattern)

    completed = subprocess.run(["git", "diff", "--cached", "--"] + pathspecs, **git_opts)
    return completed.stdout
def parse_judge_output(text):
    """Extract the judge's JSON verdict from free-form model output.

    The judge is asked for JSON only but may wrap it in prose or code fences.
    Decoding starts at the first `{` and stops at the end of that first
    complete JSON value, so trailing text is ignored even when it contains
    braces of its own.

    Raises:
        ValueError: no `{ ... }` span exists, or the text starting at the
            first `{` is not valid JSON (`json.JSONDecodeError` is a
            `ValueError` subclass).
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end < start:
        raise ValueError("no JSON object found in judge output")
    # raw_decode parses one JSON value and reports where it ended, unlike
    # json.loads on a first-`{`-to-last-`}` slice, which fails if any later
    # text contains a `}`.
    parsed, _end = json.JSONDecoder().raw_decode(text, start)
    return parsed
@dataclass
class SampleMetrics:
    """Metrics collected from one headless run's stream-json output."""

    tool_calls: int = 0
    tool_breakdown: dict = field(default_factory=dict)  # tool name -> call count
    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0
    cache_creation_tokens: int = 0
    cost_usd: float = 0.0
    duration_ms: int = 0
    num_turns: int = 0
    completed: bool = False  # True only when a result event reports subtype "success"


def parse_stream(lines):
    """Fold stream-json event lines into a `SampleMetrics`.

    Tool calls are counted from `tool_use` blocks in `assistant` events; token,
    cost, duration and turn totals are copied from the `result` event. Blank
    lines, lines that are not JSON, and JSON values that are not objects are
    skipped, so partial or noisy output still yields whatever was captured.
    """
    m = SampleMetrics()
    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            ev = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A line can be valid JSON without being an event object (e.g. a bare
        # number or list); calling .get() on it would abort the whole parse.
        if not isinstance(ev, dict):
            continue
        etype = ev.get("type")
        if etype == "assistant":
            message = ev.get("message")
            # "message": null must not crash either.
            content = message.get("content") if isinstance(message, dict) else None
            for block in content or []:
                if isinstance(block, dict) and block.get("type") == "tool_use":
                    m.tool_calls += 1
                    name = block.get("name", "unknown")
                    m.tool_breakdown[name] = m.tool_breakdown.get(name, 0) + 1
        elif etype == "result":
            usage = ev.get("usage", {}) or {}
            m.input_tokens = usage.get("input_tokens", 0)
            m.output_tokens = usage.get("output_tokens", 0)
            m.cache_read_tokens = usage.get("cache_read_input_tokens", 0)
            m.cache_creation_tokens = usage.get("cache_creation_input_tokens", 0)
            m.cost_usd = ev.get("total_cost_usd", 0.0)
            m.duration_ms = ev.get("duration_ms", 0)
            m.num_turns = ev.get("num_turns", 0)
            m.completed = ev.get("subtype") == "success"
    return m
archie/benchmark/orchestrator.py +import subprocess +import hashlib +from pathlib import Path + +from .isolation import worktree, prune +from .diff import capture_diff +from .runner import run_claude +from .judge import run_judge +from .store import store_results +from .aggregate import aggregate_samples +from .metrics import SampleMetrics + + +def _git_out(args, cwd): + return subprocess.run(["git", *args], cwd=str(cwd), check=True, + capture_output=True, text=True).stdout.strip() + + +def _base_commit(repo): + return _git_out(["rev-parse", "HEAD"], repo) + + +def _merge_base(repo, a, b): + """The common-ancestor commit of two branches. + + Used to verify both arms descend from the same base. We compare the + merge-base (not branch tips): prep intentionally adds a strip/deep-scan + commit to an arm, so the tips legitimately differ while the shared base + does not. + """ + return _git_out(["merge-base", a, b], repo) + + +def _seed(name, repetition): + h = hashlib.sha256(f"{name}:{repetition}".encode("utf-8")).hexdigest() + return int(h[:8], 16) + + +def _worktrees_root(repo): + root = Path(repo) / ".archie" / "benchmark" / "worktrees" + root.mkdir(parents=True, exist_ok=True) + return root + + +def _run_one(cfg, branch, repetition, run_fn, diff_fn): + """Run a single (branch, repetition) sample; return (metrics, diff).""" + root = _worktrees_root(cfg.repo) + dest = root / f"{branch.replace('/', '_')}-{repetition}" + with worktree(cfg.repo, branch, dest) as wt: + try: + metrics, _raw = run_fn(cfg.task_prompt, cfg.model, wt, cfg.timeout_seconds) + except Exception: + return SampleMetrics(completed=False), "" + diff = diff_fn(wt) + return metrics, diff + + +def _sample_row(arm, repetition, metrics, quality_score, quality_detail, seed, attempted): + return { + "arm": arm, + "repetition": repetition, + "tool_calls": metrics.tool_calls, + "tool_breakdown": metrics.tool_breakdown, + "input_tokens": metrics.input_tokens, + "output_tokens": metrics.output_tokens, + 
"cache_read_tokens": metrics.cache_read_tokens, + "cache_creation_tokens": metrics.cache_creation_tokens, + "cost_usd": metrics.cost_usd, + "duration_ms": metrics.duration_ms, + "num_turns": metrics.num_turns, + "completed": metrics.completed, + "attempted": attempted, + "quality_score": quality_score, + "quality_detail": quality_detail, + "judge_seed": seed, + } + + +def run_benchmark(cfg, run_fn=run_claude, judge_fn=run_judge, + store_fn=store_results, diff_fn=capture_diff): + # Fairness guard: both arms must descend from the same base commit. We + # compare the merge-base, not branch tips — prep adds a strip/deep-scan + # commit to an arm, so tips differ while the common base must not. + try: + base = _merge_base(cfg.repo, cfg.branches["treatment"], cfg.branches["control"]) + except subprocess.CalledProcessError: + base = "" + if not base: + raise ValueError( + "benchmark arms have no common ancestor base commit; both branches " + "must branch from the same commit") + + prune(cfg.repo) + samples = [] + for rep in range(cfg.repetitions): + t_metrics, t_diff = _run_one(cfg, cfg.branches["treatment"], rep, run_fn, diff_fn) + c_metrics, c_diff = _run_one(cfg, cfg.branches["control"], rep, run_fn, diff_fn) + + # "Attempted" = the agent actually produced a code change. An empty + # diff means the task was not attempted, regardless of how the judge + # scores it — tracked so it can be excluded from quality means. + t_attempted = bool((t_diff or "").strip()) + c_attempted = bool((c_diff or "").strip()) + + seed = _seed(cfg.name, rep) + try: + verdict = judge_fn(cfg.task_prompt, t_diff, c_diff, cfg.judge.rubric, + cfg.judge.model, seed) + t_q = verdict["treatment"] + c_q = verdict["control"] + except Exception: + # A judge failure must not discard the (expensive) completed runs; + # record the samples without a quality score instead of aborting. 
def _strip_archie_on_branch(repo, branch):
    """Check out `branch`, remove Archie artifacts (incl. per-folder CLAUDE.md), commit.

    The previously checked-out ref is restored afterwards, even on failure.

    Raises:
        subprocess.CalledProcessError: any git command failed. Removal
            failures are deliberately fatal: a partially stripped control
            branch would leak Archie context into the control arm.
    """
    current = _git_out(["rev-parse", "--abbrev-ref", "HEAD"], repo)
    if current == "HEAD":
        # Detached HEAD: --abbrev-ref prints the literal "HEAD", and checking
        # that out later would be a no-op that leaves the repo on `branch`.
        # Remember the commit itself instead.
        current = _git_out(["rev-parse", "HEAD"], repo)
    _git_out(["checkout", branch], repo)
    try:
        # Root-level Archie files/dirs. --ignore-unmatch makes absent paths a
        # success, so check=True only fires on genuine failures.
        subprocess.run(["git", "rm", "-r", "--quiet", "--ignore-unmatch",
                        *ARCHIE_PATHS], cwd=str(repo), check=True,
                       capture_output=True, text=True)
        # Nested per-folder CLAUDE.md files. Use -z (NUL-delimited) so paths
        # containing spaces (e.g. Xcode "Button icons/") are not fragmented;
        # splitting on whitespace would break them and git rm would skip them,
        # leaking Archie context onto the control arm.
        out = subprocess.run(["git", "ls-files", "-z", "*/CLAUDE.md"], cwd=str(repo),
                             check=True, capture_output=True, text=True).stdout
        nested = [p for p in out.split("\0") if p]
        if nested:
            # `--` keeps unusual file names from being parsed as options.
            subprocess.run(["git", "rm", "--quiet", "--ignore-unmatch", "--", *nested],
                           cwd=str(repo), check=True, capture_output=True, text=True)
        # Only commit when something was actually removed.
        if not _is_clean(repo):
            _git_out(["commit", "-m", "benchmark: strip Archie artifacts (control arm)"], repo)
    finally:
        _git_out(["checkout", current], repo)
def run_claude(prompt, model, cwd, timeout_seconds):
    """Run one headless Claude Code session in `cwd`.

    Returns `(SampleMetrics, raw_stdout)`. On timeout the metrics are parsed
    from whatever output was captured and `completed` is forced to False.

    The command line is the same for both benchmark arms; only the files on
    disk in `cwd` (CLAUDE.md / .claude hooks) may differ between them.
    """
    argv = ["claude", "-p", build_prompt(prompt)]
    argv += ["--model", model]
    argv += ["--output-format", "stream-json", "--verbose"]
    argv += ["--permission-mode", "acceptEdits"]

    timed_out = False
    try:
        stdout = subprocess.run(argv, cwd=str(cwd), capture_output=True,
                                text=True, timeout=timeout_seconds).stdout
    except subprocess.TimeoutExpired as expired:
        timed_out = True
        stdout = expired.output or ""
        # The partial output attached to TimeoutExpired can still be bytes.
        if isinstance(stdout, bytes):
            stdout = stdout.decode("utf-8", "replace")

    metrics = parse_stream(stdout.splitlines())
    if timed_out:
        metrics.completed = False
    return metrics, stdout
+ +create table if not exists benchmark_runs ( + id uuid primary key default gen_random_uuid(), + name text not null, + repo_name text, -- basename only, never a full path + task_prompt text, + model text, + judge_model text, + repetitions int, + git_base_commit text, + prep_cost_usd numeric, -- deep-scan prep cost, separate & best-effort + archie_version text, + created_at timestamptz not null default now() +); + +create table if not exists benchmark_samples ( + id uuid primary key default gen_random_uuid(), + run_id uuid not null references benchmark_runs(id) on delete cascade, + arm text not null, -- 'control' | 'treatment' + repetition int, + tool_calls int, + tool_breakdown jsonb, + input_tokens int, + output_tokens int, + cache_read_tokens int, + cache_creation_tokens int, + cost_usd numeric, + duration_ms int, + num_turns int, + completed boolean, + attempted boolean, -- agent produced a non-empty diff + quality_score numeric, + quality_detail jsonb, + judge_seed int, + created_at timestamptz not null default now() +); + +create index if not exists benchmark_samples_run_id_idx on benchmark_samples(run_id); + +-- Per-run, per-arm rollup the website reads (separate spec). 
+create or replace view benchmark_summary as +select + r.id as run_id, + r.name as name, + r.repo_name as repo_name, + r.model as model, + s.arm as arm, + count(*) as samples, + count(*) filter (where s.completed) as completed_samples, + count(*) filter (where s.attempted) as attempted_samples, + avg(s.tool_calls) as tool_calls_mean, + avg(s.cost_usd) as cost_usd_mean, + avg(s.duration_ms) as duration_ms_mean, + avg(s.input_tokens + s.output_tokens) as total_tokens_mean, + -- quality only over real attempts (empty-diff runs excluded) + avg(s.quality_score) filter (where s.attempted) as quality_mean +from benchmark_runs r +join benchmark_samples s on s.run_id = r.id +group by r.id, r.name, r.repo_name, r.model, s.arm; diff --git a/archie/benchmark/secrets.env.example b/archie/benchmark/secrets.env.example new file mode 100644 index 00000000..13c722a5 --- /dev/null +++ b/archie/benchmark/secrets.env.example @@ -0,0 +1,23 @@ +# Archie benchmark — Supabase credentials TEMPLATE. +# +# This file is tracked so others know what to provide. Do NOT put real keys here. +# Copy it to a gitignored location and fill in your values: +# +# cp archie/benchmark/secrets.env.example .archie-bench/secrets.env +# # then edit .archie-bench/secrets.env with your real URL + service_role key +# +# Load it before a run (exports the vars into the environment): +# +# set -a; source .archie-bench/secrets.env; set +a +# +# .archie-bench/ is gitignored — keep real keys there, never commit a filled copy. +# +# Notes: +# - URL: no trailing slash, no /rest/v1 — store.py appends /rest/v1/. +# - KEY: must be the service_role key (NOT anon) so inserts bypass RLS. +# Supabase -> Project Settings -> API -> service_role. +# - One-time: apply archie/benchmark/schema.sql in the Supabase SQL Editor. +# - Without these vars set, the harness runs OFFLINE (writes a local results.json). 
+
+SUPABASE_URL=https://REPLACE-WITH-PROJECT-REF.supabase.co
+SUPABASE_SERVICE_KEY=REPLACE-WITH-SERVICE-ROLE-KEY
diff --git a/archie/benchmark/store.py b/archie/benchmark/store.py
new file mode 100644
index 00000000..3df6adae
--- /dev/null
+++ b/archie/benchmark/store.py
@@ -0,0 +1,76 @@
+# archie/benchmark/store.py
+"""Persist benchmark results: Supabase (PostgREST) when configured, else a local JSON file."""
+import json
+import os
+import urllib.request
+from pathlib import Path
+
+
+def _env():
+    """Return (SUPABASE_URL, SUPABASE_SERVICE_KEY) from the environment; either may be None."""
+    return os.environ.get("SUPABASE_URL"), os.environ.get("SUPABASE_SERVICE_KEY")
+
+
+def _post(url, key, table, rows):
+    """POST `rows` (a list of dicts) to PostgREST `table`; return the parsed JSON response.
+
+    The request asks for `Prefer: return=representation`, and the caller reads the
+    generated run id from the response. HTTP/network errors propagate to the caller.
+    """
+    data = json.dumps(rows).encode("utf-8")
+    req = urllib.request.Request(
+        # rstrip: a trailing slash in SUPABASE_URL must not yield "//rest/v1/...".
+        f"{url.rstrip('/')}/rest/v1/{table}",
+        data=data,
+        headers={
+            "apikey": key,
+            "Authorization": f"Bearer {key}",
+            "Content-Type": "application/json",
+            "Prefer": "return=representation",
+        },
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def _write_offline(run_row, sample_rows, offline_path):
+    """Write run + samples to `offline_path` as JSON (parents created); return the Path."""
+    path = Path(offline_path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    payload = {"run": run_row, "samples": sample_rows}
+    path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    return path
+
+
+def store_results(run_row, sample_rows, offline_path, _poster=None):
+    """Store one benchmark run and its samples.
+
+    Without SUPABASE_URL / SUPABASE_SERVICE_KEY the data is written to
+    `offline_path` and {"mode": "offline", "path": ...} is returned. Otherwise the
+    run row is inserted first, each sample is linked to the returned id and
+    inserted, and {"mode": "online", "run_id": ...} is returned.
+
+    `_poster` is a test seam with the same signature as `_post`. The caller's
+    `sample_rows` dicts are not modified. If the upload fails, the results are
+    saved to `offline_path` before the error is re-raised, so the measurements
+    of an expensive benchmark run are not lost to a bad key or a network error.
+    """
+    url, key = _env()
+    if not url or not key:
+        path = _write_offline(run_row, sample_rows, offline_path)
+        return {"mode": "offline", "path": str(path)}
+
+    poster = _poster or _post
+    try:
+        created = poster(url, key, "benchmark_runs", [run_row])
+        if not isinstance(created, list) or not created or "id" not in created[0]:
+            raise RuntimeError(f"benchmark_runs insert returned no id: {created!r}")
+        run_id = created[0]["id"]
+        linked = [{**row, "run_id": run_id} for row in sample_rows]
+        poster(url, key, "benchmark_samples", linked)
+    except Exception:
+        # Keep a local copy of the measurements, then let the caller see the failure.
+        _write_offline(run_row, sample_rows, offline_path)
+        raise
+    return {"mode": "online", "run_id": run_id}
diff --git a/docs/specs/2026-06-02-archie-benchmark-harness-design.md b/docs/specs/2026-06-02-archie-benchmark-harness-design.md
deleted file mode 100644
index f131bfa5..00000000
--- a/docs/specs/2026-06-02-archie-benchmark-harness-design.md
+++ /dev/null
@@ -1,231 +0,0 @@
-# Archie Benchmark Harness — Design
-
-**Date:** 2026-06-02
-**Status:** Approved (design) — pending implementation plan
-**Scope:** Internal benchmarking tool that measures Archie's effectiveness by running the *same* task
with and without Archie's generated context, capturing efficiency + quality metrics, and storing results in Supabase. Website display is a **separate follow-up spec**. - ---- - -## 1. Purpose - -Prove (or disprove) Archie's value with hard numbers. For a given repository and a given coding task, run an identical headless Claude Code session in two arms: - -- **control** — repo without any Archie artifacts. -- **treatment** — repo with the full Archie experience: root `CLAUDE.md` / `AGENTS.md`, per-folder `CLAUDE.md` context files, rules, **and** the real-time enforcement hooks. - -For each arm we capture **tool calls, tokens, cost, wall-clock duration** (efficiency) **and a blind judge-Claude quality score** (correctness/completeness/conventions). Measuring cost alone is misleading — an agent that does nothing is cheapest — so quality is a first-class output. - -The benchmark is an **internal tool** (the team runs it on controlled repos to produce marketing numbers). The Supabase write key lives in a local `.env` / CI secret; results are written directly. No end-user consent/anonymization layer is in scope. - ---- - -## 2. Key Decisions (resolved during brainstorming) - -| Decision | Choice | -|---|---| -| What we measure | Efficiency (tool calls, tokens, cost, time) **+ quality** (judge score) | -| Execution engine | **Claude Code headless** — `claude -p ... --output-format stream-json` | -| Treatment contents | **Everything**: context docs + rules + enforcement hooks | -| Arm source | **Branch-based**: a treatment branch (with Archie files) and a control branch (without). Tool can prep both from a plain repo. 
| -| Quality measurement | **Blind judge-Claude** scored against a rubric (no pre-written tests required) | -| Repetitions | **Configurable, default 3** per arm (average + spread) | -| Tool scope | **Internal** — direct Supabase write via service key | -| Website | **Out of scope** — separate follow-up spec | -| Deep-scan prep (when repo has no Archie yet) | **Semi-automatic**: tool prepares branch + installs Archie, then pauses for the user to run `/archie-deep-scan` interactively, then resumes | -| Config format | **JSON** (zero-dep; YAML/TOML rejected — Archie targets Python 3.9+, stdlib only) | - -**Critical invariant:** the deep-scan that *generates* the treatment artifacts is **never** counted in the measured metrics. It runs before measurement, on a separate branch, and its cost is logged separately as `prep_cost`. - ---- - -## 3. Architecture - -New **internal** Python package (zero-dep stdlib; **not** copied into the npm package, **not** a `standalone/` script): - -``` -archie/benchmark/ - __init__.py - cli.py # entry: `python3 -m archie.benchmark {auto,run,prep} ` - config.py # JSON config read + validation - isolation.py # git worktree lifecycle: add / cwd / cleanup / prune - runner.py # one `claude -p` run in a worktree, stream-json - metrics.py # stream-json parse -> {tool_calls, tool_breakdown, tokens, cost, duration, turns, completed} - judge.py # blind judge-Claude call -> rubric scores (forced JSON) - diff.py # `git diff` + untracked files for an arm - store.py # Supabase PostgREST write (urllib, service key from .env) + offline fallback - orchestrator.py # full run: prep -> (arm x repetition) matrix -> aggregate -> store -> summary - schema.sql # versioned Supabase DDL (tables + summary view) -tests/benchmark/ # pytest; claude/supabase/git external calls mocked -``` - -Each file has a single responsibility and is independently testable. `runner`, `judge`, and `store` wrap the only external side effects (claude CLI, HTTP) so they mock cleanly. 
`cli.py` is a thin arg-parse + `orchestrator` call. - ---- - -## 4. Config format (JSON) - -One file describes one benchmark case: - -```json -{ - "name": "bedtime-add-sleep-timer", - "repo": "/Users/csacsi/DEV/BedtimeApp", - "task_prompt": "Add a sleep timer feature: a setting that stops audio playback after a chosen duration. Wire it into the existing player and settings UI.", - "model": "claude-sonnet-4-6", - "repetitions": 3, - "branches": { - "treatment": "archie-bench/with-archie", - "control": "archie-bench/no-archie" - }, - "judge": { - "model": "claude-opus-4-8", - "rubric": ["correctness", "completeness", "follows_conventions", "no_regressions"] - }, - "timeout_seconds": 3600 -} -``` - -Rules: - -- **`task_prompt` is byte-for-byte identical** across both arms and **never mentions Archie**. The presence of context files is the only difference. -- `model` is the same for both arms (fixed, so the model is not a confounding variable). -- `branches`: if both exist → start from them. If missing, the CLI offers prep (§6). -- `judge.rubric`: customizable; each axis scored 1–10 plus a short justification. -- `timeout_seconds`: hard cap per `claude -p` run (default **3600**), overridable. - ---- - -## 5. Data flow — one sample (one arm, one repetition) - -1. **Worktree:** `git worktree add /.archie/benchmark/worktrees/- ` → fresh isolated checkout. Every repetition gets its own worktree (Claude mutates files; worktrees are not shared). -2. **Run** in the worktree (`cwd=`): - ``` - claude -p "" \ - --model \ - --output-format stream-json --verbose \ - --permission-mode acceptEdits - ``` - Both arms get **identical** harness flags. The treatment arm picks up Archie hooks from the repo's `.claude/settings` and auto-loads `CLAUDE.md`; the control arm has neither — that is the measured difference. -3. 
**Metrics** (`metrics.py`) from the stream-json events: - - **tool_calls**: count of `tool_use` blocks in `assistant` messages, **also broken down by type** (Edit / Read / Bash / …). - - **tokens**: from the final `result` event `usage`: `input`, `output`, `cache_read`, `cache_creation`. - - **cost**: `result.total_cost_usd`. - - **duration**: `result.duration_ms` (plus our own wall-clock as a sanity check). - - **turns**: `result.num_turns`. - - **completed**: `result.subtype == "success"` (not timeout/error). -4. **Diff:** in the worktree, `git add -A && git diff --cached` → full change-set text, stored for the judge (kept after worktree removal). -5. **Cleanup:** `git worktree remove --force` (in `finally`). - -Raw stream-json optionally saved to `.archie/benchmark//-.jsonl` for debugging. - ---- - -## 6. `auto` command — from a plain repo to finished numbers - -Entry: `python3 -m archie.benchmark auto --prompt "..."` (or task from a config file). The tool drives the whole flow: - -1. **Check:** repo is a clean git working tree (else stop, so uncommitted state never contaminates measurement). Record the base commit/branch. -2. **Control branch** (`archie-bench/no-archie`): branch off the base. If Archie files exist (`CLAUDE.md`, `AGENTS.md`, `.claude/`, `.archie/`, per-folder `CLAUDE.md`s), **delete and commit** them. If absent, leave untouched. -3. **Treatment branch** (`archie-bench/with-archie`): branch off the base. - - If Archie files already exist → use as-is. - - If absent → **semi-automatic prep**: the tool runs `npx @bitraptors/archie ` (installs scripts + commands), then **pauses** with instructions: *"Open Claude Code on this worktree, run `/archie-deep-scan`, commit the results, then press Enter."* The user runs it interactively (more robust than headless deep-scan), returns, presses Enter. The tool **verifies** the Archie files now exist (fails clearly if not) and commits anything uncommitted. - - The deep-scan cost is **excluded from measurement**. 
Because prep is interactive (semi-automatic), the tool cannot directly meter its token cost; `prep_cost_usd` is **best-effort and nullable** — if `.archie/telemetry/` from the deep-scan run is present, the tool reads duration/cost from it, otherwise the field stays null. The point is only that prep is never folded into sample metrics. -4. **Benchmark:** the `run` flow on both branches (default 3 repetitions, blind judge). -5. **Aggregate + Supabase write + console summary.** - -Idempotent branch prep: if a `archie-bench/*` branch already exists, the tool asks (reuse / regenerate / abort) — never silently overwrites. - ---- - -## 7. Blind judge-Claude - -- The judge is a **separate `claude -p` call** with fresh context (does not see the benchmark runs or Archie). -- Input: the `task_prompt` + both arms' diffs labeled **"Variant A" / "Variant B" in a randomized order** (the tool records the mapping). The judge cannot tell which is the Archie arm → no bias. -- Randomization uses a **fixed seed** derived from the sample id (no time/`random`-without-seed dependence), stored as `judge_seed` for reproducibility. -- Output is **forced JSON**: per-rubric-axis score 1–10 + short justification + overall score. `judge.py` validates; on malformed JSON it retries **once**. -- Scoring is **pairwise per repetition**: each (A, B) pair → one judge call (N calls, not N²). Per-arm judge scores are averaged. -- `judge.model` defaults to Opus (stronger judge), overridable in config. - ---- - -## 8. Supabase schema - -Two tables in the existing project, written directly via PostgREST with the **service key** (`.env`: `SUPABASE_URL`, `SUPABASE_SERVICE_KEY`). 
- -**`benchmark_runs`** — one row per benchmark run: - -``` -id uuid pk (default gen_random_uuid()) -name text -- config.name -repo_name text -- repo basename only (not full path) -task_prompt text -model text -judge_model text -repetitions int -git_base_commit text -- base commit (reproducibility) -prep_cost_usd numeric null -- deep-scan prep cost, SEPARATE & best-effort (null if not metered) -archie_version text -created_at timestamptz default now() -``` - -**`benchmark_samples`** — one row per (arm × repetition): - -``` -id uuid pk -run_id uuid fk -> benchmark_runs.id -arm text -- 'control' | 'treatment' -repetition int -tool_calls int -tool_breakdown jsonb -- {"Edit":4,"Read":9,"Bash":2,...} -input_tokens int -output_tokens int -cache_read_tokens int -cache_creation_tokens int -cost_usd numeric -duration_ms int -num_turns int -completed bool -- result.subtype == success -quality_score numeric null -- judge overall (0–10) -quality_detail jsonb null -- per-axis breakdown + justification -judge_seed int -created_at timestamptz default now() -``` - -- Aggregates (per-arm mean/spread/savings-%) are **not** stored twice — a DB **view** `benchmark_summary` computes them from samples; the website (separate spec) reads the view. -- DDL ships as versioned `archie/benchmark/schema.sql` (run against Supabase manually / in CI). -- `store.py`: if `.env` keys are missing → **does not crash**; saves locally to `.archie/benchmark//results.json` and warns (offline mode). - ---- - -## 9. Error handling, isolation safety, cleanup - -- **Worktree-leak protection:** every worktree is created and removed in `try/finally` (`git worktree remove --force`). `git worktree prune` at run start and end. Temp root is a known location (`/.archie/benchmark/worktrees/`) so leftovers are cleanable on restart. -- **One failed sample does not sink the run:** if a `claude -p` times out/errors, that sample is recorded with `completed=false` and partial metrics; others continue. 
The aggregate reports how many samples dropped. -- **Fairness guards:** the tool verifies (a) `task_prompt` is byte-identical across arms, (b) `model` and harness flags are identical, (c) both branches descend from the same `git_base_commit`. Any violation → stop, do not write noisy data. -- **Prep separation:** deep-scan prep happens entirely before measurement, on a separate branch; measured `claude -p` runs start from fresh worktrees where Archie files are already committed — prep tokens/time never leak into sample metrics. -- **Secrets:** `.env` is never logged; only `repo_name` (basename) goes to the DB, not the full path. - ---- - -## 10. Testing - -`tests/benchmark/`, pytest, all external calls mocked (no real `claude`/Supabase in tests): - -- `metrics.py`: fixed stream-json fixtures (success, timeout, tool-heavy, zero-tool) → correct tool count, token sums, completed flag. -- `config.py`: valid/invalid configs, missing fields, identical-prompt invariant. -- `isolation.py`: worktree add/remove on a throwaway temp git repo (may run real git, fast). -- `diff.py`: known change → expected diff text, untracked files included. -- `judge.py`: mocked judge response parse, malformed JSON → 1 retry, seed determinism. -- `store.py`: mocked HTTP → correct payload shape; missing `.env` → offline fallback file. -- `orchestrator.py`: end-to-end with mocks (fake runner+judge+store) → matrix and aggregation correct; a failed sample does not sink the run. - -**Edge cases covered:** empty diff (Claude did nothing) → `completed=true` but low quality; mid-run timeout; both arms identical; missing Supabase key; pre-existing benchmark branch; non-clean working tree; missing Archie files after deep-scan prep (verification fails). - ---- - -## 11. Out of scope (explicit) - -- Website / dashboard display of results — **separate follow-up spec** (will read the `benchmark_summary` view). -- End-user-facing shipped benchmark (consent gating, anonymization, edge-function ingest). 
-- Anthropic Agent SDK / raw API execution paths (headless Claude Code only). -- Automatic headless deep-scan (semi-automatic interactive prep chosen instead). diff --git a/docs/specs/2026-06-02-archie-benchmark-harness-plan.md b/docs/specs/2026-06-02-archie-benchmark-harness-plan.md deleted file mode 100644 index b7615de9..00000000 --- a/docs/specs/2026-06-02-archie-benchmark-harness-plan.md +++ /dev/null @@ -1,1899 +0,0 @@ -# Archie Benchmark Harness Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Build an internal Python tool that runs an identical headless Claude Code task on a control branch (no Archie) and a treatment branch (full Archie docs+hooks), captures tool calls / tokens / cost / time + a blind judge-Claude quality score, and stores results in Supabase. - -**Architecture:** Zero-dep stdlib Python package `archie/benchmark/`. Each module has one responsibility: config parsing, git-worktree isolation, headless `claude -p` execution, stream-json metric extraction, diff capture, blind judge scoring, Supabase write, and an orchestrator that runs the (arm × repetition) matrix. External side effects (the `claude` CLI, git, Supabase HTTP) are isolated behind functions that accept injectable dependencies so tests mock them. - -**Tech Stack:** Python 3.9+ (stdlib only — `json`, `subprocess`, `urllib`, `dataclasses`, `pathlib`, `contextlib`, `hashlib`, `argparse`), pytest, Claude Code CLI (`claude -p`), Supabase PostgREST. 
- -**Spec:** `docs/specs/2026-06-02-archie-benchmark-harness-design.md` - ---- - -## File Structure - -``` -archie/benchmark/ - __init__.py # package marker, exports - config.py # BenchmarkConfig + JudgeConfig dataclasses, load/parse/validate - metrics.py # SampleMetrics dataclass + parse_stream(lines) - diff.py # capture_diff(worktree_path) -> str - isolation.py # worktree() contextmanager + prune() - runner.py # run_claude(...) -> (SampleMetrics, raw_stdout) - judge.py # run_judge(...) -> per-arm rubric scores (blind, seeded A/B) - store.py # store_results(...) -> Supabase write or offline fallback - aggregate.py # aggregate_samples(samples) -> per-arm means/spread - orchestrator.py # run_benchmark(config, deps...) + prepare_branches(...) - cli.py # argparse entry: run / auto / prep - schema.sql # versioned Supabase DDL (tables + summary view) -tests/benchmark/ - __init__.py - test_config.py - test_metrics.py - test_diff.py - test_isolation.py - test_runner.py - test_judge.py - test_store.py - test_aggregate.py - test_orchestrator.py -``` - -**Conventions to follow (from existing `archie/standalone/`):** zero third-party imports, `subprocess.run(..., capture_output=True, text=True)`, defensive `.get()` on parsed JSON, no secrets in logs. - ---- - -## Shared Type Contracts (defined once, used everywhere) - -These exact shapes are used across tasks — keep names identical. 
- -- `BenchmarkConfig`: `name:str, repo:Path, task_prompt:str, model:str, branches:dict{"treatment":str,"control":str}, repetitions:int, judge:JudgeConfig, timeout_seconds:int` -- `JudgeConfig`: `model:str, rubric:list[str]` -- `SampleMetrics`: `tool_calls:int, tool_breakdown:dict[str,int], input_tokens:int, output_tokens:int, cache_read_tokens:int, cache_creation_tokens:int, cost_usd:float, duration_ms:int, num_turns:int, completed:bool` -- Judge result dict: `{"treatment": {:int,..,"overall":float,"justification":str}, "control": {...}, "seed": int}` -- Sample row dict (for store): `{arm, repetition, tool_calls, tool_breakdown, input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens, cost_usd, duration_ms, num_turns, completed, quality_score, quality_detail, judge_seed}` - ---- - -### Task 1: Package scaffold + config - -**Files:** -- Create: `archie/benchmark/__init__.py` -- Create: `archie/benchmark/config.py` -- Create: `tests/benchmark/__init__.py` -- Test: `tests/benchmark/test_config.py` - -- [ ] **Step 1: Write the failing test** - -```python -# tests/benchmark/test_config.py -import json -import pytest -from pathlib import Path -from archie.benchmark.config import parse_config, load_config, BenchmarkConfig - - -def _valid(): - return { - "name": "demo", - "repo": "/tmp/repo", - "task_prompt": "Add a feature", - "model": "claude-sonnet-4-6", - } - - -def test_parse_minimal_applies_defaults(): - cfg = parse_config(_valid()) - assert isinstance(cfg, BenchmarkConfig) - assert cfg.repetitions == 3 - assert cfg.timeout_seconds == 3600 - assert cfg.branches == {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"} - assert cfg.judge.model == "claude-opus-4-8" - assert "correctness" in cfg.judge.rubric - assert isinstance(cfg.repo, Path) - - -def test_parse_overrides(): - data = _valid() - data.update({ - "repetitions": 5, - "timeout_seconds": 1200, - "branches": {"treatment": "t", "control": "c"}, - "judge": {"model": "m", 
"rubric": ["x"]}, - }) - cfg = parse_config(data) - assert cfg.repetitions == 5 - assert cfg.timeout_seconds == 1200 - assert cfg.branches == {"treatment": "t", "control": "c"} - assert cfg.judge.model == "m" - assert cfg.judge.rubric == ["x"] - - -@pytest.mark.parametrize("missing", ["name", "repo", "task_prompt", "model"]) -def test_missing_required_raises(missing): - data = _valid() - del data[missing] - with pytest.raises(ValueError, match="required"): - parse_config(data) - - -def test_repetitions_must_be_positive(): - data = _valid() - data["repetitions"] = 0 - with pytest.raises(ValueError, match="repetitions"): - parse_config(data) - - -def test_branches_missing_arm_raises(): - data = _valid() - data["branches"] = {"treatment": "t"} - with pytest.raises(ValueError, match="control"): - parse_config(data) - - -def test_load_config_reads_file(tmp_path): - p = tmp_path / "c.json" - p.write_text(json.dumps(_valid())) - cfg = load_config(p) - assert cfg.name == "demo" -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_config.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.config'` - -- [ ] **Step 3: Write minimal implementation** - -```python -# archie/benchmark/__init__.py -"""Internal Archie effectiveness benchmark harness (not shipped via npm).""" -``` - -```python -# archie/benchmark/config.py -import json -from dataclasses import dataclass, field -from pathlib import Path - -DEFAULT_JUDGE_MODEL = "claude-opus-4-8" -DEFAULT_RUBRIC = ["correctness", "completeness", "follows_conventions", "no_regressions"] -DEFAULT_BRANCHES = {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"} -DEFAULT_TIMEOUT = 3600 -DEFAULT_REPETITIONS = 3 -REQUIRED = ("name", "repo", "task_prompt", "model") - - -@dataclass -class JudgeConfig: - model: str = DEFAULT_JUDGE_MODEL - rubric: list = field(default_factory=lambda: list(DEFAULT_RUBRIC)) - - -@dataclass -class 
BenchmarkConfig: - name: str - repo: Path - task_prompt: str - model: str - branches: dict = field(default_factory=lambda: dict(DEFAULT_BRANCHES)) - repetitions: int = DEFAULT_REPETITIONS - judge: JudgeConfig = field(default_factory=JudgeConfig) - timeout_seconds: int = DEFAULT_TIMEOUT - - -def parse_config(data): - missing = [k for k in REQUIRED if k not in data or data[k] in (None, "")] - if missing: - raise ValueError(f"config missing required fields: {', '.join(missing)}") - - branches = data.get("branches", dict(DEFAULT_BRANCHES)) - for arm in ("treatment", "control"): - if arm not in branches or not branches[arm]: - raise ValueError(f"config.branches missing '{arm}'") - - reps = int(data.get("repetitions", DEFAULT_REPETITIONS)) - if reps < 1: - raise ValueError("repetitions must be >= 1") - - jd = data.get("judge", {}) or {} - judge = JudgeConfig( - model=jd.get("model") or DEFAULT_JUDGE_MODEL, - rubric=jd.get("rubric") or list(DEFAULT_RUBRIC), - ) - - return BenchmarkConfig( - name=data["name"], - repo=Path(data["repo"]).expanduser(), - task_prompt=data["task_prompt"], - model=data["model"], - branches={"treatment": branches["treatment"], "control": branches["control"]}, - repetitions=reps, - judge=judge, - timeout_seconds=int(data.get("timeout_seconds", DEFAULT_TIMEOUT)), - ) - - -def load_config(path): - return parse_config(json.loads(Path(path).read_text())) -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_config.py -v` -Expected: PASS (all 9 cases) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/__init__.py archie/benchmark/config.py tests/benchmark/__init__.py tests/benchmark/test_config.py -git commit -m "feat(benchmark): config dataclasses + JSON parsing/validation" -``` - ---- - -### Task 2: Stream-json metric extraction - -**Files:** -- Create: `archie/benchmark/metrics.py` -- Test: `tests/benchmark/test_metrics.py` - -- [ ] **Step 1: Write the failing test** - -```python -# 
tests/benchmark/test_metrics.py -import json -from archie.benchmark.metrics import parse_stream, SampleMetrics - - -def _assistant(blocks): - return json.dumps({"type": "assistant", "message": {"content": blocks}}) - - -def _tool_use(name): - return {"type": "tool_use", "name": name, "id": "x", "input": {}} - - -def _result(subtype="success"): - return json.dumps({ - "type": "result", - "subtype": subtype, - "total_cost_usd": 0.1234, - "duration_ms": 5000, - "num_turns": 7, - "usage": { - "input_tokens": 100, - "output_tokens": 200, - "cache_read_input_tokens": 50, - "cache_creation_input_tokens": 25, - }, - }) - - -def test_counts_tools_and_breakdown(): - lines = [ - json.dumps({"type": "system", "subtype": "init"}), - _assistant([{"type": "text", "text": "hi"}, _tool_use("Read")]), - _assistant([_tool_use("Edit"), _tool_use("Edit")]), - _result(), - ] - m = parse_stream(lines) - assert m.tool_calls == 3 - assert m.tool_breakdown == {"Read": 1, "Edit": 2} - - -def test_extracts_result_fields(): - m = parse_stream([_result()]) - assert m.input_tokens == 100 - assert m.output_tokens == 200 - assert m.cache_read_tokens == 50 - assert m.cache_creation_tokens == 25 - assert m.cost_usd == 0.1234 - assert m.duration_ms == 5000 - assert m.num_turns == 7 - assert m.completed is True - - -def test_error_result_not_completed(): - m = parse_stream([_result(subtype="error_max_turns")]) - assert m.completed is False - - -def test_zero_tool_run(): - m = parse_stream([_assistant([{"type": "text", "text": "done"}]), _result()]) - assert m.tool_calls == 0 - assert m.tool_breakdown == {} - - -def test_ignores_blank_and_malformed_lines(): - m = parse_stream(["", " ", "not json", _result()]) - assert m.completed is True - - -def test_no_result_event_defaults(): - m = parse_stream([_assistant([_tool_use("Bash")])]) - assert m.tool_calls == 1 - assert m.completed is False - assert m.cost_usd == 0.0 -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest 
tests/benchmark/test_metrics.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.metrics'` - -- [ ] **Step 3: Write minimal implementation** - -```python -# archie/benchmark/metrics.py -import json -from dataclasses import dataclass, field - - -@dataclass -class SampleMetrics: - tool_calls: int = 0 - tool_breakdown: dict = field(default_factory=dict) - input_tokens: int = 0 - output_tokens: int = 0 - cache_read_tokens: int = 0 - cache_creation_tokens: int = 0 - cost_usd: float = 0.0 - duration_ms: int = 0 - num_turns: int = 0 - completed: bool = False - - -def parse_stream(lines): - m = SampleMetrics() - for line in lines: - line = line.strip() - if not line: - continue - try: - ev = json.loads(line) - except json.JSONDecodeError: - continue - etype = ev.get("type") - if etype == "assistant": - for block in ev.get("message", {}).get("content", []) or []: - if isinstance(block, dict) and block.get("type") == "tool_use": - m.tool_calls += 1 - name = block.get("name", "unknown") - m.tool_breakdown[name] = m.tool_breakdown.get(name, 0) + 1 - elif etype == "result": - usage = ev.get("usage", {}) or {} - m.input_tokens = usage.get("input_tokens", 0) - m.output_tokens = usage.get("output_tokens", 0) - m.cache_read_tokens = usage.get("cache_read_input_tokens", 0) - m.cache_creation_tokens = usage.get("cache_creation_input_tokens", 0) - m.cost_usd = ev.get("total_cost_usd", 0.0) - m.duration_ms = ev.get("duration_ms", 0) - m.num_turns = ev.get("num_turns", 0) - m.completed = ev.get("subtype") == "success" - return m -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_metrics.py -v` -Expected: PASS (6 cases) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/metrics.py tests/benchmark/test_metrics.py -git commit -m "feat(benchmark): stream-json metric extraction (tools, tokens, cost, completed)" -``` - ---- - -### Task 3: Diff capture - -**Files:** -- Create: 
`archie/benchmark/diff.py` -- Test: `tests/benchmark/test_diff.py` - -- [ ] **Step 1: Write the failing test** - -```python -# tests/benchmark/test_diff.py -import subprocess -from archie.benchmark.diff import capture_diff - - -def _git(args, cwd): - subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True) - - -def _init_repo(path): - _git(["init"], path) - _git(["config", "user.email", "t@t.t"], path) - _git(["config", "user.name", "t"], path) - (path / "a.txt").write_text("one\n") - _git(["add", "-A"], path) - _git(["commit", "-m", "init"], path) - - -def test_captures_modified_and_untracked(tmp_path): - _init_repo(tmp_path) - (tmp_path / "a.txt").write_text("one\ntwo\n") # modified, tracked - (tmp_path / "b.txt").write_text("new file\n") # untracked - diff = capture_diff(tmp_path) - assert "a.txt" in diff - assert "two" in diff - assert "b.txt" in diff - assert "new file" in diff - - -def test_empty_diff_when_no_changes(tmp_path): - _init_repo(tmp_path) - diff = capture_diff(tmp_path) - assert diff.strip() == "" -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_diff.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.diff'` - -- [ ] **Step 3: Write minimal implementation** - -```python -# archie/benchmark/diff.py -import subprocess - - -def capture_diff(worktree_path): - """Stage everything (so untracked files show) and return the cached diff text.""" - subprocess.run(["git", "add", "-A"], cwd=str(worktree_path), - check=True, capture_output=True, text=True) - result = subprocess.run(["git", "diff", "--cached"], cwd=str(worktree_path), - check=True, capture_output=True, text=True) - return result.stdout -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_diff.py -v` -Expected: PASS (2 cases) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/diff.py tests/benchmark/test_diff.py -git commit 
-m "feat(benchmark): capture full diff (modified + untracked) from a worktree" -``` - ---- - -### Task 4: Git worktree isolation - -**Files:** -- Create: `archie/benchmark/isolation.py` -- Test: `tests/benchmark/test_isolation.py` - -- [ ] **Step 1: Write the failing test** - -```python -# tests/benchmark/test_isolation.py -import subprocess -from pathlib import Path -from archie.benchmark.isolation import worktree, prune - - -def _git(args, cwd): - subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True) - - -def _init_repo(path): - _git(["init"], path) - _git(["config", "user.email", "t@t.t"], path) - _git(["config", "user.name", "t"], path) - (path / "a.txt").write_text("one\n") - _git(["add", "-A"], path) - _git(["commit", "-m", "init"], path) - _git(["branch", "feature"], path) - - -def test_worktree_created_and_removed(tmp_path): - repo = tmp_path / "repo" - repo.mkdir() - _init_repo(repo) - dest = tmp_path / "wt" - with worktree(repo, "feature", dest) as wt: - assert Path(wt).exists() - assert (Path(wt) / "a.txt").exists() - assert not Path(dest).exists() - - -def test_worktree_removed_on_exception(tmp_path): - repo = tmp_path / "repo" - repo.mkdir() - _init_repo(repo) - dest = tmp_path / "wt" - try: - with worktree(repo, "feature", dest): - raise RuntimeError("boom") - except RuntimeError: - pass - assert not Path(dest).exists() - - -def test_prune_runs_without_error(tmp_path): - repo = tmp_path / "repo" - repo.mkdir() - _init_repo(repo) - prune(repo) # must not raise -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_isolation.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.isolation'` - -- [ ] **Step 3: Write minimal implementation** - -```python -# archie/benchmark/isolation.py -import subprocess -from contextlib import contextmanager -from pathlib import Path - - -@contextmanager -def worktree(repo_path, branch, dest): - """Create a git 
worktree for `branch` at `dest`, always removed on exit.""" - dest = Path(dest) - subprocess.run(["git", "worktree", "add", "--force", str(dest), branch], - cwd=str(repo_path), check=True, capture_output=True, text=True) - try: - yield dest - finally: - subprocess.run(["git", "worktree", "remove", "--force", str(dest)], - cwd=str(repo_path), capture_output=True, text=True) - - -def prune(repo_path): - subprocess.run(["git", "worktree", "prune"], - cwd=str(repo_path), capture_output=True, text=True) -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_isolation.py -v` -Expected: PASS (3 cases) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/isolation.py tests/benchmark/test_isolation.py -git commit -m "feat(benchmark): git worktree contextmanager with guaranteed cleanup" -``` - ---- - -### Task 5: Headless claude runner - -**Files:** -- Create: `archie/benchmark/runner.py` -- Test: `tests/benchmark/test_runner.py` - -- [ ] **Step 1: Write the failing test** - -```python -# tests/benchmark/test_runner.py -import json -import subprocess -import pytest -from archie.benchmark import runner - - -def _stream(): - return "\n".join([ - json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}}), - json.dumps({"type": "result", "subtype": "success", "total_cost_usd": 0.5, - "duration_ms": 1000, "num_turns": 2, - "usage": {"input_tokens": 10, "output_tokens": 20}}), - ]) - - -def test_run_claude_parses_metrics(monkeypatch): - captured = {} - - def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None): - captured["cmd"] = cmd - captured["cwd"] = cwd - return subprocess.CompletedProcess(cmd, 0, stdout=_stream(), stderr="") - - monkeypatch.setattr(runner.subprocess, "run", fake_run) - metrics, raw = runner.run_claude("do it", "claude-sonnet-4-6", "/tmp/wt", 60) - - assert metrics.tool_calls == 1 - assert metrics.cost_usd == 0.5 - assert metrics.completed is 
True - assert captured["cwd"] == "/tmp/wt" - # identical, fair harness flags must always be present - assert captured["cmd"][:2] == ["claude", "-p"] - assert "--permission-mode" in captured["cmd"] - assert "acceptEdits" in captured["cmd"] - assert "stream-json" in captured["cmd"] - assert "--model" in captured["cmd"] and "claude-sonnet-4-6" in captured["cmd"] - - -def test_timeout_marks_incomplete(monkeypatch): - def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None): - raise subprocess.TimeoutExpired(cmd, timeout, output=_stream()) - - monkeypatch.setattr(runner.subprocess, "run", fake_run) - metrics, raw = runner.run_claude("do it", "m", "/tmp/wt", 1) - assert metrics.completed is False - assert metrics.tool_calls == 1 # partial stdout still parsed -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_runner.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.runner'` - -- [ ] **Step 3: Write minimal implementation** - -```python -# archie/benchmark/runner.py -import subprocess -from .metrics import parse_stream - - -def run_claude(prompt, model, cwd, timeout_seconds): - """Run a headless Claude Code session in `cwd`; return (SampleMetrics, raw_stdout). - - Both benchmark arms must call this with identical flags — the only difference - between arms is the on-disk files (CLAUDE.md / .claude hooks), never the flags. 
- """ - cmd = [ - "claude", "-p", prompt, - "--model", model, - "--output-format", "stream-json", "--verbose", - "--permission-mode", "acceptEdits", - ] - try: - proc = subprocess.run(cmd, cwd=str(cwd), capture_output=True, - text=True, timeout=timeout_seconds) - metrics = parse_stream(proc.stdout.splitlines()) - return metrics, proc.stdout - except subprocess.TimeoutExpired as e: - partial = e.output or "" - if isinstance(partial, bytes): - partial = partial.decode("utf-8", "replace") - metrics = parse_stream(partial.splitlines()) - metrics.completed = False - return metrics, partial -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_runner.py -v` -Expected: PASS (2 cases) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/runner.py tests/benchmark/test_runner.py -git commit -m "feat(benchmark): headless claude -p runner with timeout handling" -``` - ---- - -### Task 6: Blind judge - -**Files:** -- Create: `archie/benchmark/judge.py` -- Test: `tests/benchmark/test_judge.py` - -- [ ] **Step 1: Write the failing test** - -```python -# tests/benchmark/test_judge.py -import json -import pytest -from archie.benchmark import judge - - -def test_assign_order_is_seed_deterministic(): - assert judge.assign_order(0) == ("a", "b") - assert judge.assign_order(2) == ("a", "b") - assert judge.assign_order(1) == ("b", "a") - assert judge.assign_order(3) == ("b", "a") - - -def test_parse_judge_output_extracts_embedded_json(): - text = 'Here is my verdict:\n{"variant_a": {"overall": 8}, "variant_b": {"overall": 5}}\nThanks' - parsed = judge.parse_judge_output(text) - assert parsed["variant_a"]["overall"] == 8 - - -def test_parse_judge_output_raises_without_json(): - with pytest.raises(ValueError, match="JSON"): - judge.parse_judge_output("no json here") - - -def test_run_judge_maps_variants_to_arms_seed_even(): - # seed even -> treatment is variant_a - payload = json.dumps({"variant_a": {"overall": 9}, "variant_b": 
{"overall": 4}}) - calls = [] - - def fake_runner(prompt, model, timeout): - calls.append((prompt, model)) - return payload - - result = judge.run_judge("task", "TREAT_DIFF", "CTRL_DIFF", - rubric=["correctness"], model="m", seed=0, - _runner=fake_runner) - assert result["treatment"]["overall"] == 9 - assert result["control"]["overall"] == 4 - assert result["seed"] == 0 - # variant A diff (shown first) must be the treatment diff for an even seed - assert calls[0][0].index("TREAT_DIFF") < calls[0][0].index("CTRL_DIFF") - - -def test_run_judge_maps_variants_to_arms_seed_odd(): - # seed odd -> treatment is variant_b - payload = json.dumps({"variant_a": {"overall": 3}, "variant_b": {"overall": 7}}) - result = judge.run_judge("task", "TREAT_DIFF", "CTRL_DIFF", - rubric=["correctness"], model="m", seed=1, - _runner=lambda p, m, t: payload) - assert result["treatment"]["overall"] == 7 - assert result["control"]["overall"] == 3 - - -def test_run_judge_retries_once_on_bad_json(): - outputs = ["garbage", json.dumps({"variant_a": {"overall": 6}, "variant_b": {"overall": 6}})] - - def flaky(prompt, model, timeout): - return outputs.pop(0) - - result = judge.run_judge("task", "A", "B", rubric=["c"], model="m", seed=0, _runner=flaky) - assert result["treatment"]["overall"] == 6 - assert outputs == [] # both outputs consumed -> retried exactly once - - -def test_run_judge_raises_after_two_failures(): - with pytest.raises(ValueError): - judge.run_judge("task", "A", "B", rubric=["c"], model="m", seed=0, - _runner=lambda p, m, t: "still garbage") -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_judge.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.judge'` - -- [ ] **Step 3: Write minimal implementation** - -```python -# archie/benchmark/judge.py -import json -import subprocess - - -def assign_order(seed): - """Return (treatment_variant, control_variant) — blind A/B label assignment.""" - return 
("a", "b") if seed % 2 == 0 else ("b", "a") - - -def build_judge_prompt(task_prompt, diff_a, diff_b, rubric): - axes = ", ".join(rubric) - schema = ('{"variant_a": {' + ", ".join(f'"{a}": int' for a in rubric) - + ', "overall": number, "justification": string}, "variant_b": {... same keys ...}}') - return ( - "You are an impartial senior code reviewer. Two AI agents independently " - "attempted the SAME task. You are shown each agent's diff as an anonymous " - "variant. Judge purely on the code; you do not know anything about how each " - "was produced.\n\n" - f"TASK GIVEN TO BOTH AGENTS:\n{task_prompt}\n\n" - f"Score each variant on these axes (each 1-10): {axes}. Also give an " - "'overall' score (0-10) and a one-sentence 'justification'.\n\n" - f"Respond with ONLY a JSON object of this exact shape:\n{schema}\n\n" - f"=== VARIANT A DIFF ===\n{diff_a}\n\n" - f"=== VARIANT B DIFF ===\n{diff_b}\n" - ) - - -def parse_judge_output(text): - start = text.find("{") - end = text.rfind("}") - if start == -1 or end == -1 or end < start: - raise ValueError("no JSON object found in judge output") - return json.loads(text[start:end + 1]) - - -def _default_runner(prompt, model, timeout): - proc = subprocess.run( - ["claude", "-p", prompt, "--model", model, "--output-format", "text"], - capture_output=True, text=True, timeout=timeout, - ) - return proc.stdout - - -def run_judge(task_prompt, treatment_diff, control_diff, rubric, model, seed, - timeout_seconds=600, _runner=None): - t_variant, c_variant = assign_order(seed) - diff_a = treatment_diff if t_variant == "a" else control_diff - diff_b = treatment_diff if t_variant == "b" else control_diff - prompt = build_judge_prompt(task_prompt, diff_a, diff_b, rubric) - - runner = _runner or _default_runner - parsed = None - last_err = None - for _ in range(2): - try: - parsed = parse_judge_output(runner(prompt, model, timeout_seconds)) - break - except (ValueError, json.JSONDecodeError) as e: - last_err = e - if parsed is None: - 
raise ValueError(f"judge returned unparseable output twice: {last_err}") - - return { - "treatment": parsed["variant_a"] if t_variant == "a" else parsed["variant_b"], - "control": parsed["variant_a"] if c_variant == "a" else parsed["variant_b"], - "seed": seed, - } -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_judge.py -v` -Expected: PASS (7 cases) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/judge.py tests/benchmark/test_judge.py -git commit -m "feat(benchmark): blind seeded judge with A/B randomization + retry" -``` - ---- - -### Task 7: Aggregation - -**Files:** -- Create: `archie/benchmark/aggregate.py` -- Test: `tests/benchmark/test_aggregate.py` - -- [ ] **Step 1: Write the failing test** - -```python -# tests/benchmark/test_aggregate.py -from archie.benchmark.aggregate import aggregate_samples - - -def _s(arm, cost, tools, quality, completed=True): - return {"arm": arm, "cost_usd": cost, "tool_calls": tools, - "duration_ms": 1000, "input_tokens": 10, "output_tokens": 20, - "quality_score": quality, "completed": completed} - - -def test_per_arm_means(): - samples = [ - _s("treatment", 1.0, 10, 8.0), - _s("treatment", 3.0, 20, 9.0), - _s("control", 2.0, 30, 6.0), - _s("control", 4.0, 40, 7.0), - ] - agg = aggregate_samples(samples) - assert agg["treatment"]["cost_usd_mean"] == 2.0 - assert agg["treatment"]["tool_calls_mean"] == 15.0 - assert agg["treatment"]["quality_mean"] == 8.5 - assert agg["control"]["cost_usd_mean"] == 3.0 - assert agg["treatment"]["n"] == 2 - assert agg["treatment"]["completed_n"] == 2 - - -def test_savings_percentages(): - samples = [_s("treatment", 1.0, 10, 8.0), _s("control", 2.0, 20, 8.0)] - agg = aggregate_samples(samples) - # treatment cost is 50% lower than control - assert agg["savings"]["cost_pct"] == 50.0 - assert agg["savings"]["tool_calls_pct"] == 50.0 - - -def test_quality_ignores_none_scores(): - samples = [ - _s("treatment", 1.0, 10, None, 
completed=False), - _s("treatment", 1.0, 10, 8.0), - _s("control", 1.0, 10, 6.0), - ] - agg = aggregate_samples(samples) - assert agg["treatment"]["quality_mean"] == 8.0 # None excluded - assert agg["treatment"]["completed_n"] == 1 - - -def test_handles_empty_arm(): - samples = [_s("treatment", 1.0, 10, 8.0)] - agg = aggregate_samples(samples) - assert agg["control"]["n"] == 0 - assert agg["control"]["cost_usd_mean"] is None - assert agg["savings"]["cost_pct"] is None -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_aggregate.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.aggregate'` - -- [ ] **Step 3: Write minimal implementation** - -```python -# archie/benchmark/aggregate.py -NUMERIC_FIELDS = ["cost_usd", "tool_calls", "duration_ms", "input_tokens", "output_tokens"] - - -def _mean(values): - return sum(values) / len(values) if values else None - - -def _arm_stats(samples): - stats = {"n": len(samples), "completed_n": sum(1 for s in samples if s.get("completed"))} - for f in NUMERIC_FIELDS: - vals = [s[f] for s in samples if s.get(f) is not None] - stats[f + "_mean"] = _mean(vals) - qvals = [s["quality_score"] for s in samples if s.get("quality_score") is not None] - stats["quality_mean"] = _mean(qvals) - return stats - - -def _pct_lower(treatment, control): - """Percent reduction of treatment relative to control (positive = treatment cheaper).""" - if treatment is None or control is None or control == 0: - return None - return round((control - treatment) / control * 100, 1) - - -def aggregate_samples(samples): - treatment = [s for s in samples if s.get("arm") == "treatment"] - control = [s for s in samples if s.get("arm") == "control"] - t_stats = _arm_stats(treatment) - c_stats = _arm_stats(control) - return { - "treatment": t_stats, - "control": c_stats, - "savings": { - "cost_pct": _pct_lower(t_stats["cost_usd_mean"], c_stats["cost_usd_mean"]), - "tool_calls_pct": 
_pct_lower(t_stats["tool_calls_mean"], c_stats["tool_calls_mean"]), - "duration_pct": _pct_lower(t_stats["duration_ms_mean"], c_stats["duration_ms_mean"]), - }, - } -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_aggregate.py -v` -Expected: PASS (4 cases) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/aggregate.py tests/benchmark/test_aggregate.py -git commit -m "feat(benchmark): per-arm aggregation + savings percentages" -``` - ---- - -### Task 8: Supabase store + offline fallback - -**Files:** -- Create: `archie/benchmark/store.py` -- Test: `tests/benchmark/test_store.py` - -- [ ] **Step 1: Write the failing test** - -```python -# tests/benchmark/test_store.py -import json -from archie.benchmark import store - - -def test_offline_fallback_when_env_missing(tmp_path, monkeypatch): - monkeypatch.delenv("SUPABASE_URL", raising=False) - monkeypatch.delenv("SUPABASE_SERVICE_KEY", raising=False) - out = tmp_path / "nested" / "results.json" - res = store.store_results({"name": "x"}, [{"arm": "treatment"}], out) - assert res["mode"] == "offline" - saved = json.loads(out.read_text()) - assert saved["run"]["name"] == "x" - assert saved["samples"][0]["arm"] == "treatment" - - -def test_online_write_posts_run_then_samples(tmp_path, monkeypatch): - monkeypatch.setenv("SUPABASE_URL", "https://x.supabase.co") - monkeypatch.setenv("SUPABASE_SERVICE_KEY", "secret") - calls = [] - - def fake_poster(url, key, table, rows): - calls.append((table, rows)) - if table == "benchmark_runs": - return [{"id": "run-123"}] - return rows - - res = store.store_results({"name": "x"}, [{"arm": "treatment"}, {"arm": "control"}], - tmp_path / "r.json", _poster=fake_poster) - assert res["mode"] == "online" - assert res["run_id"] == "run-123" - assert calls[0][0] == "benchmark_runs" - assert calls[1][0] == "benchmark_samples" - # run_id stamped onto every sample row - assert all(r["run_id"] == "run-123" for r in calls[1][1]) -``` 
- -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_store.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.store'` - -- [ ] **Step 3: Write minimal implementation** - -```python -# archie/benchmark/store.py -import json -import os -import urllib.request -from pathlib import Path - - -def _env(): - return os.environ.get("SUPABASE_URL"), os.environ.get("SUPABASE_SERVICE_KEY") - - -def _post(url, key, table, rows): - data = json.dumps(rows).encode("utf-8") - req = urllib.request.Request( - f"{url}/rest/v1/{table}", - data=data, - headers={ - "apikey": key, - "Authorization": f"Bearer {key}", - "Content-Type": "application/json", - "Prefer": "return=representation", - }, - method="POST", - ) - with urllib.request.urlopen(req, timeout=30) as resp: - return json.loads(resp.read().decode("utf-8")) - - -def store_results(run_row, sample_rows, offline_path, _poster=None): - url, key = _env() - if not url or not key: - path = Path(offline_path) - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps({"run": run_row, "samples": sample_rows}, indent=2)) - return {"mode": "offline", "path": str(path)} - - poster = _poster or _post - created = poster(url, key, "benchmark_runs", [run_row]) - run_id = created[0]["id"] - for r in sample_rows: - r["run_id"] = run_id - poster(url, key, "benchmark_samples", sample_rows) - return {"mode": "online", "run_id": run_id} -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_store.py -v` -Expected: PASS (2 cases) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/store.py tests/benchmark/test_store.py -git commit -m "feat(benchmark): Supabase PostgREST write with offline fallback" -``` - ---- - -### Task 9: Supabase schema DDL - -**Files:** -- Create: `archie/benchmark/schema.sql` -- Test: `tests/benchmark/test_schema.py` - -- [ ] **Step 1: Write the failing test** - -```python -# 
tests/benchmark/test_schema.py -from pathlib import Path - -SQL = Path(__file__).parent.parent.parent / "archie" / "benchmark" / "schema.sql" - - -def test_schema_defines_both_tables_and_view(): - text = SQL.read_text() - assert "create table" in text.lower() - assert "benchmark_runs" in text - assert "benchmark_samples" in text - assert "benchmark_summary" in text - # key sample columns referenced by store.py / aggregate.py exist - for col in ["tool_calls", "tool_breakdown", "cost_usd", "quality_score", - "cache_read_tokens", "judge_seed", "completed", "arm"]: - assert col in text - # prep cost lives on the run, separate from measured samples - assert "prep_cost_usd" in text -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_schema.py -v` -Expected: FAIL with `FileNotFoundError` (schema.sql does not exist) - -- [ ] **Step 3: Write minimal implementation** - -```sql --- archie/benchmark/schema.sql --- Archie benchmark harness — Supabase schema (v1). --- Run manually against the project (or via CI). Idempotent-ish: uses IF NOT EXISTS. 
- -create table if not exists benchmark_runs ( - id uuid primary key default gen_random_uuid(), - name text not null, - repo_name text, -- basename only, never a full path - task_prompt text, - model text, - judge_model text, - repetitions int, - git_base_commit text, - prep_cost_usd numeric, -- deep-scan prep cost, separate & best-effort - archie_version text, - created_at timestamptz not null default now() -); - -create table if not exists benchmark_samples ( - id uuid primary key default gen_random_uuid(), - run_id uuid not null references benchmark_runs(id) on delete cascade, - arm text not null, -- 'control' | 'treatment' - repetition int, - tool_calls int, - tool_breakdown jsonb, - input_tokens int, - output_tokens int, - cache_read_tokens int, - cache_creation_tokens int, - cost_usd numeric, - duration_ms int, - num_turns int, - completed boolean, - quality_score numeric, - quality_detail jsonb, - judge_seed int, - created_at timestamptz not null default now() -); - -create index if not exists benchmark_samples_run_id_idx on benchmark_samples(run_id); - --- Per-run, per-arm rollup the website reads (separate spec). 
-create or replace view benchmark_summary as -select - r.id as run_id, - r.name as name, - r.repo_name as repo_name, - r.model as model, - s.arm as arm, - count(*) as samples, - count(*) filter (where s.completed) as completed_samples, - avg(s.tool_calls) as tool_calls_mean, - avg(s.cost_usd) as cost_usd_mean, - avg(s.duration_ms) as duration_ms_mean, - avg(s.input_tokens + s.output_tokens) as total_tokens_mean, - avg(s.quality_score) as quality_mean -from benchmark_runs r -join benchmark_samples s on s.run_id = r.id -group by r.id, r.name, r.repo_name, r.model, s.arm; -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_schema.py -v` -Expected: PASS (1 case) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/schema.sql tests/benchmark/test_schema.py -git commit -m "feat(benchmark): Supabase schema (runs + samples tables, summary view)" -``` - ---- - -### Task 10: Orchestrator — measurement matrix + fairness guards - -**Files:** -- Create: `archie/benchmark/orchestrator.py` -- Test: `tests/benchmark/test_orchestrator.py` - -This task assumes both branches already exist (the `run` command). Branch prep (`auto`) is Task 11. The orchestrator accepts injectable `run_fn`, `judge_fn`, `store_fn`, and `diff_fn` so the matrix is testable without invoking real claude/git/Supabase. 
- -- [ ] **Step 1: Write the failing test** - -```python -# tests/benchmark/test_orchestrator.py -import pytest -from archie.benchmark.config import BenchmarkConfig, JudgeConfig -from archie.benchmark.metrics import SampleMetrics -from archie.benchmark import orchestrator - - -def _cfg(tmp_path, reps=2): - return BenchmarkConfig( - name="demo", repo=tmp_path, task_prompt="do it", - model="m", branches={"treatment": "t", "control": "c"}, - repetitions=reps, judge=JudgeConfig(model="jm", rubric=["correctness"]), - timeout_seconds=60, - ) - - -def _fake_run(metrics_by_branch): - seen = {"calls": []} - - def run_fn(prompt, model, cwd, timeout): - # branch name is encoded in the worktree path by the orchestrator - branch = "treatment" if "treatment" in str(cwd) else "control" - seen["calls"].append((branch, prompt, model)) - return metrics_by_branch[branch], "raw" - - return run_fn, seen - - -def test_run_benchmark_builds_matrix_and_aggregates(tmp_path, monkeypatch): - # neutralize real worktree/diff/base-commit side effects - monkeypatch.setattr(orchestrator, "_base_commit", lambda repo: "abc123") - monkeypatch.setattr(orchestrator, "_branch_base", lambda repo, b: "abc123") - - import contextlib - @contextlib.contextmanager - def fake_worktree(repo, branch, dest): - yield tmp_path / ("wt-" + branch) - monkeypatch.setattr(orchestrator, "worktree", fake_worktree) - monkeypatch.setattr(orchestrator, "prune", lambda repo: None) - - t_metrics = SampleMetrics(tool_calls=5, cost_usd=1.0, duration_ms=100, - input_tokens=10, output_tokens=20, completed=True) - c_metrics = SampleMetrics(tool_calls=12, cost_usd=3.0, duration_ms=300, - input_tokens=30, output_tokens=40, completed=True) - run_fn, seen = _fake_run({"treatment": t_metrics, "control": c_metrics}) - - judged = {"calls": 0} - def judge_fn(task, t_diff, c_diff, rubric, model, seed): - judged["calls"] += 1 - return {"treatment": {"overall": 9.0}, "control": {"overall": 5.0}, "seed": seed} - - stored = {} - def 
store_fn(run_row, sample_rows, offline_path): - stored["run"] = run_row - stored["samples"] = sample_rows - return {"mode": "offline", "path": str(offline_path)} - - result = orchestrator.run_benchmark( - _cfg(tmp_path, reps=2), - run_fn=run_fn, judge_fn=judge_fn, store_fn=store_fn, - diff_fn=lambda wt: f"diff:{wt}", - ) - - # 2 reps x 2 arms = 4 runs; 2 reps = 2 pairwise judge calls - assert len(seen["calls"]) == 4 - assert judged["calls"] == 2 - assert len(stored["samples"]) == 4 - # quality assigned per arm - t_samples = [s for s in stored["samples"] if s["arm"] == "treatment"] - assert all(s["quality_score"] == 9.0 for s in t_samples) - # aggregate shows treatment cheaper - assert result["aggregate"]["savings"]["cost_pct"] > 0 - # prompt identical across all runs - assert len({c[1] for c in seen["calls"]}) == 1 - - -def test_fairness_guard_rejects_divergent_base(tmp_path, monkeypatch): - monkeypatch.setattr(orchestrator, "_branch_base", - lambda repo, b: "AAA" if b == "t" else "BBB") - with pytest.raises(ValueError, match="base commit"): - orchestrator.run_benchmark(_cfg(tmp_path), run_fn=lambda *a: None, - judge_fn=lambda *a, **k: None, - store_fn=lambda *a: None, diff_fn=lambda w: "") - - -def test_failed_sample_does_not_sink_run(tmp_path, monkeypatch): - monkeypatch.setattr(orchestrator, "_branch_base", lambda repo, b: "same") - import contextlib - @contextlib.contextmanager - def fake_worktree(repo, branch, dest): - yield tmp_path / ("wt-" + branch) - monkeypatch.setattr(orchestrator, "worktree", fake_worktree) - monkeypatch.setattr(orchestrator, "prune", lambda repo: None) - - def run_fn(prompt, model, cwd, timeout): - if "treatment" in str(cwd): - raise RuntimeError("treatment crashed") - return SampleMetrics(completed=True, cost_usd=2.0), "raw" - - stored = {} - result = orchestrator.run_benchmark( - _cfg(tmp_path, reps=1), - run_fn=run_fn, - judge_fn=lambda *a, **k: {"treatment": {"overall": 0}, "control": {"overall": 5}, "seed": 0}, - store_fn=lambda 
r, s, p: stored.update(samples=s) or {"mode": "offline"}, - diff_fn=lambda wt: "", - ) - # treatment sample recorded as not-completed; control still present - arms = {s["arm"]: s for s in stored["samples"]} - assert arms["treatment"]["completed"] is False - assert arms["control"]["completed"] is True -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_orchestrator.py -v` -Expected: FAIL with `ModuleNotFoundError: No module named 'archie.benchmark.orchestrator'` - -- [ ] **Step 3: Write minimal implementation** - -```python -# archie/benchmark/orchestrator.py -import subprocess -import hashlib -from pathlib import Path - -from .isolation import worktree, prune -from .diff import capture_diff -from .runner import run_claude -from .judge import run_judge -from .store import store_results -from .aggregate import aggregate_samples -from .metrics import SampleMetrics - - -def _git_out(args, cwd): - return subprocess.run(["git", *args], cwd=str(cwd), check=True, - capture_output=True, text=True).stdout.strip() - - -def _base_commit(repo): - return _git_out(["rev-parse", "HEAD"], repo) - - -def _branch_base(repo, branch): - """The commit the branch resolves to (used to verify both arms share a base).""" - return _git_out(["rev-parse", branch], repo) - - -def _seed(name, repetition): - h = hashlib.sha256(f"{name}:{repetition}".encode("utf-8")).hexdigest() - return int(h[:8], 16) - - -def _worktrees_root(repo): - root = Path(repo) / ".archie" / "benchmark" / "worktrees" - root.mkdir(parents=True, exist_ok=True) - return root - - -def _run_one(cfg, branch, repetition, run_fn, diff_fn): - """Run a single (branch, repetition) sample; return (metrics, diff).""" - root = _worktrees_root(cfg.repo) - dest = root / f"{branch.replace('/', '_')}-{repetition}" - with worktree(cfg.repo, branch, dest) as wt: - try: - metrics, _raw = run_fn(cfg.task_prompt, cfg.model, wt, cfg.timeout_seconds) - except Exception: - return 
SampleMetrics(completed=False), "" - diff = diff_fn(wt) - return metrics, diff - - -def _sample_row(arm, repetition, metrics, quality_score, quality_detail, seed): - return { - "arm": arm, - "repetition": repetition, - "tool_calls": metrics.tool_calls, - "tool_breakdown": metrics.tool_breakdown, - "input_tokens": metrics.input_tokens, - "output_tokens": metrics.output_tokens, - "cache_read_tokens": metrics.cache_read_tokens, - "cache_creation_tokens": metrics.cache_creation_tokens, - "cost_usd": metrics.cost_usd, - "duration_ms": metrics.duration_ms, - "num_turns": metrics.num_turns, - "completed": metrics.completed, - "quality_score": quality_score, - "quality_detail": quality_detail, - "judge_seed": seed, - } - - -def run_benchmark(cfg, run_fn=run_claude, judge_fn=run_judge, - store_fn=store_results, diff_fn=capture_diff): - # Fairness guard: both arms must descend from the same base commit. - t_base = _branch_base(cfg.repo, cfg.branches["treatment"]) - c_base = _branch_base(cfg.repo, cfg.branches["control"]) - if t_base != c_base: - raise ValueError( - f"arms have divergent base commit (treatment={t_base}, control={c_base}); " - "both benchmark branches must branch from the same commit") - - prune(cfg.repo) - samples = [] - for rep in range(cfg.repetitions): - t_metrics, t_diff = _run_one(cfg, cfg.branches["treatment"], rep, run_fn, diff_fn) - c_metrics, c_diff = _run_one(cfg, cfg.branches["control"], rep, run_fn, diff_fn) - - seed = _seed(cfg.name, rep) - verdict = judge_fn(cfg.task_prompt, t_diff, c_diff, cfg.judge.rubric, - cfg.judge.model, seed) - t_q = verdict["treatment"] - c_q = verdict["control"] - samples.append(_sample_row("treatment", rep, t_metrics, - t_q.get("overall"), t_q, seed)) - samples.append(_sample_row("control", rep, c_metrics, - c_q.get("overall"), c_q, seed)) - prune(cfg.repo) - - agg = aggregate_samples(samples) - run_row = { - "name": cfg.name, - "repo_name": Path(cfg.repo).name, - "task_prompt": cfg.task_prompt, - "model": cfg.model, - 
"judge_model": cfg.judge.model, - "repetitions": cfg.repetitions, - "git_base_commit": _base_commit(cfg.repo), - "prep_cost_usd": None, - "archie_version": _archie_version(), - } - offline_path = Path(cfg.repo) / ".archie" / "benchmark" / cfg.name / "results.json" - store_result = store_fn(run_row, samples, offline_path) - return {"aggregate": agg, "samples": samples, "store": store_result, "run": run_row} - - -def _archie_version(): - try: - from archie import __version__ - return __version__ - except Exception: - return "unknown" -``` - -> **Note on test `test_failed_sample_does_not_sink_run`:** `_run_one` catches the runner exception and returns `SampleMetrics(completed=False)`, so the control arm still runs and the run completes. This test patches `_branch_base` to return the same string for both branches so the fairness guard passes; `test_fairness_guard_rejects_divergent_base` returns different strings to trigger the guard. - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_orchestrator.py -v` -Expected: PASS (3 cases) - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/orchestrator.py tests/benchmark/test_orchestrator.py -git commit -m "feat(benchmark): orchestrator matrix, fairness guard, per-sample rows" -``` - ---- - -### Task 11: Branch prep + CLI - -**Files:** -- Modify: `archie/benchmark/orchestrator.py` (add `prepare_branches`) -- Create: `archie/benchmark/cli.py` -- Test: `tests/benchmark/test_prepare.py` - -`prepare_branches` does the pure, testable git work: verify clean tree, create control branch (stripping Archie files), create treatment branch. The interactive deep-scan pause lives in `cli.py` (calls `input()`), kept thin and out of unit tests. 
- -- [ ] **Step 1: Write the failing test** - -```python -# tests/benchmark/test_prepare.py -import subprocess -import pytest -from archie.benchmark.config import BenchmarkConfig, JudgeConfig -from archie.benchmark import orchestrator as orch - - -def _git(args, cwd): - subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True) - - -def _repo(tmp_path, with_archie): - repo = tmp_path / "repo" - repo.mkdir() - _git(["init"], repo) - _git(["config", "user.email", "t@t.t"], repo) - _git(["config", "user.name", "t"], repo) - (repo / "src.py").write_text("print('hi')\n") - if with_archie: - (repo / "CLAUDE.md").write_text("# context\n") - (repo / ".claude").mkdir() - (repo / ".claude" / "settings.json").write_text("{}\n") - _git(["add", "-A"], repo) - _git(["commit", "-m", "init"], repo) - return repo - - -def _cfg(repo): - return BenchmarkConfig(name="d", repo=repo, task_prompt="x", model="m", - branches={"treatment": "archie-bench/with-archie", - "control": "archie-bench/no-archie"}, - repetitions=1, judge=JudgeConfig(), timeout_seconds=60) - - -def test_clean_tree_required(tmp_path): - repo = _repo(tmp_path, with_archie=True) - (repo / "dirty.txt").write_text("uncommitted\n") - with pytest.raises(ValueError, match="clean"): - orch.prepare_branches(_cfg(repo)) - - -def test_control_branch_strips_archie_files(tmp_path): - repo = _repo(tmp_path, with_archie=True) - status = orch.prepare_branches(_cfg(repo)) - # control branch checked out: Archie files gone - _git(["checkout", "archie-bench/no-archie"], repo) - assert not (repo / "CLAUDE.md").exists() - assert not (repo / ".claude").exists() - assert (repo / "src.py").exists() - assert status["archie_present"] is True - assert status["needs_deep_scan"] is False - - -def test_treatment_keeps_archie_files(tmp_path): - repo = _repo(tmp_path, with_archie=True) - orch.prepare_branches(_cfg(repo)) - _git(["checkout", "archie-bench/with-archie"], repo) - assert (repo / "CLAUDE.md").exists() - - -def 
test_no_archie_flags_deep_scan_needed(tmp_path): - repo = _repo(tmp_path, with_archie=False) - status = orch.prepare_branches(_cfg(repo)) - assert status["archie_present"] is False - assert status["needs_deep_scan"] is True -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `python -m pytest tests/benchmark/test_prepare.py -v` -Expected: FAIL with `AttributeError: module 'archie.benchmark.orchestrator' has no attribute 'prepare_branches'` - -- [ ] **Step 3: Write minimal implementation** - -Append to `archie/benchmark/orchestrator.py`: - -```python -ARCHIE_PATHS = ["CLAUDE.md", "AGENTS.md", ".claude", ".archie"] - - -def _is_clean(repo): - out = _git_out(["status", "--porcelain"], repo) - return out == "" - - -def _archie_present(repo): - return any((Path(repo) / p).exists() for p in ARCHIE_PATHS) - - -def _branch_exists(repo, branch): - res = subprocess.run(["git", "rev-parse", "--verify", branch], - cwd=str(repo), capture_output=True, text=True) - return res.returncode == 0 - - -def _create_branch(repo, branch, base): - if _branch_exists(repo, branch): - subprocess.run(["git", "branch", "-D", branch], cwd=str(repo), - capture_output=True, text=True) - _git_out(["branch", branch, base], repo) - - -def _strip_archie_on_branch(repo, branch): - """Check out branch, remove Archie artifacts (incl. 
per-folder CLAUDE.md), commit.""" - current = _git_out(["rev-parse", "--abbrev-ref", "HEAD"], repo) - _git_out(["checkout", branch], repo) - try: - # remove root-level + nested CLAUDE.md and known Archie dirs/files - subprocess.run(["git", "rm", "-r", "--quiet", "--ignore-unmatch", - *ARCHIE_PATHS], cwd=str(repo), capture_output=True, text=True) - # nested per-folder CLAUDE.md files - nested = subprocess.run(["git", "ls-files", "*/CLAUDE.md"], cwd=str(repo), - capture_output=True, text=True).stdout.split() - if nested: - subprocess.run(["git", "rm", "--quiet", "--ignore-unmatch", *nested], - cwd=str(repo), capture_output=True, text=True) - if not _is_clean(repo): - _git_out(["commit", "-m", "benchmark: strip Archie artifacts (control arm)"], repo) - finally: - _git_out(["checkout", current], repo) - - -def prepare_branches(cfg): - """Create control (no Archie) and treatment (with Archie) branches from current HEAD. - - Returns a status dict; if Archie is absent, `needs_deep_scan` is True and the - caller (cli) must run the interactive deep-scan on the treatment branch. - """ - repo = cfg.repo - if not _is_clean(repo): - raise ValueError("working tree is not clean; commit or stash before benchmarking") - - base = _base_commit(repo) - archie_present = _archie_present(repo) - - _create_branch(repo, cfg.branches["treatment"], base) - _create_branch(repo, cfg.branches["control"], base) - - if archie_present: - _strip_archie_on_branch(repo, cfg.branches["control"]) - # if absent, control already has no Archie files; treatment will be populated - # by the interactive deep-scan (cli handles the pause). 
- - return { - "archie_present": archie_present, - "needs_deep_scan": not archie_present, - "base": base, - "branches": cfg.branches, - } -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `python -m pytest tests/benchmark/test_prepare.py -v` -Expected: PASS (4 cases) - -- [ ] **Step 5: Write the CLI** - -```python -# archie/benchmark/cli.py -import argparse -import sys -from pathlib import Path - -from .config import load_config, parse_config -from .orchestrator import run_benchmark, prepare_branches - - -def _print_summary(result): - agg = result["aggregate"] - print("\n=== Benchmark summary ===") - for arm in ("treatment", "control"): - a = agg[arm] - print(f"[{arm}] n={a['n']} completed={a['completed_n']} " - f"cost=${_fmt(a['cost_usd_mean'])} tools={_fmt(a['tool_calls_mean'])} " - f"dur={_fmt(a['duration_ms_mean'])}ms quality={_fmt(a['quality_mean'])}") - s = agg["savings"] - print(f"[savings] cost={_fmt(s['cost_pct'])}% tools={_fmt(s['tool_calls_pct'])}% " - f"time={_fmt(s['duration_pct'])}%") - print(f"[store] {result['store']}") - - -def _fmt(v): - return "n/a" if v is None else (f"{v:.2f}" if isinstance(v, float) else str(v)) - - -def _cmd_run(args): - cfg = load_config(args.config) - result = run_benchmark(cfg) - _print_summary(result) - - -def _cmd_prep(args): - cfg = load_config(args.config) - status = prepare_branches(cfg) - if status["needs_deep_scan"]: - _interactive_deep_scan(cfg) - print(f"Branches ready: {cfg.branches}") - - -def _cmd_auto(args): - if args.config: - cfg = load_config(args.config) - else: - cfg = parse_config({"name": Path(args.repo).name, "repo": args.repo, - "task_prompt": args.prompt, "model": args.model}) - status = prepare_branches(cfg) - if status["needs_deep_scan"]: - _interactive_deep_scan(cfg) - result = run_benchmark(cfg) - _print_summary(result) - - -def _interactive_deep_scan(cfg): - treatment = cfg.branches["treatment"] - print("\n" + "=" * 70) - print("Archie not found in this repo. 
Semi-automatic prep:") - print(f" 1. In a terminal: git checkout {treatment}") - print(f" 2. Install Archie: npx @bitraptors/archie {cfg.repo}") - print(" 3. In Claude Code on that branch, run: /archie-deep-scan") - print(" 4. Commit the generated files.") - print("This deep-scan is NOT counted in the benchmark metrics.") - print("=" * 70) - input("Press Enter once the treatment branch has committed Archie files... ") - # verify - from .orchestrator import _git_out, _archie_present # local import to avoid cycle noise - current = _git_out(["rev-parse", "--abbrev-ref", "HEAD"], cfg.repo) - _git_out(["checkout", treatment], cfg.repo) - present = _archie_present(cfg.repo) - _git_out(["checkout", current], cfg.repo) - if not present: - print("ERROR: no Archie files found on the treatment branch. Aborting.", file=sys.stderr) - sys.exit(1) - - -def main(argv=None): - parser = argparse.ArgumentParser(prog="archie-benchmark", - description="Measure Archie effectiveness (control vs treatment).") - sub = parser.add_subparsers(dest="command", required=True) - - p_run = sub.add_parser("run", help="run benchmark on existing branches") - p_run.add_argument("config", help="path to benchmark config JSON") - p_run.set_defaults(func=_cmd_run) - - p_prep = sub.add_parser("prep", help="create/refresh benchmark branches only") - p_prep.add_argument("config", help="path to benchmark config JSON") - p_prep.set_defaults(func=_cmd_prep) - - p_auto = sub.add_parser("auto", help="prep branches then run, from a plain repo") - p_auto.add_argument("repo", nargs="?", help="repo path (when no --config)") - p_auto.add_argument("--config", help="path to benchmark config JSON") - p_auto.add_argument("--prompt", help="task prompt (when no --config)") - p_auto.add_argument("--model", default="claude-sonnet-4-6") - p_auto.set_defaults(func=_cmd_auto) - - args = parser.parse_args(argv) - args.func(args) - - -if __name__ == "__main__": - main() -``` - -- [ ] **Step 6: Add a CLI smoke test** - -```python 
-# append to tests/benchmark/test_prepare.py -def test_cli_run_invokes_benchmark(tmp_path, monkeypatch): - import json - from archie.benchmark import cli - repo = _repo(tmp_path, with_archie=True) - cfg_file = tmp_path / "c.json" - cfg_file.write_text(json.dumps({ - "name": "d", "repo": str(repo), "task_prompt": "x", "model": "m", - "branches": {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"}, - })) - called = {} - monkeypatch.setattr(cli, "run_benchmark", - lambda cfg: called.setdefault("ran", True) or { - "aggregate": {"treatment": {"n": 0, "completed_n": 0, - "cost_usd_mean": None, "tool_calls_mean": None, - "duration_ms_mean": None, "quality_mean": None}, - "control": {"n": 0, "completed_n": 0, "cost_usd_mean": None, - "tool_calls_mean": None, "duration_ms_mean": None, - "quality_mean": None}, - "savings": {"cost_pct": None, "tool_calls_pct": None, - "duration_pct": None}}, - "store": {"mode": "offline"}}) - cli.main(["run", str(cfg_file)]) - assert called["ran"] is True -``` - -- [ ] **Step 7: Run all benchmark tests** - -Run: `python -m pytest tests/benchmark/ -v` -Expected: PASS (all tasks' tests green) - -- [ ] **Step 8: Commit** - -```bash -git add archie/benchmark/orchestrator.py archie/benchmark/cli.py tests/benchmark/test_prepare.py -git commit -m "feat(benchmark): branch prep (strip Archie for control) + CLI (run/prep/auto)" -``` - ---- - -### Task 12: Docs + full suite - -**Files:** -- Create: `archie/benchmark/README.md` -- Modify: `CLAUDE.md` (add a short "Benchmark Harness" section under Commands) - -- [ ] **Step 1: Write `archie/benchmark/README.md`** - -```markdown -# Archie Benchmark Harness (internal) - -Measures Archie's effectiveness: runs the **same** task headlessly on a control -branch (no Archie) and a treatment branch (full Archie docs + hooks), capturing -tool calls / tokens / cost / time + a blind judge-Claude quality score, and writes -results to Supabase. **Not** shipped via npm. 
- -## Usage - -```bash -# 1. Author a config (see example below) — JSON, zero-dep. -# 2. From a plain repo, prep branches then run: -python3 -m archie.benchmark auto /path/to/repo --prompt "Add a sleep timer feature" - -# Or with a config file: -python3 -m archie.benchmark run config.json # branches must already exist -python3 -m archie.benchmark prep config.json # only create/refresh branches -``` - -If the repo has no Archie files yet, `auto`/`prep` create the branches, then pause -so you can run `/archie-deep-scan` interactively on the treatment branch. That -deep-scan is **never** counted in the measured metrics. - -## Config - -```json -{ - "name": "bedtime-add-sleep-timer", - "repo": "/Users/you/DEV/BedtimeApp", - "task_prompt": "Add a sleep timer feature ...", - "model": "claude-sonnet-4-6", - "repetitions": 3, - "branches": {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"}, - "judge": {"model": "claude-opus-4-8", "rubric": ["correctness", "completeness", "follows_conventions", "no_regressions"]}, - "timeout_seconds": 3600 -} -``` - -## Supabase - -Set `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` in the environment. Without them the -harness writes `.archie/benchmark/<name>/results.json` locally (offline mode). -Apply `archie/benchmark/schema.sql` to the project once. - -## Fairness invariants - -- Identical `task_prompt`, `model`, and harness flags on both arms. -- Both branches descend from the same base commit (enforced). -- Deep-scan prep cost is separate (`prep_cost_usd`), never in sample metrics. -``` - -- [ ] **Step 2: Add a section to `CLAUDE.md`** (under "Commands", after "Tests") - -```markdown -### Benchmark Harness (internal) -```bash -# Measure Archie effectiveness: same task, control (no Archie) vs treatment (full Archie) -python3 -m archie.benchmark auto /path/to/repo --prompt "..."
# prep + run from a plain repo -python3 -m archie.benchmark run config.json # run on existing branches -``` -Internal-only (not shipped via npm). Captures tool calls / tokens / cost / time + -blind judge-Claude quality, writes to Supabase (`benchmark_runs`, `benchmark_samples`). -See `archie/benchmark/README.md`. -``` - -- [ ] **Step 3: Run the full project test suite** - -Run: `python -m pytest tests/ -v` -Expected: PASS (existing tests + all `tests/benchmark/` tests) - -- [ ] **Step 4: Run the sync checker** (benchmark is internal, so it must NOT trip sync) - -Run: `python3 scripts/verify_sync.py` -Expected: PASS — confirms no accidental npm-package coupling - -- [ ] **Step 5: Commit** - -```bash -git add archie/benchmark/README.md CLAUDE.md -git commit -m "docs(benchmark): README + CLAUDE.md usage section" -``` - ---- - -## Self-Review Notes (completed by plan author) - -- **Spec coverage:** §1 purpose → Tasks 5/6/7/10; §4 config → Task 1; §5 data flow → Tasks 2/3/4/5/10; §6 auto/prep → Task 11; §7 judge → Task 6; §8 schema/store → Tasks 8/9; §9 error/fairness/cleanup → Tasks 4/10/11; §10 testing → every task is TDD. All covered. -- **Type consistency:** `SampleMetrics` fields, judge result dict keys (`treatment`/`control`/`overall`), and sample-row keys match across `metrics.py`, `judge.py`, `orchestrator._sample_row`, `aggregate.py`, `store.py`, and `schema.sql`. -- **Out of scope (per spec §11):** website display, end-user shipped mode, headless deep-scan, Agent SDK — none included. -- **Open implementation note:** `prep_cost_usd` stays `None` in v1 (best-effort per spec); a follow-up can read `.archie/telemetry/` after the interactive deep-scan to populate it. 
-``` diff --git a/tests/benchmark/__init__.py b/tests/benchmark/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/benchmark/test_aggregate.py b/tests/benchmark/test_aggregate.py new file mode 100644 index 00000000..81f36b41 --- /dev/null +++ b/tests/benchmark/test_aggregate.py @@ -0,0 +1,74 @@ +from archie.benchmark.aggregate import aggregate_samples + + +def _s(arm, cost, tools, quality, completed=True): + return {"arm": arm, "cost_usd": cost, "tool_calls": tools, + "duration_ms": 1000, "input_tokens": 10, "output_tokens": 20, + "quality_score": quality, "completed": completed} + + +def test_per_arm_means(): + samples = [ + _s("treatment", 1.0, 10, 8.0), + _s("treatment", 3.0, 20, 9.0), + _s("control", 2.0, 30, 6.0), + _s("control", 4.0, 40, 7.0), + ] + agg = aggregate_samples(samples) + assert agg["treatment"]["cost_usd_mean"] == 2.0 + assert agg["treatment"]["tool_calls_mean"] == 15.0 + assert agg["treatment"]["quality_mean"] == 8.5 + assert agg["control"]["cost_usd_mean"] == 3.0 + assert agg["treatment"]["n"] == 2 + assert agg["treatment"]["completed_n"] == 2 + + +def test_savings_percentages(): + samples = [_s("treatment", 1.0, 10, 8.0), _s("control", 2.0, 20, 8.0)] + agg = aggregate_samples(samples) + # treatment cost is 50% lower than control + assert agg["savings"]["cost_pct"] == 50.0 + assert agg["savings"]["tool_calls_pct"] == 50.0 + + +def test_quality_ignores_none_scores(): + samples = [ + _s("treatment", 1.0, 10, None, completed=False), + _s("treatment", 1.0, 10, 8.0), + _s("control", 1.0, 10, 6.0), + ] + agg = aggregate_samples(samples) + assert agg["treatment"]["quality_mean"] == 8.0 # None excluded + assert agg["treatment"]["completed_n"] == 1 + + +def test_handles_empty_arm(): + samples = [_s("treatment", 1.0, 10, 8.0)] + agg = aggregate_samples(samples) + assert agg["control"]["n"] == 0 + assert agg["control"]["cost_usd_mean"] is None + assert agg["savings"]["cost_pct"] is None + + +def 
test_attempted_n_and_quality_excludes_not_attempted(): + samples = [ + {"arm": "treatment", "cost_usd": 1.0, "tool_calls": 9, "duration_ms": 100, + "input_tokens": 1, "output_tokens": 1, "quality_score": 8.0, + "completed": True, "attempted": True}, + {"arm": "control", "cost_usd": 0.5, "tool_calls": 28, "duration_ms": 100, + "input_tokens": 1, "output_tokens": 1, "quality_score": 1.0, + "completed": True, "attempted": False}, + ] + agg = aggregate_samples(samples) + assert agg["treatment"]["attempted_n"] == 1 + assert agg["control"]["attempted_n"] == 0 + # control's q1 came from an empty diff -> excluded from quality_mean + assert agg["treatment"]["quality_mean"] == 8.0 + assert agg["control"]["quality_mean"] is None + + +def test_attempted_defaults_true_for_legacy_samples(): + # samples without an explicit 'attempted' key count as attempted (back-compat) + agg = aggregate_samples([_s("treatment", 1.0, 10, 8.0)]) + assert agg["treatment"]["attempted_n"] == 1 + assert agg["treatment"]["quality_mean"] == 8.0 diff --git a/tests/benchmark/test_config.py b/tests/benchmark/test_config.py new file mode 100644 index 00000000..68bd72fa --- /dev/null +++ b/tests/benchmark/test_config.py @@ -0,0 +1,70 @@ +# tests/benchmark/test_config.py +import json +import pytest +from pathlib import Path +from archie.benchmark.config import parse_config, load_config, BenchmarkConfig + + +def _valid(): + return { + "name": "demo", + "repo": "/tmp/repo", + "task_prompt": "Add a feature", + "model": "claude-sonnet-4-6", + } + + +def test_parse_minimal_applies_defaults(): + cfg = parse_config(_valid()) + assert isinstance(cfg, BenchmarkConfig) + assert cfg.repetitions == 3 + assert cfg.timeout_seconds == 3600 + assert cfg.branches == {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"} + assert cfg.judge.model == "claude-opus-4-8" + assert "correctness" in cfg.judge.rubric + assert isinstance(cfg.repo, Path) + + +def test_parse_overrides(): + data = _valid() + 
data.update({ + "repetitions": 5, + "timeout_seconds": 1200, + "branches": {"treatment": "t", "control": "c"}, + "judge": {"model": "m", "rubric": ["x"]}, + }) + cfg = parse_config(data) + assert cfg.repetitions == 5 + assert cfg.timeout_seconds == 1200 + assert cfg.branches == {"treatment": "t", "control": "c"} + assert cfg.judge.model == "m" + assert cfg.judge.rubric == ["x"] + + +@pytest.mark.parametrize("missing", ["name", "repo", "task_prompt", "model"]) +def test_missing_required_raises(missing): + data = _valid() + del data[missing] + with pytest.raises(ValueError, match="required"): + parse_config(data) + + +def test_repetitions_must_be_positive(): + data = _valid() + data["repetitions"] = 0 + with pytest.raises(ValueError, match="repetitions"): + parse_config(data) + + +def test_branches_missing_arm_raises(): + data = _valid() + data["branches"] = {"treatment": "t"} + with pytest.raises(ValueError, match="control"): + parse_config(data) + + +def test_load_config_reads_file(tmp_path): + p = tmp_path / "c.json" + p.write_text(json.dumps(_valid())) + cfg = load_config(p) + assert cfg.name == "demo" diff --git a/tests/benchmark/test_diff.py b/tests/benchmark/test_diff.py new file mode 100644 index 00000000..7d0891c8 --- /dev/null +++ b/tests/benchmark/test_diff.py @@ -0,0 +1,61 @@ +import subprocess +from archie.benchmark.diff import capture_diff + + +def _git(args, cwd): + subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True) + + +def _init_repo(path): + _git(["init"], path) + _git(["config", "user.email", "t@t.t"], path) + _git(["config", "user.name", "t"], path) + (path / "a.txt").write_text("one\n") + _git(["add", "-A"], path) + _git(["commit", "-m", "init"], path) + + +def test_captures_modified_and_untracked(tmp_path): + _init_repo(tmp_path) + (tmp_path / "a.txt").write_text("one\ntwo\n") # modified, tracked + (tmp_path / "b.txt").write_text("new file\n") # untracked + diff = capture_diff(tmp_path) + assert "a.txt" in diff + 
assert "two" in diff + assert "b.txt" in diff + assert "new file" in diff + + +def test_empty_diff_when_no_changes(tmp_path): + _init_repo(tmp_path) + diff = capture_diff(tmp_path) + assert diff.strip() == "" + + +def test_excludes_build_and_cache_noise(tmp_path): + _init_repo(tmp_path) + (tmp_path / "calc.py").write_text("def f():\n return 1\n") # real change + # universal build/cache noise (no .gitignore in this repo): + pyc_dir = tmp_path / "__pycache__" + pyc_dir.mkdir() + (pyc_dir / "calc.cpython-311.pyc").write_text("BYTECODE") + (tmp_path / ".DS_Store").write_text("junk") + nm = tmp_path / "node_modules" / "left-pad" + nm.mkdir(parents=True) + (nm / "index.js").write_text("module.exports = 1\n") + + diff = capture_diff(tmp_path) + # real source change is present + assert "calc.py" in diff + # noise is excluded + assert "__pycache__" not in diff + assert "calc.cpython-311.pyc" not in diff + assert ".DS_Store" not in diff + assert "node_modules" not in diff + + +def test_still_includes_plain_untracked(tmp_path): + _init_repo(tmp_path) + (tmp_path / "b.txt").write_text("new file\n") + diff = capture_diff(tmp_path) + assert "b.txt" in diff diff --git a/tests/benchmark/test_isolation.py b/tests/benchmark/test_isolation.py new file mode 100644 index 00000000..7fff32b7 --- /dev/null +++ b/tests/benchmark/test_isolation.py @@ -0,0 +1,49 @@ +# tests/benchmark/test_isolation.py +import subprocess +from pathlib import Path +from archie.benchmark.isolation import worktree, prune + + +def _git(args, cwd): + subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True) + + +def _init_repo(path): + _git(["init"], path) + _git(["config", "user.email", "t@t.t"], path) + _git(["config", "user.name", "t"], path) + (path / "a.txt").write_text("one\n") + _git(["add", "-A"], path) + _git(["commit", "-m", "init"], path) + _git(["branch", "feature"], path) + + +def test_worktree_created_and_removed(tmp_path): + repo = tmp_path / "repo" + repo.mkdir() + 
_init_repo(repo) + dest = tmp_path / "wt" + with worktree(repo, "feature", dest) as wt: + assert Path(wt).exists() + assert (Path(wt) / "a.txt").exists() + assert not Path(dest).exists() + + +def test_worktree_removed_on_exception(tmp_path): + repo = tmp_path / "repo" + repo.mkdir() + _init_repo(repo) + dest = tmp_path / "wt" + try: + with worktree(repo, "feature", dest): + raise RuntimeError("boom") + except RuntimeError: + pass + assert not Path(dest).exists() + + +def test_prune_runs_without_error(tmp_path): + repo = tmp_path / "repo" + repo.mkdir() + _init_repo(repo) + prune(repo) # must not raise diff --git a/tests/benchmark/test_judge.py b/tests/benchmark/test_judge.py new file mode 100644 index 00000000..4351ee7f --- /dev/null +++ b/tests/benchmark/test_judge.py @@ -0,0 +1,68 @@ +# tests/benchmark/test_judge.py +import json +import pytest +from archie.benchmark import judge + + +def test_assign_order_is_seed_deterministic(): + assert judge.assign_order(0) == ("a", "b") + assert judge.assign_order(2) == ("a", "b") + assert judge.assign_order(1) == ("b", "a") + assert judge.assign_order(3) == ("b", "a") + + +def test_parse_judge_output_extracts_embedded_json(): + text = 'Here is my verdict:\n{"variant_a": {"overall": 8}, "variant_b": {"overall": 5}}\nThanks' + parsed = judge.parse_judge_output(text) + assert parsed["variant_a"]["overall"] == 8 + + +def test_parse_judge_output_raises_without_json(): + with pytest.raises(ValueError, match="JSON"): + judge.parse_judge_output("no json here") + + +def test_run_judge_maps_variants_to_arms_seed_even(): + # seed even -> treatment is variant_a + payload = json.dumps({"variant_a": {"overall": 9}, "variant_b": {"overall": 4}}) + calls = [] + + def fake_runner(prompt, model, timeout): + calls.append((prompt, model)) + return payload + + result = judge.run_judge("task", "TREAT_DIFF", "CTRL_DIFF", + rubric=["correctness"], model="m", seed=0, + _runner=fake_runner) + assert result["treatment"]["overall"] == 9 + assert 
result["control"]["overall"] == 4 + assert result["seed"] == 0 + # variant A diff (shown first) must be the treatment diff for an even seed + assert calls[0][0].index("TREAT_DIFF") < calls[0][0].index("CTRL_DIFF") + + +def test_run_judge_maps_variants_to_arms_seed_odd(): + # seed odd -> treatment is variant_b + payload = json.dumps({"variant_a": {"overall": 3}, "variant_b": {"overall": 7}}) + result = judge.run_judge("task", "TREAT_DIFF", "CTRL_DIFF", + rubric=["correctness"], model="m", seed=1, + _runner=lambda p, m, t: payload) + assert result["treatment"]["overall"] == 7 + assert result["control"]["overall"] == 3 + + +def test_run_judge_retries_once_on_bad_json(): + outputs = ["garbage", json.dumps({"variant_a": {"overall": 6}, "variant_b": {"overall": 6}})] + + def flaky(prompt, model, timeout): + return outputs.pop(0) + + result = judge.run_judge("task", "A", "B", rubric=["c"], model="m", seed=0, _runner=flaky) + assert result["treatment"]["overall"] == 6 + assert outputs == [] # both outputs consumed -> retried exactly once + + +def test_run_judge_raises_after_two_failures(): + with pytest.raises(ValueError): + judge.run_judge("task", "A", "B", rubric=["c"], model="m", seed=0, + _runner=lambda p, m, t: "still garbage") diff --git a/tests/benchmark/test_metrics.py b/tests/benchmark/test_metrics.py new file mode 100644 index 00000000..5362be11 --- /dev/null +++ b/tests/benchmark/test_metrics.py @@ -0,0 +1,74 @@ +# tests/benchmark/test_metrics.py +import json +from archie.benchmark.metrics import parse_stream, SampleMetrics + + +def _assistant(blocks): + return json.dumps({"type": "assistant", "message": {"content": blocks}}) + + +def _tool_use(name): + return {"type": "tool_use", "name": name, "id": "x", "input": {}} + + +def _result(subtype="success"): + return json.dumps({ + "type": "result", + "subtype": subtype, + "total_cost_usd": 0.1234, + "duration_ms": 5000, + "num_turns": 7, + "usage": { + "input_tokens": 100, + "output_tokens": 200, + 
"cache_read_input_tokens": 50, + "cache_creation_input_tokens": 25, + }, + }) + + +def test_counts_tools_and_breakdown(): + lines = [ + json.dumps({"type": "system", "subtype": "init"}), + _assistant([{"type": "text", "text": "hi"}, _tool_use("Read")]), + _assistant([_tool_use("Edit"), _tool_use("Edit")]), + _result(), + ] + m = parse_stream(lines) + assert m.tool_calls == 3 + assert m.tool_breakdown == {"Read": 1, "Edit": 2} + + +def test_extracts_result_fields(): + m = parse_stream([_result()]) + assert m.input_tokens == 100 + assert m.output_tokens == 200 + assert m.cache_read_tokens == 50 + assert m.cache_creation_tokens == 25 + assert m.cost_usd == 0.1234 + assert m.duration_ms == 5000 + assert m.num_turns == 7 + assert m.completed is True + + +def test_error_result_not_completed(): + m = parse_stream([_result(subtype="error_max_turns")]) + assert m.completed is False + + +def test_zero_tool_run(): + m = parse_stream([_assistant([{"type": "text", "text": "done"}]), _result()]) + assert m.tool_calls == 0 + assert m.tool_breakdown == {} + + +def test_ignores_blank_and_malformed_lines(): + m = parse_stream(["", " ", "not json", _result()]) + assert m.completed is True + + +def test_no_result_event_defaults(): + m = parse_stream([_assistant([_tool_use("Bash")])]) + assert m.tool_calls == 1 + assert m.completed is False + assert m.cost_usd == 0.0 diff --git a/tests/benchmark/test_orchestrator.py b/tests/benchmark/test_orchestrator.py new file mode 100644 index 00000000..1beb3a12 --- /dev/null +++ b/tests/benchmark/test_orchestrator.py @@ -0,0 +1,214 @@ +# tests/benchmark/test_orchestrator.py +import subprocess +import pytest +from archie.benchmark.config import BenchmarkConfig, JudgeConfig +from archie.benchmark.metrics import SampleMetrics +from archie.benchmark import orchestrator + + +def _cfg(tmp_path, reps=2): + return BenchmarkConfig( + name="demo", repo=tmp_path, task_prompt="do it", + model="m", branches={"treatment": "treatment", "control": "control"}, + 
repetitions=reps, judge=JudgeConfig(model="jm", rubric=["correctness"]), + timeout_seconds=60, + ) + + +def _fake_run(metrics_by_branch): + seen = {"calls": []} + + def run_fn(prompt, model, cwd, timeout): + # branch name is encoded in the worktree path by the orchestrator + branch = "treatment" if "treatment" in str(cwd) else "control" + seen["calls"].append((branch, prompt, model)) + return metrics_by_branch[branch], "raw" + + return run_fn, seen + + +def test_run_benchmark_builds_matrix_and_aggregates(tmp_path, monkeypatch): + # neutralize real worktree/diff/base-commit side effects + monkeypatch.setattr(orchestrator, "_base_commit", lambda repo: "abc123") + monkeypatch.setattr(orchestrator, "_merge_base", lambda repo, a, b: "abc123") + + import contextlib + @contextlib.contextmanager + def fake_worktree(repo, branch, dest): + yield tmp_path / ("wt-" + branch) + monkeypatch.setattr(orchestrator, "worktree", fake_worktree) + monkeypatch.setattr(orchestrator, "prune", lambda repo: None) + + t_metrics = SampleMetrics(tool_calls=5, cost_usd=1.0, duration_ms=100, + input_tokens=10, output_tokens=20, completed=True) + c_metrics = SampleMetrics(tool_calls=12, cost_usd=3.0, duration_ms=300, + input_tokens=30, output_tokens=40, completed=True) + run_fn, seen = _fake_run({"treatment": t_metrics, "control": c_metrics}) + + judged = {"calls": 0} + def judge_fn(task, t_diff, c_diff, rubric, model, seed): + judged["calls"] += 1 + return {"treatment": {"overall": 9.0}, "control": {"overall": 5.0}, "seed": seed} + + stored = {} + def store_fn(run_row, sample_rows, offline_path): + stored["run"] = run_row + stored["samples"] = sample_rows + return {"mode": "offline", "path": str(offline_path)} + + result = orchestrator.run_benchmark( + _cfg(tmp_path, reps=2), + run_fn=run_fn, judge_fn=judge_fn, store_fn=store_fn, + diff_fn=lambda wt: f"diff:{wt}", + ) + + # 2 reps x 2 arms = 4 runs; 2 reps = 2 pairwise judge calls + assert len(seen["calls"]) == 4 + assert judged["calls"] == 2 + 
assert len(stored["samples"]) == 4
+    # quality assigned per arm
+    t_samples = [s for s in stored["samples"] if s["arm"] == "treatment"]
+    assert all(s["quality_score"] == 9.0 for s in t_samples)
+    # aggregate shows treatment cheaper
+    assert result["aggregate"]["savings"]["cost_pct"] > 0
+    # prompt identical across all runs
+    assert len({c[1] for c in seen["calls"]}) == 1
+
+
+def test_fairness_guard_rejects_unrelated_branches(tmp_path, monkeypatch):
+    # No common ancestor -> git merge-base exits non-zero.
+    def no_common_ancestor(repo, a, b):
+        raise subprocess.CalledProcessError(128, ["git", "merge-base", a, b])
+    monkeypatch.setattr(orchestrator, "_merge_base", no_common_ancestor)
+    with pytest.raises(ValueError, match="common ancestor"):
+        orchestrator.run_benchmark(_cfg(tmp_path), run_fn=lambda *a: None,
+                                   judge_fn=lambda *a, **k: None,
+                                   store_fn=lambda *a: None, diff_fn=lambda w: "")
+
+
+def test_failed_sample_does_not_sink_run(tmp_path, monkeypatch):
+    monkeypatch.setattr(orchestrator, "_merge_base", lambda repo, a, b: "same")
+    monkeypatch.setattr(orchestrator, "_base_commit", lambda repo: "base")
+    import contextlib
+    @contextlib.contextmanager
+    def fake_worktree(repo, branch, dest):
+        yield tmp_path / ("wt-" + branch)
+    monkeypatch.setattr(orchestrator, "worktree", fake_worktree)
+    monkeypatch.setattr(orchestrator, "prune", lambda repo: None)
+
+    def run_fn(prompt, model, cwd, timeout):
+        if "treatment" in str(cwd):
+            raise RuntimeError("treatment crashed")
+        return SampleMetrics(completed=True, cost_usd=2.0), "raw"
+
+    stored = {}
+    result = orchestrator.run_benchmark(
+        _cfg(tmp_path, reps=1),
+        run_fn=run_fn,
+        judge_fn=lambda *a, **k: {"treatment": {"overall": 0}, "control": {"overall": 5}, "seed": 0},
+        store_fn=lambda r, s, p: stored.update(samples=s) or {"mode": "offline"},
+        diff_fn=lambda wt: "",
+    )
+    # treatment sample recorded as not-completed; control still present
+    arms = {s["arm"]: s for s in stored["samples"]}
+    assert arms["treatment"]["completed"] is False
+    assert arms["control"]["completed"] is True
+
+
+def test_judge_failure_does_not_sink_run(tmp_path, monkeypatch):
+    monkeypatch.setattr(orchestrator, "_merge_base", lambda repo, a, b: "same")
+    monkeypatch.setattr(orchestrator, "_base_commit", lambda repo: "base")
+    import contextlib
+    @contextlib.contextmanager
+    def fake_worktree(repo, branch, dest):
+        yield tmp_path / ("wt-" + branch)
+    monkeypatch.setattr(orchestrator, "worktree", fake_worktree)
+    monkeypatch.setattr(orchestrator, "prune", lambda repo: None)
+
+    def boom(*a, **k):
+        raise ValueError("judge returned unparseable output twice")
+
+    stored = {}
+    result = orchestrator.run_benchmark(
+        _cfg(tmp_path, reps=1),
+        run_fn=lambda p, m, cwd, t: (SampleMetrics(completed=True, cost_usd=1.0), "raw"),
+        judge_fn=boom,
+        store_fn=lambda r, s, p: stored.update(samples=s) or {"mode": "offline"},
+        diff_fn=lambda wt: "",
+    )
+    # both samples still recorded and stored, with no quality score
+    assert len(stored["samples"]) == 2
+    assert all(s["quality_score"] is None for s in stored["samples"])
+    assert all(s["quality_detail"] is None for s in stored["samples"])
+    # the run still produced an aggregate
+    assert result["aggregate"]["treatment"]["n"] == 1
+
+
+def _git(args, cwd):
+    subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True)
+
+
+def test_prepared_branches_pass_fairness_guard(tmp_path):
+    # Regression: prepare_branches adds a strip commit to the control branch, so
+    # the two branch TIPS differ. The fairness guard must compare the merge-base
+    # (common ancestor), not the tips, and therefore must NOT raise here.
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    _git(["init"], repo)
+    _git(["config", "user.email", "t@t.t"], repo)
+    _git(["config", "user.name", "t"], repo)
+    (repo / "CLAUDE.md").write_text("# conventions\n")
+    (repo / "calc.py").write_text("def add(a, b):\n return a + b\n")
+    _git(["add", "-A"], repo)
+    _git(["commit", "-m", "init"], repo)
+
+    cfg = BenchmarkConfig(
+        name="reg", repo=repo, task_prompt="do it", model="m",
+        branches={"treatment": "archie-bench/with-archie",
+                  "control": "archie-bench/no-archie"},
+        repetitions=1, judge=JudgeConfig(), timeout_seconds=60,
+    )
+    status = orchestrator.prepare_branches(cfg)
+    assert status["archie_present"] is True  # control got an extra strip commit
+
+    # Real worktree creation; only run_fn/judge_fn/store_fn/diff_fn are stubbed.
+    result = orchestrator.run_benchmark(
+        cfg,
+        run_fn=lambda p, m, cwd, t: (SampleMetrics(completed=True, cost_usd=1.0), "raw"),
+        judge_fn=lambda *a, **k: {"treatment": {"overall": 8},
+                                  "control": {"overall": 6}, "seed": 0},
+        store_fn=lambda r, s, p: {"mode": "offline", "run": r},
+        diff_fn=lambda wt: "diff",
+    )
+    assert result["aggregate"]["treatment"]["n"] == 1
+    assert result["aggregate"]["control"]["n"] == 1
+    # git_base_commit is the shared merge-base (a real 40-char sha)
+    assert len(result["run"]["git_base_commit"]) == 40
+
+
+def test_empty_diff_marks_not_attempted(tmp_path, monkeypatch):
+    monkeypatch.setattr(orchestrator, "_merge_base", lambda repo, a, b: "same")
+    monkeypatch.setattr(orchestrator, "_base_commit", lambda repo: "base")
+    import contextlib
+    @contextlib.contextmanager
+    def fake_worktree(repo, branch, dest):
+        yield tmp_path / ("wt-" + branch)
+    monkeypatch.setattr(orchestrator, "worktree", fake_worktree)
+    monkeypatch.setattr(orchestrator, "prune", lambda repo: None)
+
+    # treatment produces a diff; control produces an empty (whitespace-only) diff
+    def diff_fn(wt):
+        return "real change\n" if "treatment" in str(wt) else " \n"
+
+    stored = {}
+    orchestrator.run_benchmark(
+        _cfg(tmp_path, reps=1),
+        run_fn=lambda p, m, cwd, t: (SampleMetrics(completed=True, cost_usd=1.0), "raw"),
+        judge_fn=lambda *a, **k: {"treatment": {"overall": 7},
+                                  "control": {"overall": 1}, "seed": 0},
+        store_fn=lambda r, s, p: stored.update(samples=s),
+        diff_fn=diff_fn,
+    )
+    arms = {s["arm"]: s for s in stored["samples"]}
+    assert arms["treatment"]["attempted"] is True
+    assert arms["control"]["attempted"] is False
diff --git a/tests/benchmark/test_prepare.py b/tests/benchmark/test_prepare.py
new file mode 100644
index 00000000..c6333ebe
--- /dev/null
+++ b/tests/benchmark/test_prepare.py
@@ -0,0 +1,115 @@
+# tests/benchmark/test_prepare.py
+import subprocess
+import pytest
+from archie.benchmark.config import BenchmarkConfig, JudgeConfig
+from archie.benchmark import orchestrator as orch
+
+
+def _git(args, cwd):
+    subprocess.run(["git", *args], cwd=cwd, check=True, capture_output=True, text=True)
+
+
+def _repo(tmp_path, with_archie):
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    _git(["init"], repo)
+    _git(["config", "user.email", "t@t.t"], repo)
+    _git(["config", "user.name", "t"], repo)
+    (repo / "src.py").write_text("print('hi')\n")
+    if with_archie:
+        (repo / "CLAUDE.md").write_text("# context\n")
+        (repo / ".claude").mkdir()
+        (repo / ".claude" / "settings.json").write_text("{}\n")
+    _git(["add", "-A"], repo)
+    _git(["commit", "-m", "init"], repo)
+    return repo
+
+
+def _cfg(repo):
+    return BenchmarkConfig(name="d", repo=repo, task_prompt="x", model="m",
+                           branches={"treatment": "archie-bench/with-archie",
+                                     "control": "archie-bench/no-archie"},
+                           repetitions=1, judge=JudgeConfig(), timeout_seconds=60)
+
+
+def test_clean_tree_required(tmp_path):
+    repo = _repo(tmp_path, with_archie=True)
+    (repo / "dirty.txt").write_text("uncommitted\n")
+    with pytest.raises(ValueError, match="clean"):
+        orch.prepare_branches(_cfg(repo))
+
+
+def test_control_branch_strips_archie_files(tmp_path):
+    repo = _repo(tmp_path, with_archie=True)
+    status = orch.prepare_branches(_cfg(repo))
+    # control branch checked out: Archie files gone
+    _git(["checkout", "archie-bench/no-archie"], repo)
+    assert not (repo / "CLAUDE.md").exists()
+    assert not (repo / ".claude").exists()
+    assert (repo / "src.py").exists()
+    assert status["archie_present"] is True
+    assert status["needs_deep_scan"] is False
+
+
+def test_control_strips_nested_claude_md_with_spaces_in_path(tmp_path):
+    # Regression: directory names with spaces (e.g. Xcode asset catalogs like
+    # "Button icons/") must not survive the strip. The old impl split git
+    # ls-files output on all whitespace, fragmenting space-containing paths so
+    # `git rm` silently skipped them — leaking Archie context onto the control arm.
+    repo = _repo(tmp_path, with_archie=True)
+    nested_dir = repo / "Assets" / "Button icons"
+    nested_dir.mkdir(parents=True)
+    (nested_dir / "CLAUDE.md").write_text("# Button icons\n\n")
+    (nested_dir / "icon.txt").write_text("keep me\n")  # non-Archie file must survive
+    _git(["add", "-A"], repo)
+    _git(["commit", "-m", "add nested archie context in spaced dir"], repo)
+
+    orch.prepare_branches(_cfg(repo))
+
+    leftover = subprocess.run(
+        ["git", "ls-tree", "-r", "--name-only", "archie-bench/no-archie"],
+        cwd=repo, check=True, capture_output=True, text=True).stdout.splitlines()
+    assert not any(p.endswith("CLAUDE.md") for p in leftover), \
+        f"control arm still has CLAUDE.md files: {[p for p in leftover if p.endswith('CLAUDE.md')]}"
+    assert "Assets/Button icons/icon.txt" in leftover  # real content preserved
+
+
+def test_treatment_keeps_archie_files(tmp_path):
+    repo = _repo(tmp_path, with_archie=True)
+    orch.prepare_branches(_cfg(repo))
+    _git(["checkout", "archie-bench/with-archie"], repo)
+    assert (repo / "CLAUDE.md").exists()
+
+
+def test_no_archie_flags_deep_scan_needed(tmp_path):
+    repo = _repo(tmp_path, with_archie=False)
+    status = orch.prepare_branches(_cfg(repo))
+    assert status["archie_present"] is False
+    assert status["needs_deep_scan"] is True
+
+
+def test_cli_run_invokes_benchmark(tmp_path, monkeypatch):
+    import json
+    from archie.benchmark import cli
+    repo = _repo(tmp_path, with_archie=True)
+    cfg_file = tmp_path / "c.json"
+    cfg_file.write_text(json.dumps({
+        "name": "d", "repo": str(repo), "task_prompt": "x", "model": "m",
+        "branches": {"treatment": "archie-bench/with-archie", "control": "archie-bench/no-archie"},
+    }))
+    called = {}
+    monkeypatch.setattr(cli, "run_benchmark",
+                        lambda cfg: called.update(ran=True) or {
+                            "aggregate": {"treatment": {"n": 0, "completed_n": 0,
+                                                        "attempted_n": 0,
+                                                        "cost_usd_mean": None, "tool_calls_mean": None,
+                                                        "duration_ms_mean": None, "quality_mean": None},
+                                          "control": {"n": 0, "completed_n": 0, "attempted_n": 0,
+                                                      "cost_usd_mean": None,
+                                                      "tool_calls_mean": None, "duration_ms_mean": None,
+                                                      "quality_mean": None},
+                                          "savings": {"cost_pct": None, "tool_calls_pct": None,
+                                                      "duration_pct": None}},
+                            "store": {"mode": "offline"}})
+    cli.main(["run", str(cfg_file)])
+    assert called["ran"] is True
diff --git a/tests/benchmark/test_runner.py b/tests/benchmark/test_runner.py
new file mode 100644
index 00000000..f48f339c
--- /dev/null
+++ b/tests/benchmark/test_runner.py
@@ -0,0 +1,69 @@
+# tests/benchmark/test_runner.py
+import json
+import subprocess
+import pytest
+from archie.benchmark import runner
+
+
+def _stream():
+    return "\n".join([
+        json.dumps({"type": "assistant", "message": {"content": [{"type": "tool_use", "name": "Edit"}]}}),
+        json.dumps({"type": "result", "subtype": "success", "total_cost_usd": 0.5,
+                    "duration_ms": 1000, "num_turns": 2,
+                    "usage": {"input_tokens": 10, "output_tokens": 20}}),
+    ])
+
+
+def test_run_claude_parses_metrics(monkeypatch):
+    captured = {}
+
+    def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None):
+        captured["cmd"] = cmd
+        captured["cwd"] = cwd
+        return subprocess.CompletedProcess(cmd, 0, stdout=_stream(), stderr="")
+
+    monkeypatch.setattr(runner.subprocess, "run", fake_run)
+    metrics, raw = runner.run_claude("do it", "claude-sonnet-4-6", "/tmp/wt", 60)
+
+    assert metrics.tool_calls == 1
+    assert metrics.cost_usd == 0.5
+    assert metrics.completed is True
+    assert captured["cwd"] == "/tmp/wt"
+    # identical, fair harness flags must always be present
+    assert captured["cmd"][:2] == ["claude", "-p"]
+    assert "--permission-mode" in captured["cmd"]
+    assert "acceptEdits" in captured["cmd"]
+    assert "stream-json" in captured["cmd"]
+    assert "--model" in captured["cmd"] and "claude-sonnet-4-6" in captured["cmd"]
+
+
+def test_prompt_wraps_task_with_autonomy_framing(monkeypatch):
+    # Headless `claude -p` has no human to approve/answer. Without an explicit
+    # autonomy directive the agent obeys global "describe approach and wait for
+    # approval / ask clarifying questions" rules and stops without editing —
+    # producing an empty diff on both arms. The wrapper must override that.
+    captured = {}
+
+    def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None):
+        captured["cmd"] = cmd
+        return subprocess.CompletedProcess(cmd, 0, stdout=_stream(), stderr="")
+
+    monkeypatch.setattr(runner.subprocess, "run", fake_run)
+    runner.run_claude("Add a sleep timer feature", "m", "/tmp/wt", 60)
+
+    sent = captured["cmd"][2]
+    assert "Add a sleep timer feature" in sent  # original task preserved verbatim
+    low = sent.lower()
+    assert "autonomous" in low  # framed as autonomous
+    assert "do not ask" in low  # overrides clarifying-questions rule
+    assert "do not stop" in low  # overrides wait-for-approval rule
+
+
+def test_timeout_marks_incomplete(monkeypatch):
+    def fake_run(cmd, cwd=None, capture_output=None, text=None, timeout=None):
+        raise subprocess.TimeoutExpired(cmd, timeout, output=_stream())
+
+    monkeypatch.setattr(runner.subprocess, "run", fake_run)
+    metrics, raw = runner.run_claude("do it", "m", "/tmp/wt", 1)
+    assert metrics.completed is False
+    assert metrics.tool_calls == 1  # partial stdout still parsed
diff --git a/tests/benchmark/test_schema.py b/tests/benchmark/test_schema.py
new file mode 100644
index 00000000..c109f105
--- /dev/null
+++ b/tests/benchmark/test_schema.py
@@ -0,0 +1,18 @@
+# tests/benchmark/test_schema.py
+from pathlib import Path
+
+SQL = Path(__file__).parent.parent.parent / "archie" / "benchmark" / "schema.sql"
+
+
+def test_schema_defines_both_tables_and_view():
+    text = SQL.read_text()
+    assert "create table" in text.lower()
+    assert "benchmark_runs" in text
+    assert "benchmark_samples" in text
+    assert "benchmark_summary" in text
+    # key sample columns referenced by store.py / aggregate.py exist
+    for col in ["tool_calls", "tool_breakdown", "cost_usd", "quality_score",
+                "cache_read_tokens", "judge_seed", "completed", "attempted", "arm"]:
+        assert col in text
+    # prep cost lives on the run, separate from measured samples
+    assert "prep_cost_usd" in text
diff --git a/tests/benchmark/test_store.py b/tests/benchmark/test_store.py
new file mode 100644
index 00000000..e266fd49
--- /dev/null
+++ b/tests/benchmark/test_store.py
@@ -0,0 +1,35 @@
+# tests/benchmark/test_store.py
+import json
+from archie.benchmark import store
+
+
+def test_offline_fallback_when_env_missing(tmp_path, monkeypatch):
+    monkeypatch.delenv("SUPABASE_URL", raising=False)
+    monkeypatch.delenv("SUPABASE_SERVICE_KEY", raising=False)
+    out = tmp_path / "nested" / "results.json"
+    res = store.store_results({"name": "x"}, [{"arm": "treatment"}], out)
+    assert res["mode"] == "offline"
+    saved = json.loads(out.read_text())
+    assert saved["run"]["name"] == "x"
+    assert saved["samples"][0]["arm"] == "treatment"
+
+
+def test_online_write_posts_run_then_samples(tmp_path, monkeypatch):
+    monkeypatch.setenv("SUPABASE_URL", "https://x.supabase.co")
+    monkeypatch.setenv("SUPABASE_SERVICE_KEY", "secret")
+    calls = []
+
+    def fake_poster(url, key, table, rows):
+        calls.append((table, rows))
+        if table == "benchmark_runs":
+            return [{"id": "run-123"}]
+        return rows
+
+    res = store.store_results({"name": "x"}, [{"arm": "treatment"}, {"arm": "control"}],
+                              tmp_path / "r.json", _poster=fake_poster)
+    assert res["mode"] == "online"
+    assert res["run_id"] == "run-123"
+    assert calls[0][0] == "benchmark_runs"
+    assert calls[1][0] == "benchmark_samples"
+    # run_id stamped onto every sample row
+    assert all(r["run_id"] == "run-123" for r in calls[1][1])