Skip to content

Commit 09760ca

Browse files
committed
Add a helper script to bisect a regression
1 parent 1b05216 commit 09760ca

5 files changed

Lines changed: 270 additions & 2 deletions

File tree

bench_runner/__main__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
COMMANDS = {
99
"backfill": "Schedule benchmarking a number of commits",
10+
"bisect": "Run a bisect to find the commit that caused a regression",
1011
"compare": "Compare a matrix of specific results",
1112
"find_failures": "Find the benchmarks that failed in the last weekly run",
1213
"generate_results": "Create all of the derived artifacts from raw data",

bench_runner/git.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,12 @@ def clone(
168168
if depth is not None:
169169
args += ["--depth", str(depth)]
170170
subprocess.check_call(args)
171+
172+
173+
def checkout(dirname: PathLike, ref: str) -> None:
    """Check out *ref* in the existing git working tree at *dirname*."""
    subprocess.check_call(["git", "checkout", ref], cwd=Path(dirname))

bench_runner/scripts/bisect.py

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
import argparse
2+
import contextlib
3+
from pathlib import Path
4+
import subprocess
5+
import sys
6+
import traceback
7+
8+
9+
import numpy as np
10+
import rich_argparse
11+
12+
13+
from bench_runner import flags as mflags
14+
from bench_runner import git
15+
from bench_runner import result
16+
from bench_runner.scripts import run_benchmarks
17+
from bench_runner.scripts import workflow
18+
from bench_runner.util import PathLike, format_seconds
19+
20+
21+
def _get_result_commandline(
22+
benchmark: str,
23+
good_val: float,
24+
bad_val: float,
25+
pgo: bool,
26+
flags: str,
27+
repo: PathLike,
28+
) -> list[str]:
29+
repo = Path(repo)
30+
31+
return [
32+
sys.executable,
33+
"-m",
34+
"bench_runner.scripts.bisect",
35+
benchmark,
36+
str(good_val),
37+
str(bad_val),
38+
str(pgo),
39+
str(flags),
40+
str(repo.absolute()),
41+
]
42+
43+
44+
def parse_result(benchmark_json: PathLike) -> float:
    """Return a single timing (seconds) summarizing *benchmark_json*.

    The benchmark name recorded inside the JSON file may differ from the
    name used to select the benchmark, so we simply average the means of
    every benchmark present in the file.
    """
    parsed = result.Result.from_arbitrary_filename(benchmark_json)
    means = [series.mean() for series in parsed.get_timing_data().values()]
    return float(np.mean(means))
51+
52+
53+
def get_result(
    benchmark: str,
    pgo: bool = False,
    flags: str = "",
    cpython: PathLike = Path("cpython"),
    reconfigure: bool = False,
) -> float:
    """Build the CPython checkout in place, run *benchmark* on it, and
    return the mean timing parsed from the benchmark JSON output."""
    cpython = Path(cpython)

    # Jumping around through commits with PGO can leave stale PGO data
    # around, so it must be cleaned each time.  Cleaning also happens the
    # first time through (reconfigure=True) in case the *last* bisect run
    # used pgo.
    if pgo or reconfigure:
        subprocess.run(["make", "clean"], cwd=cpython)

    workflow.compile_unix(cpython, mflags.parse_flags(flags), pgo, False, reconfigure)
    run_benchmarks.run_benchmarks(cpython / "python", benchmark)
    return parse_result(run_benchmarks.BENCHMARK_JSON)
73+
74+
75+
def get_log_file() -> Path:
    """Path of the human-readable bisect log, relative to the current directory."""
    return Path("bisect_log.txt")
77+
78+
79+
def delete_log() -> None:
    """Remove the bisect log if it exists; a no-op otherwise."""
    # missing_ok collapses the is_file()/unlink() check-then-act pair (and
    # its race window) into a single call.
    get_log_file().unlink(missing_ok=True)
83+
84+
85+
def show_log() -> None:
    """Print the accumulated bisect log to stdout."""
    bisect_log = get_log_file()

    print()
    print("Bisect log:")

    # The log may not exist yet if something failed before the first
    # measurement was logged.  show_log() runs inside a `finally:` in
    # _main(), so raising FileNotFoundError here would mask the original
    # error -- just show nothing instead.
    if not bisect_log.is_file():
        return

    with bisect_log.open("r") as f:
        # Iterate the file directly rather than materializing readlines().
        for line in f:
            print(line.strip())
94+
95+
96+
def log(message: str) -> None:
    """Append *message* as a single line to the bisect log file."""
    with get_log_file().open("a") as log_file:
        print(message, file=log_file)
99+
100+
101+
def _main(
    benchmark: str,
    good: str,
    bad: str,
    pgo: bool = False,
    flags: str = "",
    repo: PathLike = Path("."),
    cpython: PathLike = Path("cpython"),
):
    """Drive `git bisect` to find the first commit that regressed *benchmark*.

    Measures the known-good and known-bad commits first, then lets
    `git bisect run` re-invoke this module (via _get_result_commandline) on
    each candidate commit between them.
    """
    repo = Path(repo).absolute()
    cpython = Path(cpython).absolute()

    delete_log()

    # A full-history clone (depth=None) is required: the bisect may land on
    # any commit in the good..bad range.
    if not cpython.is_dir():
        git.clone(
            cpython, "https://github.com/python/cpython.git", branch="main", depth=None
        )

    # Measure both endpoints up front; each candidate commit is later
    # classified by which endpoint timing it is closer to.  The first build
    # passes reconfigure=True so ./configure runs at least once.
    git.checkout(cpython, good)
    good_timing = get_result(benchmark, pgo, flags, cpython=cpython, reconfigure=True)
    log(f"KNOWN GOOD ({good[:7]}): {format_seconds(good_timing)}")

    git.checkout(cpython, bad)
    bad_timing = get_result(benchmark, pgo, flags, cpython=cpython)
    log(f"KNOWN BAD ({bad[:7]}): {format_seconds(bad_timing)}")

    try:
        with contextlib.chdir(cpython):
            subprocess.run(["git", "bisect", "start"])
            subprocess.run(["git", "bisect", "bad", bad])
            subprocess.run(["git", "bisect", "good", good])
            # `git bisect run` re-executes this module as a script; the
            # script's exit status classifies each commit (see __main__).
            subprocess.run(
                ["git", "bisect", "run"]
                + _get_result_commandline(
                    benchmark, good_timing, bad_timing, pgo, flags, repo
                )
            )
    finally:
        # Always show and then clear the log, even if the bisect blew up.
        show_log()
        delete_log()
142+
143+
144+
def main():
    # This is the entry point for the user

    parser = argparse.ArgumentParser(
        description="""
        Run bisect on a benchmark to find the first regressing commit.

        A full checkout of CPython should be in the cpython directory.
        If it doesn't exist, it will be cloned.
        """,
        formatter_class=rich_argparse.ArgumentDefaultsRichHelpFormatter,
    )
    parser.add_argument(
        "benchmark",
        type=str,
        help="The benchmark to run bisect on.",
    )
    parser.add_argument(
        "good",
        type=str,
        help="The good commit hash for the bisect.",
    )
    parser.add_argument(
        "bad",
        type=str,
        help="The bad commit hash for the bisect.",
    )
    # Build with profile-guided optimization (get_result cleans stale PGO
    # data between commits when this is set).
    parser.add_argument(
        "--pgo",
        action="store_true",
    )
    # Extra build flags; parsed downstream by bench_runner.flags.parse_flags.
    parser.add_argument(
        "--flags",
        type=str,
        default="",
    )

    args = parser.parse_args()

    _main(args.benchmark, args.good, args.bad, args.pgo, args.flags)
184+
185+
186+
if __name__ == "__main__":
    # This is the entry point when we are called from `git bisect run` itself
    # (argv layout must match _get_result_commandline).

    parser = argparse.ArgumentParser()
    parser.add_argument("benchmark", type=str)
    parser.add_argument("good_val", type=float)
    parser.add_argument("bad_val", type=float)
    # pgo arrives as the string "True"/"False" (str() of a bool upstream).
    parser.add_argument("pgo", type=str)
    parser.add_argument("flags", type=str)
    parser.add_argument("repo", type=str)
    args = parser.parse_args()

    # Timings above the midpoint are classified "bad", below it "good".
    mid_point = (args.good_val + args.bad_val) / 2.0

    repo = Path(args.repo)
    cpython = repo / "cpython"

    try:
        with contextlib.chdir(repo):
            timing = get_result(
                args.benchmark, args.pgo == "True", args.flags, cpython=cpython
            )
    except Exception as e:
        # If there was any exception, display that exception and traceback and
        # then abort the git bisect with -1 (exit status 255, which tells
        # `git bisect run` to abort the whole bisect rather than classify
        # this commit).
        traceback.print_exception(e)
        sys.exit(-1)

    # The confidence is 0.0 at the mid-point, 1.0 at the good and bad values,
    # and > 1.0 outside of that.
    confidence = abs((timing - mid_point) / ((args.bad_val - args.good_val) / 2.0))

    with contextlib.chdir(repo):
        if timing > mid_point:
            log(
                f"BISECT BAD ({git.get_git_hash(cpython)[:7]}): "
                f"{format_seconds(timing)} (confidence {confidence:.02f})"
            )
            print(f"BAD: {timing} vs. ({args.good_val}, {args.bad_val})")
            # Exit status 1 marks this commit "bad" for `git bisect run`.
            sys.exit(1)
        else:
            log(
                f"BISECT GOOD ({git.get_git_hash(cpython)[:7]}): "
                f"{format_seconds(timing)} (confidence {confidence:.02f})"
            )
            print(f"GOOD: {timing} vs. ({args.good_val}, {args.bad_val})")
            # Exit status 0 marks this commit "good".
            sys.exit(0)

bench_runner/scripts/workflow.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,13 @@ def checkout_benchmarks():
130130
)
131131

132132

133-
def compile_unix(cpython: PathLike, flags: list[str], pgo: bool, pystats: bool) -> None:
133+
def compile_unix(
134+
cpython: PathLike,
135+
flags: list[str],
136+
pgo: bool,
137+
pystats: bool,
138+
reconfigure: bool = True,
139+
) -> None:
134140
cpython = Path(cpython)
135141
cfg = config.get_config_for_current_runner()
136142

@@ -166,7 +172,8 @@ def compile_unix(cpython: PathLike, flags: list[str], pgo: bool, pystats: bool)
166172
make_args.extend(["-j"])
167173

168174
with contextlib.chdir(cpython):
169-
subprocess.check_call(["./configure", *args], env=env)
175+
if reconfigure:
176+
subprocess.check_call(["./configure", *args], env=env)
170177
subprocess.check_call(["make", *make_args], env=env)
171178

172179

bench_runner/util.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,25 @@ def get_simple_platform() -> Literal["linux", "macos", "windows"]:
8787
raise RuntimeError(f"Unsupported platform {sys.platform}.")
8888

8989

90+
def format_seconds(value: float) -> str:
    """
    Format a duration given in seconds as a human-readable string, choosing
    the unit (sec/ms/us/ns) and the precision from the magnitude of *value*.
    """
    units = ("sec", "ms", "us", "ns")

    # Find the largest power of ten not exceeding the value, clamped to
    # [-9, 2]; values below 10 ns (including 0) fall through to -9.
    exponent = -9
    for candidate in range(2, -9, -1):
        if value >= 10.0**candidate:
            exponent = candidate
            break

    # Show three digits in total, e.g. 1.23, 12.3, 123.
    precision = 2 - exponent % 3
    unit_index = -(exponent // 3) if exponent < 0 else 0
    scaled = value * 10 ** (unit_index * 3)
    return f"{scaled:.{precision}f} {units[unit_index]}"
107+
108+
90109
if os.getenv("GITHUB_ACTIONS") == "true":
91110

92111
@contextlib.contextmanager

0 commit comments

Comments
 (0)