|
2 | 2 |
|
3 | 3 |
|
4 | 4 | import argparse |
| 5 | +from collections import defaultdict |
5 | 6 | import datetime |
6 | 7 | import functools |
7 | 8 | import json |
@@ -118,6 +119,25 @@ def get_flag_effect_plot_config(): |
118 | 119 | return plots |
119 | 120 |
|
120 | 121 |
|
@functools.cache
def get_benchmark_longitudinal_plot_config():
    """Return the validated ``benchmark_longitudinal_plot`` config section.

    The section must define ``base``, ``version`` and ``runners``; a
    ``KeyError`` is raised when any of them is missing.  The optional
    ``head_flags``/``base_flags`` entries are normalized (in place) to
    sorted, de-duplicated lists so that flag comparisons don't depend on
    the order or duplication used in the config file.
    """
    cfg = mconfig.get_bench_runner_config()

    plot = cfg.get("benchmark_longitudinal_plot", {})
    # Raise an informative error rather than using a bare `assert`, which
    # is silently stripped when running under `python -O`.
    for required in ("base", "version", "runners"):
        if required not in plot:
            raise KeyError(
                f"benchmark_longitudinal_plot config is missing {required!r}"
            )
    # One loop instead of two copy-pasted if/else branches.
    for key in ("head_flags", "base_flags"):
        plot[key] = sorted(set(plot.get(key, [])))
    return plot
| 139 | + |
| 140 | + |
121 | 141 | def plot_diff_pair(ax, data): |
122 | 142 | if not len(data): |
123 | 143 | return [] |
@@ -477,11 +497,11 @@ def get_comparison_value(ref, r, force_valid): |
477 | 497 | if subplot["runner_map"] and not runner_is_mapped: |
478 | 498 | continue |
479 | 499 | head_results = commits.get(runner.nickname, {}).get( |
480 | | - tuple(subplot["head_flags"]), {} |
| 500 | + tuple(sorted(subplot["head_flags"])), {} |
481 | 501 | ) |
482 | 502 | base_results = commits.get( |
483 | 503 | subplot["runner_map"].get(runner.nickname, runner.nickname), {} |
484 | | - ).get(tuple(subplot["base_flags"]), {}) |
| 504 | + ).get(tuple(sorted(subplot["base_flags"])), {}) |
485 | 505 |
|
486 | 506 | line = [] |
487 | 507 | for cpython_hash, r in head_results.items(): |
@@ -525,6 +545,110 @@ def get_comparison_value(ref, r, force_valid): |
525 | 545 | json.dump(data, fd, indent=2) |
526 | 546 |
|
527 | 547 |
|
def benchmark_longitudinal_plot(
    results: Iterable[result.Result], output_filename: PathLike
):
    """Plot per-benchmark performance over time for a head version vs. a base.

    Selects results from the configured runners on the ``python`` fork,
    compares each against the single configured base result, and writes one
    horizontal strip of axes per benchmark to ``output_filename``.  Does
    nothing when no ``benchmark_longitudinal_plot`` section is configured.

    Comparison values are memoized in a ``.json`` file next to the output
    so repeated runs only compute comparisons for new result files.
    """
    if "benchmark_longitudinal_plot" not in mconfig.get_bench_runner_config():
        return

    output_filename = Path(output_filename)

    # Comparisons are expensive, so persist them alongside the plot output.
    cache_filename = output_filename.with_suffix(".json")
    if cache_filename.is_file():
        with cache_filename.open() as fd:
            cache = json.load(fd)
    else:
        cache = {}

    cfg = get_benchmark_longitudinal_plot_config()

    results = [
        r for r in results if r.fork == "python" and r.nickname in cfg["runners"]
    ]

    # NOTE(review): cfg flag lists are sorted/de-duplicated; this assumes
    # `r.flags` is normalized the same way -- confirm against result.Result.
    base = None
    for r in results:
        if r.version == cfg["base"] and r.flags == cfg["base_flags"]:
            base = r
            break
    else:
        raise ValueError(f"Base version {cfg['base']} not found")

    results = [
        r
        for r in results
        if r.version.startswith(cfg["version"]) and r.flags == cfg["head_flags"]
    ]

    # benchmark name -> runner nickname -> [commit_date, mean, hash] entries.
    by_benchmark = defaultdict(lambda: defaultdict(list))
    for r in results:
        if r.filename.name not in cache:
            # Create the cache entry up front.  Previously it was only
            # created when the first significant timing was seen, so a
            # result with no significant benchmarks raised KeyError below
            # (and was never cached, forcing recomputation every run).
            entry = cache[r.filename.name] = {}
            comparison = result.BenchmarkComparison(base, r, "")
            for name, _diff, mean in comparison.get_timing_diff():
                # Don't include insignificant results
                if mean > 0.0:
                    entry[name] = [r.commit_date, mean, r.cpython_hash]

        for name, value in cache[r.filename.name].items():
            by_benchmark[name][r.nickname].append(value)

    with cache_filename.open("w") as fd:
        json.dump(cache, fd, indent=2)

    # Exclude any benchmarks where we don't have enough data to make a
    # meaningful plot
    by_benchmark = {
        k: v for k, v in by_benchmark.items() if any(len(x) > 2 for x in v.values())
    }

    fig, axs = plt.subplots(
        len(by_benchmark),
        1,
        figsize=(10, len(by_benchmark)),
        layout="constrained",
    )
    # With a single row, subplots() returns a bare Axes rather than an array.
    if len(by_benchmark) == 1:
        axs = [axs]

    plt.suptitle(
        f"Performance change by benchmark on {cfg['version']} vs. {cfg['base']}"
    )

    first = True
    for (benchmark, runners), ax in zip(sorted(by_benchmark.items()), axs):
        for runner_name, timings in runners.items():
            runner = mrunners.get_runner_by_nickname(runner_name)
            timings.sort(key=lambda x: datetime.datetime.fromisoformat(x[0]))
            dates = [datetime.datetime.fromisoformat(x[0]) for x in timings]
            ax.plot(
                dates,
                [x[1] for x in timings],
                label=runner.plot.name,
                color=runner.plot.color,
                linestyle=runner.plot.style,
                marker=runner.plot.marker,
                markersize=2,
            )
        ax.set_xticks([])
        ax.set_ylabel(benchmark, rotation=0, horizontalalignment="right")
        ax.yaxis.set_major_formatter(formatter)
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.grid(True, axis="y")
        # Reference line at 1.0 == "no change vs. base".
        ax.axhline(1.0, color="#666", linestyle="-")
        ax.set_facecolor("#f0f0f0")
        # Only the top subplot carries the legend to avoid repetition.
        if first:
            ax.legend(loc="upper left")
            first = False

    savefig(output_filename, dpi=150)
| 650 | + |
| 651 | + |
528 | 652 | if __name__ == "__main__": |
529 | 653 | parser = argparse.ArgumentParser( |
530 | 654 | "Compare two benchmark .json files", |
|
0 commit comments