Make flag effect plots with runner_map work:

Yhg1s · Yhg1s · commit f7b41df822b3 · 2025-04-27T16:35:16.000+02:00
 - When using a runner_map, force benchmark comparisons to be valid, since
   results from a different runner would otherwise not count as a valid
   comparison.
 - Add version to flag effect plots so we don't end up comparing e.g.
   v3.14.0a7 and v3.13.10, which were created 15 minutes apart but do not
   make sense to compare even with a runner_map.
 - If a runner_map is used, don't plot runners that aren't listed in the
   runner_map.
diff --git a/README.md b/README.md
@@ -169,6 +169,7 @@ The flag effect plot shows the effect of specified configuration flags against a
 In `bench_runner.toml`, the `flag_effect_plot` table has a `subplots` key which is an array of tables with the following keys:
 
 - `name`: The description of the flags to use in the title.
+- `version`: The version series to compare. Must be a two-part (major.minor) version, e.g. "3.14".
 - `head_flags`: A list of flags to use as the head.
 - `base_flags`: (optional) A list of flags to use as the base. By default, this is a default build, i.e. no flags.
 - `runner_map`: (optional) If you need to map a runner to a base in a different runner, you can provide that mapping here. For example, with tail-calling, you may want to compare runners configured to use clang against runners configured with the "default compiler" for a given platform. The mapping is from the "head" runner nickname to the "base" runner nickname.
@@ -178,10 +179,12 @@ For example:
 ```toml
 [[flag_effect_plot.subplots]]
 name = "JIT"
+version = "3.14"
 head_flags = ["JIT"]
 
 [[flag_effect_plot.subplots]]
 name = "Tail calling interpreter"
+version = "3.14"
 head_flags = ["TAILCALL"]
 runner_map = { linux_clang = "linux" }
 ```
diff --git a/bench_runner/plot.py b/bench_runner/plot.py
@@ -105,6 +105,7 @@ def get_flag_effect_plot_config():
 
     for subplot in subplots:
         assert "name" in subplot
+        assert "version" in subplot
         assert "head_flags" in subplot
         subplot["head_flags"] = sorted(set(subplot["head_flags"]))
         if "base_flags" not in subplot:
@@ -428,12 +429,14 @@ def flag_effect_plot(
         print("No flag effect plot config found. Skipping.")
         return
 
-    def get_comparison_value(ref, r):
+    def get_comparison_value(ref, r, force_valid):
         key = ",".join((str(ref.filename)[8:], str(r.filename)[8:]))
         if key in data:
             return data[key]
         else:
-            value = getter(result.BenchmarkComparison(ref, r, "default"))
+            value = getter(
+                result.BenchmarkComparison(ref, r, "default", force_valid=force_valid)
+            )
             data[key] = value
             return value
 
@@ -464,8 +467,15 @@ def get_comparison_value(ref, r):
 
     for subplot, ax in zip(subplots, axs):
         ax.set_title(f"Effect of {subplot['name']}")
+        version = tuple(int(x) for x in subplot["version"].split("."))
+        assert len(version) == 2, (
+            "Version config in {subplot['name']}" " should only be major.minor"
+        )
 
         for runner in mrunners.get_runners():
+            runner_is_mapped = runner.nickname in subplot["runner_map"]
+            if subplot["runner_map"] and not runner_is_mapped:
+                continue
             head_results = commits.get(runner.nickname, {}).get(
                 tuple(subplot["head_flags"]), {}
             )
@@ -476,10 +486,14 @@ def get_comparison_value(ref, r):
             line = []
             for cpython_hash, r in head_results.items():
                 if cpython_hash in base_results:
+                    if r.parsed_version.release[0:2] != version:
+                        continue
                     line.append(
                         (
                             r.commit_datetime,
-                            get_comparison_value(base_results[cpython_hash], r),
+                            get_comparison_value(
+                                base_results[cpython_hash], r, runner_is_mapped
+                            ),
                         )
                     )
             line.sort(key=lambda x: datetime.datetime.fromisoformat(x[0]))