
Commit 7d355a6: pytorch-level profiler
1 parent 58dba8a
3 files changed: 124 additions & 12 deletions

examples/eval.py: 53 additions & 12 deletions
```diff
@@ -1,3 +1,4 @@
+import base64
 import dataclasses
 import multiprocessing
 import re
@@ -137,6 +138,17 @@ def _clone_data(data):
     return data


+def wrap_check_implementation(data, submission_output):
+    # Old version returned just a single string, new version
+    # returns (bool, str); this function ensures compatibility with old
+    # problem definitions.
+    result = check_implementation(data, submission_output)
+    if isinstance(result, tuple):
+        return result
+    else:
+        return not bool(result), result
+
+
 def _run_single_test(test: TestCase):
     """
     Runs a single test case. Do not call directly
```
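The shim in `wrap_check_implementation` normalizes both checker conventions to `(bool, str)`. A minimal runnable sketch of that behavior; `wrap_check` here is a hypothetical variant that takes the checker as a parameter, since the real `check_implementation` comes from the problem definition:

```python
def wrap_check(check, data, submission_output):
    # Same normalization as wrap_check_implementation in the diff above.
    result = check(data, submission_output)
    if isinstance(result, tuple):
        return result                    # new style: already (passed, message)
    return not bool(result), result      # old style: truthy string means failure

legacy = lambda d, o: "" if o == d else "mismatch"          # old: error string
modern = lambda d, o: (o == d, "ok" if o == d else "bad")   # new: (bool, str)

print(wrap_check(legacy, 1, 1))  # (True, '')
print(wrap_check(legacy, 1, 2))  # (False, 'mismatch')
print(wrap_check(modern, 1, 2))  # (False, 'bad')
```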
```diff
@@ -146,7 +158,7 @@ def _run_single_test(test: TestCase):
     torch.cuda.synchronize()
     submission_output = custom_kernel(_clone_data(data))
     torch.cuda.synchronize()
-    return check_implementation(data, submission_output)
+    return wrap_check_implementation(data, submission_output)


 def run_single_test(pool: multiprocessing.Pool, test: TestCase):
@@ -168,13 +180,15 @@ def run_testing(logger: PopcornOutput, pool: multiprocessing.Pool, tests: list[T
     logger.log("test-count", len(tests))
     for idx, test in enumerate(tests):
         logger.log(f"test.{idx}.spec", test.spec)
-        error = run_single_test(pool, test)
-        if error:
+        good, message = run_single_test(pool, test)
+        if not good:
             logger.log(f"test.{idx}.status", "fail")
-            logger.log(f"test.{idx}.error", error)
+            logger.log(f"test.{idx}.error", message)
             passed = False
         else:
             logger.log(f"test.{idx}.status", "pass")
+            if message:
+                logger.log(f"test.{idx}.message", message)

     if passed:
         logger.log("check", "pass")
@@ -196,9 +210,9 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
     check_copy = _clone_data(data)
     # first, one obligatory correctness check
     output = custom_kernel(data)
-    error = check_implementation(check_copy, output)
-    if error:
-        return error
+    good, message = wrap_check_implementation(check_copy, output)
+    if not good:
+        return message

     # now, do multiple timing runs without further correctness testing
     # there is an upper bound of 100 runs, and a lower bound of 3 runs;
@@ -220,16 +234,16 @@ def _run_single_benchmark(test: TestCase, recheck: bool, max_repeats: int, max_t
         end = time.perf_counter_ns()

         if recheck:
-            error = check_implementation(check_copy, output)
-            if error:
-                return error
+            good, message = check_implementation(check_copy, output)
+            if not good:
+                return message

         del output
         durations.append(end-start)

         if i > 1:
             stats = calculate_stats(durations)
-            if stats.err / stats.mean < 0.01 or stats.mean * stats.runs > max_time_ns:
+            if stats.err / stats.mean < 0.001 or stats.mean * stats.runs > max_time_ns:
                 break

     return calculate_stats(durations)
```
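The loop's stopping rule is tightened from a 1% to a 0.1% relative standard error. A sketch of the criterion, assuming `calculate_stats` returns the sample mean and the standard error of the mean over `runs` samples (field names mirror the diff; the implementation below is illustrative, not the harness's):

```python
import math

def calc_stats(durations):
    # Illustrative stand-in for calculate_stats: mean, standard error
    # of the mean, and sample count over the recorded durations (ns).
    n = len(durations)
    mean = sum(durations) / n
    var = sum((d - mean) ** 2 for d in durations) / (n - 1)
    return mean, math.sqrt(var / n), n

durations = [1_050_000, 990_000, 1_010_000]  # hypothetical timings in ns
mean, err, runs = calc_stats(durations)
max_time_ns = 10_000_000_000  # hypothetical budget

# Stop once the mean is pinned down to ~0.1%, or the accumulated
# time (mean * runs) has exhausted the benchmarking budget.
stop = err / mean < 0.001 or mean * runs > max_time_ns
```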
```diff
@@ -282,6 +296,31 @@ def run_benchmarking(logger: PopcornOutput, pool: multiprocessing.Pool, tests: l
         return 112


+def run_single_profile(test: TestCase) -> str:
+    """
+    Runs a single test case. Do not call directly
+    """
+    from submission import custom_kernel
+    from torch.profiler import profile, record_function, ProfilerActivity
+    data = generate_input(**test.args)
+    torch.cuda.synchronize()
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        submission_output = custom_kernel(_clone_data(data))
+        torch.cuda.synchronize()
+    return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
+
+
+def run_profiling(logger: PopcornOutput, tests: list[TestCase]):
+    logger.log("benchmark-count", len(tests))
+    for idx, test in enumerate(tests):
+        logger.log(f"benchmark.{idx}.spec", test.spec)
+        report = run_single_profile(test)
+        logger.log(f"benchmark.{idx}.report", base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8"))
+    logger.log("check", "pass")
+    return 0
+
+
 def main():
     fd = os.getenv("POPCORN_FD")
     if not fd:
```
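`run_single_profile` wraps a single kernel invocation in `torch.profiler` and returns the aggregated op table. The same pattern standalone, with `toy_kernel` as a stand-in for the submission's `custom_kernel`:

```python
import torch
from torch.profiler import profile, ProfilerActivity

def toy_kernel(x):
    # Stand-in workload; the harness profiles the submission's custom_kernel.
    return torch.softmax(x @ x.T, dim=-1)

x = torch.randn(1024, 1024, device="cuda")
torch.cuda.synchronize()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    toy_kernel(x)
    torch.cuda.synchronize()  # make sure all kernels land inside the profile

# One aggregated row per op, sorted by time spent in the op's own CUDA
# kernels (excluding children), limited to the 20 busiest entries.
print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20))
```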
```diff
@@ -324,8 +363,10 @@ def main():
             break

         logger.log("check", "pass" if passed else "fail")
+    elif mode == "profile":
+        run_profiling(logger, tests)
     else:
-        # TODO: Implement script and profile mode
+        # TODO: Implement script mode
        return 2


```
src/discord-cluster-manager/cogs/leaderboard_cog.py: 19 additions & 0 deletions
```diff
@@ -279,6 +279,25 @@ async def submit_bench(
             interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK, gpu=gpu
         )

+    @app_commands.command(name="profile", description="Start a profiling run")
+    @app_commands.describe(
+        leaderboard_name="Name of the competition / kernel to optimize",
+        script="The Python / CUDA script file to run",
+        gpu="Select GPU. Leave empty for interactive or automatic selection.",
+    )
+    @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
+    @with_error_handling
+    async def submit_profile(
+        self,
+        interaction: discord.Interaction,
+        script: discord.Attachment,
+        leaderboard_name: Optional[str],
+        gpu: Optional[str],
+    ):
+        return await self.submit(
+            interaction, leaderboard_name, script, mode=SubmissionMode.PROFILE, gpu=gpu
+        )
+
     @app_commands.command(
         name="ranked", description="Start a ranked run for an official leaderboard submission"
     )
```

src/discord-cluster-manager/report.py: 52 additions & 0 deletions
```diff
@@ -1,4 +1,6 @@
+import base64
 import dataclasses
+import textwrap
 from typing import List

 import consts
@@ -195,6 +197,17 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n
     elif full:
         result.append("❌ Benchmarks missing")

+    if "profile" in runs:
+        bench_run = runs["profile"].run
+        if not bench_run.success:
+            result.append("❌ Running profile failed" + _short_fail_reason(bench_run))
+            return result
+        elif not bench_run.passed:
+            result.append("❌ Profiling failed")
+            return result
+        else:
+            result.append("✅ Profiling successful")
+
     if "leaderboard" in runs:
         lb_run = runs["leaderboard"].run
         if not lb_run.success:
```
```diff
@@ -263,6 +276,29 @@ def log_one(base_name):
     return "❗ Could not find any benchmarks"


+def make_profile_log(run: RunResult) -> str:
+    num_bench = int(run.result.get("benchmark-count", 0))
+
+    def log_one(base_name):
+        spec = run.result.get(f"{base_name}.spec")
+
+        report: str = run.result.get(f"{base_name}.report")
+        report = base64.b64decode(report.encode("utf-8"), b"+*").decode("utf-8")
+        report = textwrap.indent(report, " ")
+        bench_log.append(f"{spec}\n")
+        bench_log.append(report)
+
+    bench_log = []
+    for i in range(num_bench):
+        log_one(f"benchmark.{i}")
+        bench_log.append("")
+
+    if len(bench_log) > 0:
+        return "\n".join(bench_log)
+    else:
+        return "❗ Could not find any profiling data"
+
+
 def generate_system_info(system: SystemInfo):
     return f"""
 Running on:
```
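`make_profile_log` reverses the encoding done in `eval.py`: the profiler table travels through the logger as base64 with `altchars=b"+*"`, which keeps `+` and substitutes `*` for `/` in the encoded output, presumably so the payload avoids characters that would clash with the log transport. A round trip under that reading:

```python
import base64

report = "aten::mm  45.3%  1.204ms"  # hypothetical profiler table fragment
encoded = base64.b64encode(report.encode("utf-8"), b"+*").decode("utf-8")
# Decoding must pass the same altchars, exactly as make_profile_log does.
decoded = base64.b64decode(encoded.encode("utf-8"), b"+*").decode("utf-8")
assert decoded == report
```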
```diff
@@ -314,6 +350,22 @@ def generate_report(result: FullResult) -> RunResultReport: # noqa: C901
             make_benchmark_log(bench_run),
         )

+    if "profile" in runs:
+        prof_run = runs["profile"]
+        if prof_run.compilation is not None and not prof_run.compilation.success:
+            _generate_compile_report(report, prof_run.compilation)
+            return report
+
+        prof_run = prof_run.run
+        if not prof_run.success:
+            _generate_crash_report(report, prof_run)
+            return report
+
+        report.add_log(
+            "Profiling",
+            make_profile_log(prof_run),
+        )
+
     if "leaderboard" in runs:
         bench_run = runs["leaderboard"]
         if bench_run.compilation is not None and not bench_run.compilation.success:
```
