Skip to content

Commit ac86822

Browse files
authored
Merge branch 'main' into cuda-core-system-jupyterlab-nvdashboard
2 parents 9098293 + d818a75 commit ac86822

File tree

75 files changed

+2348
-446
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+2348
-446
lines changed

.github/workflows/coverage.yml

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ jobs:
6868
with:
6969
fetch-depth: 0
7070

71+
- name: Fix workspace ownership
72+
run: |
73+
chown -R $(id -u):$(id -g) "$GITHUB_WORKSPACE"
74+
7175
- name: Install dependencies
7276
uses: ./.github/actions/install_unix_deps
7377
continue-on-error: false
@@ -342,46 +346,25 @@ jobs:
342346
cd "${{ steps.install-root.outputs.INSTALL_ROOT }}"
343347
"$GITHUB_WORKSPACE/.venv/Scripts/pytest" -v --cov=./cuda --cov-append --cov-context=test --cov-config="$GITHUB_WORKSPACE/.coveragerc" "$GITHUB_WORKSPACE/cuda_pathfinder/tests"
344348
349+
# Cython linetrace under coverage on Windows needs more stack than the
350+
# default 1 MB thread size. The helper runs pytest on an 8 MB thread.
345351
- name: Run cuda.bindings tests (with 8MB stack)
346352
continue-on-error: true
347353
run: |
348-
cd "${{ steps.install-root.outputs.INSTALL_ROOT }}"
349-
# Run pytest in 8MB stack thread (Cython linetrace requirement)
350-
"$GITHUB_WORKSPACE/.venv/Scripts/python" << PYTEST_EOF
351-
import os
352-
import sys
353-
import threading
354-
import pytest
355-
356-
os.chdir(r'${{ steps.install-root.outputs.INSTALL_ROOT }}')
357-
threading.stack_size(8 * 1024 * 1024)
358-
result = {'code': 1}
359-
360-
def _run():
361-
workspace = os.environ['GITHUB_WORKSPACE']
362-
result['code'] = pytest.main([
363-
'-v',
364-
'--cov=./cuda',
365-
'--cov-append',
366-
'--cov-context=test',
367-
f'--cov-config={workspace}/.coveragerc',
368-
f'{workspace}/cuda_bindings/tests'
369-
])
370-
371-
t = threading.Thread(target=_run)
372-
t.start()
373-
t.join()
374-
375-
print(f'Bindings tests exit code: {result["code"]}')
376-
# Exit with actual code (continue-on-error handles it)
377-
sys.exit(result['code'])
378-
PYTEST_EOF
354+
"$GITHUB_WORKSPACE/.venv/Scripts/python" "$GITHUB_WORKSPACE/ci/tools/run_pytest_with_stack.py" \
355+
--cwd "${{ steps.install-root.outputs.INSTALL_ROOT }}" \
356+
-v --cov=./cuda --cov-append --cov-context=test \
357+
--cov-config="$GITHUB_WORKSPACE/.coveragerc" \
358+
"$GITHUB_WORKSPACE/cuda_bindings/tests"
379359
380-
- name: Run cuda.core tests
360+
- name: Run cuda.core tests (with 8MB stack)
381361
continue-on-error: true
382362
run: |
383-
cd "${{ steps.install-root.outputs.INSTALL_ROOT }}"
384-
"$GITHUB_WORKSPACE/.venv/Scripts/pytest" -v --cov=./cuda --cov-append --cov-context=test --cov-config="$GITHUB_WORKSPACE/.coveragerc" "$GITHUB_WORKSPACE/cuda_core/tests"
363+
"$GITHUB_WORKSPACE/.venv/Scripts/python" "$GITHUB_WORKSPACE/ci/tools/run_pytest_with_stack.py" \
364+
--cwd "${{ steps.install-root.outputs.INSTALL_ROOT }}" \
365+
-v --cov=./cuda --cov-append --cov-context=test \
366+
--cov-config="$GITHUB_WORKSPACE/.coveragerc" \
367+
"$GITHUB_WORKSPACE/cuda_core/tests"
385368
386369
- name: Copy Windows coverage file to workspace
387370
run: |

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,3 +196,4 @@ cython_debug/
196196

197197
# Cursor
198198
.cursorrules
199+
.claude/settings.local.json

.spdx-ignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ LICENSE
88
requirements*.txt
99
cuda_bindings/examples/*
1010

11+
# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149)
12+
cuda_bindings/benchmarks/*
13+
1114
# Vendored
1215
cuda_core/cuda/core/_include/dlpack.h
1316

ci/tools/run-tests

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,15 @@ elif [[ "${test_module}" == "core" ]]; then
7474

7575
pushd ./cuda_core
7676
CUDA_VER_MINOR="$(cut -d '.' -f 1-2 <<< "${CUDA_VER}")"
77-
if [[ "${LOCAL_CTK}" == 1 ]]; then
78-
# We already installed cuda-bindings, and all CTK components exist locally,
79-
# so just install the test dependencies.
80-
# Constrain cuda-toolkit to match the local CTK version to avoid
81-
# pip pulling in a newer nvidia-cuda-runtime that conflicts with it.
82-
pip install "${CUDA_CORE_ARTIFACTS_DIR}"/*.whl --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" "cuda-toolkit==${CUDA_VER_MINOR}.*"
83-
else
84-
pip install $(ls "${CUDA_CORE_ARTIFACTS_DIR}"/*.whl)["cu${TEST_CUDA_MAJOR}"] --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}"
77+
# Start from the built wheel path, then add the published cuda.bindings extra
78+
# when this job is resolving against wheel-installed CTK packages.
79+
WHL_EXTRA=("${CUDA_CORE_ARTIFACTS_DIR}"/*.whl)
80+
if [[ "${LOCAL_CTK}" != 1 ]]; then
81+
WHL_EXTRA=("${WHL_EXTRA[0]}[cu${TEST_CUDA_MAJOR}]")
8582
fi
83+
# Constrain cuda-toolkit to the requested CTK version to avoid
84+
# pip pulling in a newer nvidia-cuda-runtime that conflicts with it.
85+
pip install "${WHL_EXTRA[@]}" --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" "cuda-toolkit==${CUDA_VER_MINOR}.*"
8686
echo "Running core tests"
8787
${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/
8888
# Currently our CI always installs the latest bindings (from either major version).

ci/tools/run_pytest_with_stack.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env python3
2+
3+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
"""Run pytest on a thread with a larger stack size.
8+
9+
Cython linetrace instrumentation under coverage on Windows can exceed the
10+
default 1 MB thread stack. This helper spawns a single worker thread with
11+
a configurable stack (default 8 MB) so the rest of the CI workflow stays
12+
readable.
13+
14+
Usage:
15+
python run_pytest_with_stack.py [--stack-mb N] [--cwd DIR] [pytest args ...]
16+
"""
17+
18+
import argparse
19+
import concurrent.futures
20+
import os
21+
import sys
22+
import threading
23+
24+
import pytest
25+
26+
27+
def main():
    """Parse wrapper options, then run pytest on a large-stack worker thread.

    Everything not recognized as a wrapper option is forwarded to
    ``pytest.main`` unchanged. Exits the process with pytest's exit code.
    """
    # allow_abbrev=False is required here: with parse_known_args, argparse's
    # default prefix matching would otherwise swallow forwarded pytest options
    # that happen to abbreviate --cwd or --stack-mb (e.g. a bare "--c"),
    # instead of passing them through to pytest verbatim.
    parser = argparse.ArgumentParser(description=__doc__, allow_abbrev=False)
    parser.add_argument(
        "--stack-mb",
        type=int,
        default=8,
        help="Thread stack size in megabytes (default: 8)",
    )
    parser.add_argument(
        "--cwd",
        default=None,
        help="Working directory for the test run",
    )
    args, pytest_args = parser.parse_known_args()

    # Validate before calling threading.stack_size(), which raises a less
    # helpful ValueError for non-positive / too-small sizes.
    if args.stack_mb < 1:
        parser.error("--stack-mb must be a positive integer")

    if args.cwd:
        os.chdir(args.cwd)

    # stack_size() only affects threads created *after* the call, so it must
    # be set before the pool spawns its single worker thread.
    threading.stack_size(args.stack_mb * 1024 * 1024)

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        # .result() re-raises any exception from the worker; pytest.main
        # normally reports failures via its return code rather than raising.
        code = pool.submit(pytest.main, pytest_args).result()

    sys.exit(code)


if __name__ == "__main__":
    main()

cuda_bindings/benchmarks/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ __pycache__/
1111

1212
# Override root .gitignore *.cpp rule (which targets Cython-generated files)
1313
!benchmarks/cpp/*.cpp
14+
15+
results-python.json
16+
results-cpp.json

cuda_bindings/benchmarks/README.md

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,17 @@
1-
# cuda.bindings Benchmarks
1+
# cuda.bindings benchmarks
2+
3+
These benchmarks are intended to measure the latency overhead of calling CUDA
4+
Driver APIs through cuda.bindings, relative to a similar C++ baseline.
5+
6+
The goal is to measure how much overhead the Python layer adds when calling
7+
CUDA APIs, and to identify which operations exceed our target of less than 1us of overhead.
8+
9+
Each Python benchmark has a C++ counterpart, which is used to compare the
10+
operations. We try to make each implementation perform the same small operations,
11+
doing as nearly the same work as possible, and run them under similar conditions.
12+
13+
These are **not** throughput benchmarks to measure the overall performance
14+
of kernels and applications.
215

316
## Usage
417

@@ -32,26 +45,30 @@ sudo $(pixi run -e wheel -- which python) -m pyperf system tune
3245
To run the benchmarks combine the environment and task:
3346

3447
```bash
35-
3648
# Run the Python benchmarks in the wheel environment
3749
pixi run -e wheel bench
3850

3951
# Run the Python benchmarks in the source environment
4052
pixi run -e source bench
4153

42-
# Run the C++ benchmarks (environment is irrelavant here)
54+
# Run the C++ benchmarks
4355
pixi run -e wheel bench-cpp
4456
```
4557

46-
## pyperf JSON
58+
Both runners automatically save results to JSON files in the benchmarks
59+
directory: `results-python.json` and `results-cpp.json`.
4760

48-
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
49-
The results are written to a JSON file in the format expected by pyperf.
61+
## Output JSON and analysis
5062

51-
The C++ benchmarks also generate a valid JSON file, in the same format.
63+
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
64+
Both Python and C++ results are saved in pyperf-compatible JSON format,
65+
which can be analyzed with pyperf commands:
5266

53-
```
54-
pixi run -e wheel bench-cpp -0 cpp.json
67+
```bash
68+
# Show results and statistics
69+
pixi run -e wheel -- python -m pyperf stats results-python.json
70+
pixi run -e wheel -- python -m pyperf stats results-cpp.json
5571

56-
pixi run -e wheel pyperf stats cpp.json
72+
# Compare C++ vs Python results
73+
pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json
5774
```
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from runner.runtime import ensure_context
8+
9+
from cuda.bindings import driver as cuda
10+
11+
# One-time module state shared by the benchmark workers below.
# ensure_context() is a project helper; presumably it creates/activates a
# CUDA context for this process and returns it — TODO confirm against
# runner.runtime. CTX is reused by bench_ctx_set_current.
CTX = ensure_context()

# Driver calls return an (error_code, value) pair; the error code is
# deliberately ignored here (best-effort benchmark setup).
_, DEVICE = cuda.cuDeviceGet(0)
# Device attribute queried repeatedly by bench_device_get_attribute.
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
15+
16+
17+
def bench_ctx_get_current(loops: int) -> float:
    """Time *loops* back-to-back cuCtxGetCurrent calls; return elapsed seconds."""
    # Bind the driver entry point to a local so the timed loop avoids
    # per-iteration module attribute lookups.
    get_current = cuda.cuCtxGetCurrent

    start = time.perf_counter()
    for _ in range(loops):
        get_current()
    elapsed = time.perf_counter() - start
    return elapsed
24+
25+
26+
def bench_ctx_set_current(loops: int) -> float:
    """Time *loops* cuCtxSetCurrent calls using the module-level context."""
    # Hoist the global reads out of the timed loop.
    set_current = cuda.cuCtxSetCurrent
    ctx = CTX

    start = time.perf_counter()
    for _ in range(loops):
        set_current(ctx)
    elapsed = time.perf_counter() - start
    return elapsed
34+
35+
36+
def bench_ctx_get_device(loops: int) -> float:
    """Time *loops* cuCtxGetDevice calls; return elapsed seconds."""
    # Local binding keeps attribute lookup out of the timed loop.
    get_device = cuda.cuCtxGetDevice

    start = time.perf_counter()
    for _ in range(loops):
        get_device()
    elapsed = time.perf_counter() - start
    return elapsed
43+
44+
45+
def bench_device_get(loops: int) -> float:
    """Time *loops* cuDeviceGet(0) calls; return elapsed seconds."""
    # Local binding keeps attribute lookup out of the timed loop.
    device_get = cuda.cuDeviceGet

    start = time.perf_counter()
    for _ in range(loops):
        device_get(0)
    elapsed = time.perf_counter() - start
    return elapsed
52+
53+
54+
def bench_device_get_attribute(loops: int) -> float:
    """Time *loops* cuDeviceGetAttribute queries; return elapsed seconds."""
    # Hoist the callable and both global arguments out of the timed loop.
    get_attribute = cuda.cuDeviceGetAttribute
    attr = ATTRIBUTE
    dev = DEVICE

    start = time.perf_counter()
    for _ in range(loops):
        get_attribute(attr, dev)
    elapsed = time.perf_counter() - start
    return elapsed
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from runner.runtime import ensure_context
8+
9+
from cuda.bindings import driver as cuda
10+
11+
# One-time module state for the event/stream benchmarks. ensure_context()
# is a project helper; presumably it creates/activates a CUDA context for
# this process — TODO confirm against runner.runtime.
ensure_context()

# Driver calls return an (error_code, handle) pair; error codes are
# deliberately ignored in this best-effort benchmark setup.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value)

# Warm-up: record once and synchronize so the first timed iteration does not
# pay one-time initialization costs.
cuda.cuEventRecord(EVENT, STREAM)
cuda.cuStreamSynchronize(STREAM)

# Flag value reused by bench_event_create_destroy.
EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value
20+
21+
22+
def bench_event_create_destroy(loops: int) -> float:
    """Time *loops* create/destroy pairs for a timing-disabled event."""
    # Hoist callables and the flag value out of the timed loop.
    create = cuda.cuEventCreate
    destroy = cuda.cuEventDestroy
    flags = EVENT_FLAGS

    start = time.perf_counter()
    for _ in range(loops):
        _, event = create(flags)
        destroy(event)
    elapsed = time.perf_counter() - start
    return elapsed
32+
33+
34+
def bench_event_record(loops: int) -> float:
    """Time *loops* cuEventRecord calls on the module-level event/stream."""
    # Hoist the callable and both global arguments out of the timed loop.
    record = cuda.cuEventRecord
    event = EVENT
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        record(event, stream)
    elapsed = time.perf_counter() - start
    return elapsed
43+
44+
45+
def bench_event_query(loops: int) -> float:
    """Time *loops* cuEventQuery calls on the module-level event."""
    # Hoist the callable and the global argument out of the timed loop.
    query = cuda.cuEventQuery
    event = EVENT

    start = time.perf_counter()
    for _ in range(loops):
        query(event)
    elapsed = time.perf_counter() - start
    return elapsed
53+
54+
55+
def bench_event_synchronize(loops: int) -> float:
    """Time *loops* cuEventSynchronize calls on the module-level event."""
    # Hoist the callable and the global argument out of the timed loop.
    synchronize = cuda.cuEventSynchronize
    event = EVENT

    start = time.perf_counter()
    for _ in range(loops):
        synchronize(event)
    elapsed = time.perf_counter() - start
    return elapsed

0 commit comments

Comments
 (0)