Skip to content

Commit ac86822

Browse files
authored
Merge branch 'main' into cuda-core-system-jupyterlab-nvdashboard
2 parents 9098293 + d818a75 commit ac86822

File tree

75 files changed

+2348
-446
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+2348
-446
lines changed

.github/workflows/coverage.yml

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ jobs:
6868
with:
6969
fetch-depth: 0
7070

71+
- name: Fix workspace ownership
72+
run: |
73+
chown -R $(id -u):$(id -g) "$GITHUB_WORKSPACE"
74+
7175
- name: Install dependencies
7276
uses: ./.github/actions/install_unix_deps
7377
continue-on-error: false
@@ -342,46 +346,25 @@ jobs:
342346
cd "${{ steps.install-root.outputs.INSTALL_ROOT }}"
343347
"$GITHUB_WORKSPACE/.venv/Scripts/pytest" -v --cov=./cuda --cov-append --cov-context=test --cov-config="$GITHUB_WORKSPACE/.coveragerc" "$GITHUB_WORKSPACE/cuda_pathfinder/tests"
344348
349+
# Cython linetrace under coverage on Windows needs more stack than the
350+
# default 1 MB thread size. The helper runs pytest on an 8 MB thread.
345351
- name: Run cuda.bindings tests (with 8MB stack)
346352
continue-on-error: true
347353
run: |
348-
cd "${{ steps.install-root.outputs.INSTALL_ROOT }}"
349-
# Run pytest in 8MB stack thread (Cython linetrace requirement)
350-
"$GITHUB_WORKSPACE/.venv/Scripts/python" << PYTEST_EOF
351-
import os
352-
import sys
353-
import threading
354-
import pytest
355-
356-
os.chdir(r'${{ steps.install-root.outputs.INSTALL_ROOT }}')
357-
threading.stack_size(8 * 1024 * 1024)
358-
result = {'code': 1}
359-
360-
def _run():
361-
workspace = os.environ['GITHUB_WORKSPACE']
362-
result['code'] = pytest.main([
363-
'-v',
364-
'--cov=./cuda',
365-
'--cov-append',
366-
'--cov-context=test',
367-
f'--cov-config={workspace}/.coveragerc',
368-
f'{workspace}/cuda_bindings/tests'
369-
])
370-
371-
t = threading.Thread(target=_run)
372-
t.start()
373-
t.join()
374-
375-
print(f'Bindings tests exit code: {result["code"]}')
376-
# Exit with actual code (continue-on-error handles it)
377-
sys.exit(result['code'])
378-
PYTEST_EOF
354+
"$GITHUB_WORKSPACE/.venv/Scripts/python" "$GITHUB_WORKSPACE/ci/tools/run_pytest_with_stack.py" \
355+
--cwd "${{ steps.install-root.outputs.INSTALL_ROOT }}" \
356+
-v --cov=./cuda --cov-append --cov-context=test \
357+
--cov-config="$GITHUB_WORKSPACE/.coveragerc" \
358+
"$GITHUB_WORKSPACE/cuda_bindings/tests"
379359
380-
- name: Run cuda.core tests
360+
- name: Run cuda.core tests (with 8MB stack)
381361
continue-on-error: true
382362
run: |
383-
cd "${{ steps.install-root.outputs.INSTALL_ROOT }}"
384-
"$GITHUB_WORKSPACE/.venv/Scripts/pytest" -v --cov=./cuda --cov-append --cov-context=test --cov-config="$GITHUB_WORKSPACE/.coveragerc" "$GITHUB_WORKSPACE/cuda_core/tests"
363+
"$GITHUB_WORKSPACE/.venv/Scripts/python" "$GITHUB_WORKSPACE/ci/tools/run_pytest_with_stack.py" \
364+
--cwd "${{ steps.install-root.outputs.INSTALL_ROOT }}" \
365+
-v --cov=./cuda --cov-append --cov-context=test \
366+
--cov-config="$GITHUB_WORKSPACE/.coveragerc" \
367+
"$GITHUB_WORKSPACE/cuda_core/tests"
385368
386369
- name: Copy Windows coverage file to workspace
387370
run: |

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,3 +196,4 @@ cython_debug/
196196

197197
# Cursor
198198
.cursorrules
199+
.claude/settings.local.json

.spdx-ignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ LICENSE
88
requirements*.txt
99
cuda_bindings/examples/*
1010

11+
# Will be moved in (see https://github.com/NVIDIA/cuda-python/pull/1913#issuecomment-4252968149)
12+
cuda_bindings/benchmarks/*
13+
1114
# Vendored
1215
cuda_core/cuda/core/_include/dlpack.h
1316

ci/tools/run-tests

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,15 @@ elif [[ "${test_module}" == "core" ]]; then
7474

7575
pushd ./cuda_core
7676
CUDA_VER_MINOR="$(cut -d '.' -f 1-2 <<< "${CUDA_VER}")"
77-
if [[ "${LOCAL_CTK}" == 1 ]]; then
78-
# We already installed cuda-bindings, and all CTK components exist locally,
79-
# so just install the test dependencies.
80-
# Constrain cuda-toolkit to match the local CTK version to avoid
81-
# pip pulling in a newer nvidia-cuda-runtime that conflicts with it.
82-
pip install "${CUDA_CORE_ARTIFACTS_DIR}"/*.whl --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" "cuda-toolkit==${CUDA_VER_MINOR}.*"
83-
else
84-
pip install $(ls "${CUDA_CORE_ARTIFACTS_DIR}"/*.whl)["cu${TEST_CUDA_MAJOR}"] --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}"
77+
# Start from the built wheel path, then add the published cuda.bindings extra
78+
# when this job is resolving against wheel-installed CTK packages.
79+
WHL_EXTRA=("${CUDA_CORE_ARTIFACTS_DIR}"/*.whl)
80+
if [[ "${LOCAL_CTK}" != 1 ]]; then
81+
WHL_EXTRA=("${WHL_EXTRA[0]}[cu${TEST_CUDA_MAJOR}]")
8582
fi
83+
# Constrain cuda-toolkit to the requested CTK version to avoid
84+
# pip pulling in a newer nvidia-cuda-runtime that conflicts with it.
85+
pip install "${WHL_EXTRA[@]}" --group "test-cu${TEST_CUDA_MAJOR}${FREE_THREADING}" "cuda-toolkit==${CUDA_VER_MINOR}.*"
8686
echo "Running core tests"
8787
${SANITIZER_CMD} pytest -rxXs -v --durations=0 --randomly-dont-reorganize tests/
8888
# Currently our CI always installs the latest bindings (from either major version).

ci/tools/run_pytest_with_stack.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env python3
2+
3+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
"""Run pytest on a thread with a larger stack size.
8+
9+
Cython linetrace instrumentation under coverage on Windows can exceed the
10+
default 1 MB thread stack. This helper spawns a single worker thread with
11+
a configurable stack (default 8 MB) so the rest of the CI workflow stays
12+
readable.
13+
14+
Usage:
15+
python run_pytest_with_stack.py [--stack-mb N] [--cwd DIR] [pytest args ...]
16+
"""
17+
18+
import argparse
19+
import concurrent.futures
20+
import os
21+
import sys
22+
import threading
23+
24+
import pytest
25+
26+
27+
def main():
    """Parse wrapper options, then run pytest on a large-stack worker thread.

    Everything not recognized as a wrapper option is forwarded to
    ``pytest.main`` unchanged. Exits the process with pytest's exit code.
    """
    # allow_abbrev=False is required here: with parse_known_args, argparse's
    # default prefix matching would otherwise swallow forwarded pytest options
    # that happen to abbreviate --cwd or --stack-mb (e.g. a bare "--c"),
    # instead of passing them through to pytest verbatim.
    parser = argparse.ArgumentParser(description=__doc__, allow_abbrev=False)
    parser.add_argument(
        "--stack-mb",
        type=int,
        default=8,
        help="Thread stack size in megabytes (default: 8)",
    )
    parser.add_argument(
        "--cwd",
        default=None,
        help="Working directory for the test run",
    )
    args, pytest_args = parser.parse_known_args()

    # Validate before calling threading.stack_size(), which raises a less
    # helpful ValueError for non-positive / too-small sizes.
    if args.stack_mb < 1:
        parser.error("--stack-mb must be a positive integer")

    if args.cwd:
        os.chdir(args.cwd)

    # stack_size() only affects threads created *after* the call, so it must
    # be set before the pool spawns its single worker thread.
    threading.stack_size(args.stack_mb * 1024 * 1024)

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        # .result() re-raises any exception from the worker; pytest.main
        # normally reports failures via its return code rather than raising.
        code = pool.submit(pytest.main, pytest_args).result()

    sys.exit(code)


if __name__ == "__main__":
    main()

cuda_bindings/benchmarks/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ __pycache__/
1111

1212
# Override root .gitignore *.cpp rule (which targets Cython-generated files)
1313
!benchmarks/cpp/*.cpp
14+
15+
results-python.json
16+
results-cpp.json

cuda_bindings/benchmarks/README.md

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,17 @@
1-
# cuda.bindings Benchmarks
1+
# cuda.bindings benchmarks
2+
3+
These benchmarks are intended to measure the latency overhead of calling CUDA
4+
Driver APIs through cuda.bindings, relative to a similar C++ baseline.
5+
6+
The goal is to measure how much overhead the Python layer adds when calling
7+
CUDA APIs, and to identify which operations exceed our target of less than 1us of overhead.
8+
9+
Each Python benchmark has a C++ counterpart, which is used to compare the
10+
operations. We try to make each implementation perform the same small operations,
11+
doing as nearly the same work as possible, and run them under similar conditions.
12+
13+
These are **not** throughput benchmarks to measure the overall performance
14+
of kernels and applications.
215

316
## Usage
417

@@ -32,26 +45,30 @@ sudo $(pixi run -e wheel -- which python) -m pyperf system tune
3245
To run the benchmarks combine the environment and task:
3346

3447
```bash
35-
3648
# Run the Python benchmarks in the wheel environment
3749
pixi run -e wheel bench
3850

3951
# Run the Python benchmarks in the source environment
4052
pixi run -e source bench
4153

42-
# Run the C++ benchmarks (environment is irrelavant here)
54+
# Run the C++ benchmarks
4355
pixi run -e wheel bench-cpp
4456
```
4557

46-
## pyperf JSON
58+
Both runners automatically save results to JSON files in the benchmarks
59+
directory: `results-python.json` and `results-cpp.json`.
4760

48-
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
49-
The results are written to a JSON file in the format expected by pyperf.
61+
## Output JSON and analysis
5062

51-
The C++ benchmarks also generate a valid JSON file, in the same format.
63+
The benchmarks are run using [pyperf](https://pyperf.readthedocs.io/en/latest/).
64+
Both Python and C++ results are saved in pyperf-compatible JSON format,
65+
which can be analyzed with pyperf commands:
5266

53-
```
54-
pixi run -e wheel bench-cpp -0 cpp.json
67+
```bash
68+
# Show results and statistics
69+
pixi run -e wheel -- python -m pyperf stats results-python.json
70+
pixi run -e wheel -- python -m pyperf stats results-cpp.json
5571

56-
pixi run -e wheel pyperf stats cpp.json
72+
# Compare C++ vs Python results
73+
pixi run -e wheel -- python -m pyperf compare_to results-cpp.json results-python.json
5774
```
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from runner.runtime import ensure_context
8+
9+
from cuda.bindings import driver as cuda
10+
11+
# One-time module state shared by the benchmark workers below.
# ensure_context() is a project helper; presumably it creates/activates a
# CUDA context for this process and returns it — TODO confirm against
# runner.runtime. CTX is reused by bench_ctx_set_current.
CTX = ensure_context()

# Driver calls return an (error_code, value) pair; the error code is
# deliberately ignored here (best-effort benchmark setup).
_, DEVICE = cuda.cuDeviceGet(0)
# Device attribute queried repeatedly by bench_device_get_attribute.
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
15+
16+
17+
def bench_ctx_get_current(loops: int) -> float:
    """Time *loops* back-to-back cuCtxGetCurrent calls; return elapsed seconds."""
    # Bind the driver entry point to a local so the timed loop avoids
    # per-iteration module attribute lookups.
    get_current = cuda.cuCtxGetCurrent

    start = time.perf_counter()
    for _ in range(loops):
        get_current()
    elapsed = time.perf_counter() - start
    return elapsed
24+
25+
26+
def bench_ctx_set_current(loops: int) -> float:
    """Time *loops* cuCtxSetCurrent calls using the module-level context."""
    # Hoist the global reads out of the timed loop.
    set_current = cuda.cuCtxSetCurrent
    ctx = CTX

    start = time.perf_counter()
    for _ in range(loops):
        set_current(ctx)
    elapsed = time.perf_counter() - start
    return elapsed
34+
35+
36+
def bench_ctx_get_device(loops: int) -> float:
    """Time *loops* cuCtxGetDevice calls; return elapsed seconds."""
    # Local binding keeps attribute lookup out of the timed loop.
    get_device = cuda.cuCtxGetDevice

    start = time.perf_counter()
    for _ in range(loops):
        get_device()
    elapsed = time.perf_counter() - start
    return elapsed
43+
44+
45+
def bench_device_get(loops: int) -> float:
    """Time *loops* cuDeviceGet(0) calls; return elapsed seconds."""
    # Local binding keeps attribute lookup out of the timed loop.
    device_get = cuda.cuDeviceGet

    start = time.perf_counter()
    for _ in range(loops):
        device_get(0)
    elapsed = time.perf_counter() - start
    return elapsed
52+
53+
54+
def bench_device_get_attribute(loops: int) -> float:
    """Time *loops* cuDeviceGetAttribute queries; return elapsed seconds."""
    # Hoist the callable and both global arguments out of the timed loop.
    get_attribute = cuda.cuDeviceGetAttribute
    attr = ATTRIBUTE
    dev = DEVICE

    start = time.perf_counter()
    for _ in range(loops):
        get_attribute(attr, dev)
    elapsed = time.perf_counter() - start
    return elapsed
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from runner.runtime import ensure_context
8+
9+
from cuda.bindings import driver as cuda
10+
11+
# One-time module state for the event/stream benchmarks. ensure_context()
# is a project helper; presumably it creates/activates a CUDA context for
# this process — TODO confirm against runner.runtime.
ensure_context()

# Driver calls return an (error_code, handle) pair; error codes are
# deliberately ignored in this best-effort benchmark setup.
_err, STREAM = cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value)
_err, EVENT = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value)

# Warm-up: record once and synchronize so the first timed iteration does not
# pay one-time initialization costs.
cuda.cuEventRecord(EVENT, STREAM)
cuda.cuStreamSynchronize(STREAM)

# Flag value reused by bench_event_create_destroy.
EVENT_FLAGS = cuda.CUevent_flags.CU_EVENT_DISABLE_TIMING.value
20+
21+
22+
def bench_event_create_destroy(loops: int) -> float:
    """Time *loops* create/destroy pairs for a timing-disabled event."""
    # Hoist callables and the flag value out of the timed loop.
    create = cuda.cuEventCreate
    destroy = cuda.cuEventDestroy
    flags = EVENT_FLAGS

    start = time.perf_counter()
    for _ in range(loops):
        _, event = create(flags)
        destroy(event)
    elapsed = time.perf_counter() - start
    return elapsed
32+
33+
34+
def bench_event_record(loops: int) -> float:
    """Time *loops* cuEventRecord calls on the module-level event/stream."""
    # Hoist the callable and both global arguments out of the timed loop.
    record = cuda.cuEventRecord
    event = EVENT
    stream = STREAM

    start = time.perf_counter()
    for _ in range(loops):
        record(event, stream)
    elapsed = time.perf_counter() - start
    return elapsed
43+
44+
45+
def bench_event_query(loops: int) -> float:
    """Time *loops* cuEventQuery calls on the module-level event."""
    # Hoist the callable and the global argument out of the timed loop.
    query = cuda.cuEventQuery
    event = EVENT

    start = time.perf_counter()
    for _ in range(loops):
        query(event)
    elapsed = time.perf_counter() - start
    return elapsed
53+
54+
55+
def bench_event_synchronize(loops: int) -> float:
    """Time *loops* cuEventSynchronize calls on the module-level event."""
    # Hoist the callable and the global argument out of the timed loop.
    synchronize = cuda.cuEventSynchronize
    event = EVENT

    start = time.perf_counter()
    for _ in range(loops):
        synchronize(event)
    elapsed = time.perf_counter() - start
    return elapsed

0 commit comments

Comments
 (0)