Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions cuda_core/cuda/core/system/_device.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ include "_fan.pxi"
include "_field_values.pxi"
include "_inforom.pxi"
include "_memory.pxi"
include "_nvlink.pxi"
include "_pci_info.pxi"
include "_performance.pxi"
include "_repair_status.pxi"
include "_temperature.pxi"
include "_utilization.pxi"


cdef class Device:
Expand Down Expand Up @@ -674,6 +676,18 @@ cdef class Device:
"""
return MemoryInfo(nvml.device_get_memory_info_v2(self._handle))

##########################################################################
# NVLINK
# See external class definitions in _nvlink.pxi

def nvlink(self, link: int) -> NvlinkInfo:
    """
    Get information about NVLink on this device.

    For devices with NVLink support.

    Parameters
    ----------
    link : int
        Zero-based NVLink link index to query.

    Returns
    -------
    NvlinkInfo
        Accessor bound to this device and *link*. No validation is
        performed here; the actual NVML queries happen when the returned
        object's properties are read.
    """
    return NvlinkInfo(self, link)

##########################################################################
# PCI INFO
# See external class definitions in _pci_info.pxi
Expand Down Expand Up @@ -765,6 +779,30 @@ cdef class Device:
device._handle = handle
yield device

#######################################################################
# UTILIZATION

@property
def utilization(self) -> Utilization:
    """
    Retrieves the current utilization rates for the device's major subsystems.

    For Fermi™ or newer fully supported devices.

    Note: During driver initialization when ECC is enabled one can see high
    GPU and Memory Utilization readings. This is caused by ECC Memory
    Scrubbing mechanism that is performed during driver initialization.

    Note: On MIG-enabled GPUs, querying device utilization rates is not
    currently supported.

    Returns
    -------
    Utilization
        An object containing the current utilization rates for the device.
        A fresh NVML query is issued on every access.
    """
    return Utilization(nvml.device_get_utilization_rates(self._handle))


def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel:
"""
Expand Down Expand Up @@ -853,6 +891,8 @@ __all__ = [
"InforomInfo",
"InforomObject",
"MemoryInfo",
"NvlinkInfo",
"NvlinkVersion",
"PcieUtilCounter",
"PciInfo",
"Pstates",
Expand All @@ -864,4 +904,5 @@ __all__ = [
"ThermalSensor",
"ThermalSettings",
"ThermalTarget",
"Utilization",
]
52 changes: 52 additions & 0 deletions cuda_core/cuda/core/system/_nvlink.pxi
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0


# Re-export NVML's NvlinkVersion enum under this package's namespace.
NvlinkVersion = nvml.NvlinkVersion


cdef class NvlinkInfo:
    """
    NVLink information for a single link of a device.

    Thin accessor holding the owning device and a link index; each
    property issues a fresh NVML query.
    """
    # Owning device whose NVML handle is used for the queries.
    cdef Device _device
    # Zero-based NVLink link index.
    cdef int _link

    def __init__(self, device: Device, link: int):
        self._device = device
        self._link = link

    @property
    def version(self) -> NvlinkVersion:
        # Fixed: annotation previously said `NvLinkVersion`, which is not a
        # defined name — the module-level alias is `NvlinkVersion`.
        """
        Retrieves the NVLink version for the device and link.

        For all products with NVLink support.

        Returns
        -------
        NvlinkVersion
            The NVLink version.
        """
        return NvlinkVersion(nvml.device_get_nvlink_version(self._device._handle, self._link))

    @property
    def state(self) -> bool:
        """
        Retrieves the state of the device's NVLink for the device and link specified.

        For Pascal™ or newer fully supported devices.

        For all products with NVLink support.

        Returns
        -------
        bool
            `True` if the NVLink is active, `False` otherwise.
        """
        return (
            nvml.device_get_nvlink_state(self._device._handle, self._link) == nvml.EnableState.FEATURE_ENABLED
        )

    # Maximum number of NVLink links per device, as defined by NVML.
    # Class attribute: device-independent (see tests reading
    # `NvlinkInfo.max_links`).
    max_links = nvml.NVLINK_MAX_LINKS
29 changes: 29 additions & 0 deletions cuda_core/cuda/core/system/_utilization.pxi
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0


cdef class Utilization:
    """
    Snapshot of a device's utilization rates.

    For devices with compute capability 2.0 or higher.
    """
    # The wrapped nvml.Utilization result object.
    cdef object _rates

    def __init__(self, utilization: nvml.Utilization):
        self._rates = utilization

    @property
    def gpu(self) -> int:
        """
        Percent of time over the past sample period during which one or more kernels was executing on the GPU.
        """
        return self._rates.gpu

    @property
    def memory(self) -> int:
        """
        Percent of time over the past sample period during which global (device) memory was being read or written.
        """
        return self._rates.memory
3 changes: 3 additions & 0 deletions cuda_core/docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ Enums
system.FanControlPolicy
system.FieldId
system.InforomObject
system.NvlinkVersion
system.PcieUtilCounter
system.Pstates
system.TemperatureSensors
Expand Down Expand Up @@ -256,11 +257,13 @@ Types
system.GpuTopologyLevel
system.InforomInfo
system.MemoryInfo
system.NvlinkInfo
system.PciInfo
system.RepairStatus
system.Temperature
system.ThermalSensor
system.ThermalSettings
system.Utilization

.. module:: cuda.core.utils

Expand Down
31 changes: 31 additions & 0 deletions cuda_core/tests/system/test_system_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,3 +729,34 @@ def test_pstates():
assert isinstance(utilization.percentage, int)
assert isinstance(utilization.inc_threshold, int)
assert isinstance(utilization.dec_threshold, int)


def test_nvlink():
    """Smoke-test NvlinkInfo construction and version query on every device/link."""
    for device in system.Device.get_all_devices():
        # max_links is a class attribute (nvml.NVLINK_MAX_LINKS), so it is
        # device-independent; it is merely re-read inside the loop.
        max_links = system.NvlinkInfo.max_links
        assert isinstance(max_links, int)
        assert max_links > 0

        for link in range(max_links):
            with unsupported_before(device, None):
                nvlink_info = device.nvlink(link)
                assert isinstance(nvlink_info, system.NvlinkInfo)

            # NOTE(review): if `unsupported_before` suppresses (rather than
            # pytest-skips on) an unsupported error above, `nvlink_info`
            # would be unbound here — confirm the fixture's semantics.
            with unsupported_before(device, None):
                version = nvlink_info.version
                assert isinstance(version, system.NvlinkVersion)


def test_utilization():
    """Smoke-test Device.utilization: result type and 0-100 percentage bounds."""
    for dev in system.Device.get_all_devices():
        with unsupported_before(dev, None):
            rates = dev.utilization
            assert isinstance(rates, system.Utilization)

            # Both subsystem readings are integer percentages.
            for pct in (rates.gpu, rates.memory):
                assert isinstance(pct, int)
                assert 0 <= pct <= 100
Loading