Skip to content

Commit 4d8ee87

Browse files
rparolin and claude authored
Fix managed memory misclassified as kDLCUDAHost in DLPack device mapping (#1863)
* Fix managed memory incorrectly classified as kDLCUDAHost in DLPack device mapping

  _smv_get_dl_device() treated all buffers that are both device- and host-accessible
  as kDLCUDAHost. Managed (unified) memory is also both-accessible, so it was
  misclassified. CCCL's make_tma_descriptor then rejected the descriptor with
  "Device type must be kDLCUDA or kDLCUDAManaged".

  Preserve the is_managed flag already queried via CU_POINTER_ATTRIBUTE_IS_MANAGED
  in _query_memory_attrs(), expose it on Buffer, and use it in _smv_get_dl_device()
  to return kDLCUDAManaged for managed memory.

  Fixes: https://nvbugspro.nvidia.com/bug/6044342

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Fix managed memory DLPack device type on buffer-side export paths

  Update setup_dl_tensor_device() and Buffer.__dlpack_device__() to emit
  kDLCUDAManaged for managed memory, closing the gap where the
  Buffer -> DLPack capsule -> StridedMemoryView path still misclassified
  managed buffers as kDLCUDAHost. Add cross-reference comments to keep the
  three classification sites aligned.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Centralize DLPack device classification into classify_dl_device()

  Extract the duplicated device-type mapping logic from
  Buffer.__dlpack_device__(), setup_dl_tensor_device(), and
  _smv_get_dl_device() into a single classify_dl_device() function in
  _dlpack.pyx. All three call sites now delegate to it.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Remove unused DLDeviceType import from _buffer.pyx

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Update tests for managed memory DLPack device classification

  - Fix test_buffer_dunder_dlpack_device_success to expect kDLCUDAManaged for
    unified memory instead of the old buggy kDLCUDAHost.
  - Fix test_buffer_dlpack_failure_clean_up error message to match the unified
    classify_dl_device error.
  - Add test_managed_buffer_dlpack_roundtrip_device_type to cover the
    Buffer -> DLPack capsule -> StridedMemoryView end-to-end path.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d393729 commit 4d8ee87

5 files changed

Lines changed: 55 additions & 41 deletions

File tree

cuda_core/cuda/core/_dlpack.pyx

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -88,20 +88,28 @@ cdef inline int setup_dl_tensor_layout(DLTensor* dl_tensor, object buf) except -
8888
return 0
8989

9090

91+
def classify_dl_device(buf) -> tuple[int, int]:
92+
"""Classify a buffer into a DLPack (device_type, device_id) pair.
93+
94+
``buf`` must expose ``is_device_accessible``, ``is_host_accessible``,
95+
``is_managed``, and ``device_id`` attributes.
96+
"""
97+
cdef bint d = buf.is_device_accessible
98+
cdef bint h = buf.is_host_accessible
99+
if d and not h:
100+
return (_kDLCUDA, buf.device_id)
101+
if d and h:
102+
return (_kDLCUDAManaged if buf.is_managed else _kDLCUDAHost, 0)
103+
if not d and h:
104+
return (_kDLCPU, 0)
105+
raise BufferError("buffer is neither device-accessible nor host-accessible")
106+
107+
91108
cdef inline int setup_dl_tensor_device(DLTensor* dl_tensor, object buf) except -1:
92109
cdef DLDevice* device = &dl_tensor.device
93-
# buf should be a Buffer instance
94-
if buf.is_device_accessible and not buf.is_host_accessible:
95-
device.device_type = _kDLCUDA
96-
device.device_id = buf.device_id
97-
elif buf.is_device_accessible and buf.is_host_accessible:
98-
device.device_type = _kDLCUDAHost
99-
device.device_id = 0
100-
elif not buf.is_device_accessible and buf.is_host_accessible:
101-
device.device_type = _kDLCPU
102-
device.device_id = 0
103-
else: # not buf.is_device_accessible and not buf.is_host_accessible
104-
raise BufferError("invalid buffer")
110+
dev_type, dev_id = classify_dl_device(buf)
111+
device.device_type = <_DLDeviceType>dev_type
112+
device.device_id = <int32_t>dev_id
105113
return 0
106114

107115

cuda_core/cuda/core/_memory/_buffer.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ cdef struct _MemAttrs:
1212
int device_id
1313
bint is_device_accessible
1414
bint is_host_accessible
15+
bint is_managed
1516

1617

1718
cdef class Buffer:

cuda_core/cuda/core/_memory/_buffer.pyx

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ if sys.version_info >= (3, 12):
3434
else:
3535
BufferProtocol = object
3636

37-
from cuda.core._dlpack import DLDeviceType, make_py_capsule
37+
from cuda.core._dlpack import classify_dl_device, make_py_capsule
3838
from cuda.core._utils.cuda_utils import driver
3939
from cuda.core._device import Device
4040

@@ -323,16 +323,7 @@ cdef class Buffer:
323323
return capsule
324324

325325
def __dlpack_device__(self) -> tuple[int, int]:
326-
cdef bint d = self.is_device_accessible
327-
cdef bint h = self.is_host_accessible
328-
if d and (not h):
329-
return (DLDeviceType.kDLCUDA, self.device_id)
330-
if d and h:
331-
# TODO: this can also be kDLCUDAManaged, we need more fine-grained checks
332-
return (DLDeviceType.kDLCUDAHost, 0)
333-
if (not d) and h:
334-
return (DLDeviceType.kDLCPU, 0)
335-
raise BufferError("buffer is neither device-accessible nor host-accessible")
326+
return classify_dl_device(self)
336327

337328
def __buffer__(self, flags: int, /) -> memoryview:
338329
# Support for Python-level buffer protocol as per PEP 688.
@@ -396,6 +387,12 @@ cdef class Buffer:
396387
_init_mem_attrs(self)
397388
return self._mem_attrs.is_host_accessible
398389

390+
@property
391+
def is_managed(self) -> bool:
392+
"""Return True if this buffer is CUDA managed (unified) memory, otherwise False."""
393+
_init_mem_attrs(self)
394+
return self._mem_attrs.is_managed
395+
399396
@property
400397
def is_mapped(self) -> bool:
401398
"""Return True if this buffer is mapped into the process via IPC."""
@@ -459,6 +456,7 @@ cdef inline int _query_memory_attrs(
459456
out.is_host_accessible = True
460457
out.is_device_accessible = False
461458
out.device_id = -1
459+
out.is_managed = False
462460
elif (
463461
is_managed
464462
or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST
@@ -467,10 +465,12 @@ cdef inline int _query_memory_attrs(
467465
out.is_host_accessible = True
468466
out.is_device_accessible = True
469467
out.device_id = device_id
468+
out.is_managed = is_managed
470469
elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE:
471470
out.is_host_accessible = False
472471
out.is_device_accessible = True
473472
out.device_id = device_id
473+
out.is_managed = False
474474
else:
475475
with cython.gil:
476476
raise ValueError(f"Unsupported memory type: {memory_type}")

cuda_core/cuda/core/_memoryview.pyx

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from __future__ import annotations
66

77
from ._dlpack cimport *
8+
from ._dlpack import classify_dl_device
89
from libc.stdint cimport intptr_t
910
from cuda.core._layout cimport _StridedLayout, get_strides_ptr
1011
from cuda.core._stream import Stream
@@ -590,8 +591,6 @@ cdef inline int _smv_get_dl_device(
590591
cdef _DLDeviceType device_type
591592
cdef int32_t device_id
592593
cdef object buf
593-
cdef bint d
594-
cdef bint h
595594
if view.dl_tensor != NULL:
596595
device_type = view.dl_tensor.device.device_type
597596
if device_type == _kDLCUDA:
@@ -601,20 +600,9 @@ cdef inline int _smv_get_dl_device(
601600
device_id = 0
602601
elif view.is_device_accessible:
603602
buf = view.get_buffer()
604-
d = buf.is_device_accessible
605-
h = buf.is_host_accessible
606-
if d and (not h):
607-
device_type = _kDLCUDA
608-
device_id = buf.device_id
609-
elif d and h:
610-
# We do not currently differentiate pinned vs managed here.
611-
device_type = _kDLCUDAHost
612-
device_id = 0
613-
elif (not d) and h:
614-
device_type = _kDLCPU
615-
device_id = 0
616-
else:
617-
raise BufferError("buffer is neither device-accessible nor host-accessible")
603+
dev_type, dev_id = classify_dl_device(buf)
604+
device_type = <_DLDeviceType>dev_type
605+
device_id = <int32_t>dev_id
618606
else:
619607
device_type = _kDLCPU
620608
device_id = 0

cuda_core/tests/test_memory.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ def test_buffer_dunder_dlpack():
556556
[
557557
(DummyDeviceMemoryResource, (DLDeviceType.kDLCUDA, 0)),
558558
(DummyHostMemoryResource, (DLDeviceType.kDLCPU, 0)),
559-
(DummyUnifiedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)),
559+
(DummyUnifiedMemoryResource, (DLDeviceType.kDLCUDAManaged, 0)),
560560
(DummyPinnedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)),
561561
],
562562
)
@@ -579,7 +579,7 @@ def test_buffer_dlpack_failure_clean_up():
579579
dummy_mr = NullMemoryResource()
580580
buffer = dummy_mr.allocate(size=1024)
581581
before = sys.getrefcount(buffer)
582-
with pytest.raises(BufferError, match="invalid buffer"):
582+
with pytest.raises(BufferError, match="buffer is neither device-accessible nor host-accessible"):
583583
buffer.__dlpack__()
584584
after = sys.getrefcount(buffer)
585585
# we use the buffer refcount as sentinel for proper clean-up here,
@@ -588,6 +588,23 @@ def test_buffer_dlpack_failure_clean_up():
588588
assert after == before
589589

590590

591+
def test_managed_buffer_dlpack_roundtrip_device_type():
592+
"""Verify that a managed Buffer round-trips through DLPack with kDLCUDAManaged."""
593+
device = Device()
594+
device.set_current()
595+
skip_if_managed_memory_unsupported(device)
596+
mr = DummyUnifiedMemoryResource(device)
597+
buf = mr.allocate(size=1024)
598+
599+
# Buffer-level classification should report managed.
600+
assert buf.__dlpack_device__() == (DLDeviceType.kDLCUDAManaged, 0)
601+
602+
# The end-to-end path: Buffer -> DLPack capsule -> StridedMemoryView
603+
# must preserve kDLCUDAManaged rather than downgrading to kDLCUDAHost.
604+
view = StridedMemoryView.from_any_interface(buf, stream_ptr=-1)
605+
assert view.__dlpack_device__() == (int(DLDeviceType.kDLCUDAManaged), 0)
606+
607+
591608
@pytest.mark.parametrize("use_device_object", [True, False])
592609
def test_device_memory_resource_initialization(use_device_object):
593610
"""Test that DeviceMemoryResource can be initialized successfully.

0 commit comments

Comments (0)