diff --git a/NOTICE b/NOTICE index 7018a3bab2..809f20f16f 100644 --- a/NOTICE +++ b/NOTICE @@ -3,3 +3,8 @@ Copyright 2009-2026 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (https://www.apache.org/). + +This product includes software developed by +Advanced Micro Devices, Inc. (https://www.amd.com/). +Copyright (c) 2026 Advanced Micro Devices, Inc. +The AMD/HIP GPU build of the QDP module is an AMD contribution. diff --git a/qdp/DEVELOPMENT.md b/qdp/DEVELOPMENT.md index d6c3e6ee98..8cfe2cd077 100644 --- a/qdp/DEVELOPMENT.md +++ b/qdp/DEVELOPMENT.md @@ -114,6 +114,46 @@ cd .. The first command is what `maturin develop --release` runs on CI; the second verifies tests type-check in the CUDA build. +### AMD GPU build (ROCm / HIP) + +The native engine also builds for AMD GPUs by compiling the same six `.cu` +kernels with `hipcc` and binding the AMD HIP runtime instead of CUDA. This is +opt-in behind the `hip` Cargo feature; the default build is the unchanged CUDA +path, so nothing here affects an NVIDIA build. + +Prerequisites: + +- Linux or Windows + an AMD GPU (CDNA gfx90a or RDNA gfx11xx/gfx12xx) +- ROCm >= 6.0 with `hipcc` and the AMD HIP runtime (`amdhip64`); on Windows a + TheRock-based ROCm from the `rocm-sdk` pip wheels also works. ROCm 6.0 is the + floor because the device-pointer check in `qdp-core/src/gpu/cuda_ffi.rs` uses + the ROCm 6+ `hipMemoryType` device convention (ROCm 5.x numbered Device=1 and + would reject valid device pointers); tested on ROCm 7.2.1 +- a ROCm build of PyTorch in the venv for the DLPack interop tests + +Build the Rust core and kernels for AMD. `QDP_USE_HIP=1` selects the HIP branch +in `build.rs`, and `QDP_HIP_ARCH_LIST` picks the target arch(es) (defaults to +`gfx90a` only when unset; set it to your GPU, e.g. `gfx1100`): + +```bash +cd qdp +export QDP_USE_HIP=1 QDP_HIP_ARCH_LIST=gfx90a ROCM_PATH=/opt/rocm +cargo build -p qdp-core -p qdp-kernels --no-default-features --features hip +cargo test -p qdp-core -p qdp-kernels --no-default-features --features hip -- --test-threads=1 +cd .. +``` + +Build the Python extension with the `hip` feature. Use `--profile dev` (the +release `lto = "fat"` profile produces a bitcode-only cdylib under the HIP +toolchain), and install only the extension so a working ROCm PyTorch in the venv +is not replaced: + +```bash +maturin build --features hip --profile dev \ + --manifest-path qdp/qdp-python/Cargo.toml --out dist/ +pip install --no-deps --force-reinstall dist/qumat_qdp-*.whl +``` + ## 4. Benchmarks From the repo root, set up and prepare benchmarks: diff --git a/qdp/qdp-core/Cargo.toml b/qdp/qdp-core/Cargo.toml index c09f6339e1..21051f1ffb 100644 --- a/qdp/qdp-core/Cargo.toml +++ b/qdp/qdp-core/Cargo.toml @@ -4,8 +4,8 @@ version.workspace = true edition.workspace = true [dependencies] -cudarc = { workspace = true } -qdp-kernels = { path = "../qdp-kernels" } +cudarc = { workspace = true, optional = true } +qdp-kernels = { path = "../qdp-kernels", default-features = false } thiserror = { workspace = true } rayon = { workspace = true } nvtx = { version = "1.3", optional = true } @@ -30,7 +30,14 @@ protoc-bin-vendored = { workspace = true } name = "qdp_core" [features] -default = [] +# `cuda` and `hip` are mutually exclusive; pick exactly one vendor backend. +# If both end up enabled (e.g. via workspace feature unification), `hip` takes +# precedence: kernels build for HIP and cudarc is compiled but unused. +# NVIDIA CUDA via cudarc + nvcc kernels (default, unchanged behavior). +default = ["cuda"] +cuda = ["dep:cudarc", "qdp-kernels/cuda"] +# AMD HIP: hipcc kernels + the qdp-kernels device shim; no cudarc. +hip = ["qdp-kernels/hip"] observability = ["nvtx"] pytorch = ["tch"] remote-io = ["object_store", "tokio", "tempfile", "futures"] diff --git a/qdp/qdp-core/build.rs b/qdp/qdp-core/build.rs index 311ea139dd..e416fe8257 100644 --- a/qdp/qdp-core/build.rs +++ b/qdp/qdp-core/build.rs @@ -15,6 +15,19 @@ // limitations under the License. fn main() { + // Emit qdp_gpu_platform cfg on any OS where the GPU stack is compiled. + // Linux always has it (original target). Windows gets it when the `hip` + // feature is active (TheRock-based ROCm; the feature is set by QDP_USE_HIP=1). + // Source code that was `#[cfg(target_os = "linux")]` should use + // `#[cfg(qdp_gpu_platform)]` so it compiles on both. + println!("cargo::rustc-check-cfg=cfg(qdp_gpu_platform)"); + let is_linux = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux"); + let hip_feature = std::env::var("CARGO_FEATURE_HIP").is_ok(); + let is_windows = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows"); + if is_linux || (is_windows && hip_feature) { + println!("cargo::rustc-cfg=qdp_gpu_platform"); + } + // Use vendored protoc to avoid missing protoc in CI/dev environments unsafe { std::env::set_var("PROTOC", protoc_bin_vendored::protoc_bin_path().unwrap()); diff --git a/qdp/qdp-core/src/dlpack.rs b/qdp/qdp-core/src/dlpack.rs index a15344ee89..cea99b68a5 100644 --- a/qdp/qdp-core/src/dlpack.rs +++ b/qdp/qdp-core/src/dlpack.rs @@ -16,14 +16,14 @@ // DLPack protocol for zero-copy GPU memory sharing with PyTorch -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::error::cuda_error_to_string; use crate::error::{MahoutError, Result}; use crate::gpu::memory::{BufferStorage, GpuDeviceType, GpuStateVector, Precision}; use std::os::raw::{c_int, c_void}; use std::sync::Arc; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::cuda_ffi::{ CUDA_EVENT_DISABLE_TIMING, cudaEventCreateWithFlags, cudaEventDestroy, cudaEventRecord, cudaStreamWaitEvent, @@ -45,7 +45,7 @@ pub fn dlpack_stream_to_cuda(stream: i64) -> *mut c_void { } } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] /// # Safety /// `stream` must be a valid CUDA stream pointer or one of the CUDA sentinel /// values (legacy/per-thread default). Passing any other pointer is undefined. @@ -96,7 +96,7 @@ pub unsafe fn synchronize_stream(stream: *mut c_void) -> Result<()> { Ok(()) } -#[cfg(not(target_os = "linux"))] +#[cfg(not(qdp_gpu_platform))] /// # Safety /// No-op on non-Linux targets, kept unsafe to match the Linux signature. pub unsafe fn synchronize_stream(_stream: *mut c_void) -> Result<()> { diff --git a/qdp/qdp-core/src/encoding/amplitude.rs b/qdp/qdp-core/src/encoding/amplitude.rs index fe51140e2d..fada74770d 100644 --- a/qdp/qdp-core/src/encoding/amplitude.rs +++ b/qdp/qdp-core/src/encoding/amplitude.rs @@ -22,7 +22,7 @@ use std::ffi::c_void; -use cudarc::driver::{CudaSlice, DevicePtrMut}; +use crate::gpu_rt::{CudaSlice, DevicePtrMut}; use qdp_kernels::{launch_amplitude_encode_batch, launch_l2_norm_batch}; use super::{ChunkEncoder, STAGE_SIZE_ELEMENTS}; @@ -135,7 +135,7 @@ impl ChunkEncoder for AmplitudeEncoder { mod tests { use super::*; use crate::MahoutError; - use cudarc::driver::DeviceSlice; + use crate::gpu_rt::DeviceSlice; #[test] fn reject_sample_size_zero() { diff --git a/qdp/qdp-core/src/encoding/angle.rs b/qdp/qdp-core/src/encoding/angle.rs index b2141d104e..7cd8c371a9 100644 --- a/qdp/qdp-core/src/encoding/angle.rs +++ b/qdp/qdp-core/src/encoding/angle.rs @@ -130,9 +130,9 @@ mod tests { use crate::encoding::STAGE_SIZE_ELEMENTS; // chunk-size overflow checks #[test] - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn test_encode_chunk_overflow() { - use cudarc::driver::CudaDevice; + use crate::gpu_rt::CudaDevice; use std::sync::Arc; let device: Arc = match CudaDevice::new(0) { diff --git a/qdp/qdp-core/src/encoding/basis.rs b/qdp/qdp-core/src/encoding/basis.rs index 801e077688..8bc2f19baf 100644 --- a/qdp/qdp-core/src/encoding/basis.rs +++ b/qdp/qdp-core/src/encoding/basis.rs @@ -22,7 +22,7 @@ use std::ffi::c_void; -use cudarc::driver::{CudaSlice, DevicePtr}; +use crate::gpu_rt::{CudaSlice, DevicePtr}; use qdp_kernels::launch_basis_encode_batch; use super::{ChunkEncoder, STAGE_SIZE_ELEMENTS}; diff --git a/qdp/qdp-core/src/encoding/mod.rs b/qdp/qdp-core/src/encoding/mod.rs index 2d09b51460..fab09ddc70 100644 --- a/qdp/qdp-core/src/encoding/mod.rs +++ b/qdp/qdp-core/src/encoding/mod.rs @@ -25,7 +25,7 @@ use std::sync::Arc; use std::sync::mpsc::{Receiver, SyncSender, sync_channel}; use std::thread::{self, JoinHandle}; -use cudarc::driver::{CudaDevice, DevicePtr}; +use crate::gpu_rt::{CudaDevice, DevicePtr}; /// Guard that ensures GPU synchronization and IO thread cleanup on drop. /// Used to handle early returns in `stream_encode`. diff --git a/qdp/qdp-core/src/error.rs b/qdp/qdp-core/src/error.rs index 09392bf552..3a03b82690 100644 --- a/qdp/qdp-core/src/error.rs +++ b/qdp/qdp-core/src/error.rs @@ -53,7 +53,7 @@ pub enum MahoutError { pub type Result = std::result::Result; /// Convert CUDA error code to human-readable string -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub fn cuda_error_to_string(code: i32) -> &'static str { match code { 0 => "cudaSuccess", diff --git a/qdp/qdp-core/src/gpu/buffer_pool.rs b/qdp/qdp-core/src/gpu/buffer_pool.rs index 0b0ec24010..5a337cc215 100644 --- a/qdp/qdp-core/src/gpu/buffer_pool.rs +++ b/qdp/qdp-core/src/gpu/buffer_pool.rs @@ -23,17 +23,17 @@ use std::time::Instant; use crate::error::{MahoutError, Result}; use crate::gpu::memory::PinnedHostBuffer; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::pool_metrics::PoolMetrics; /// Handle that automatically returns a buffer to the pool on drop. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub struct PinnedBufferHandle { buffer: Option>, pool: Arc>, } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] impl std::ops::Deref for PinnedBufferHandle { type Target = PinnedHostBuffer; @@ -44,7 +44,7 @@ impl std::ops::Deref for PinnedBufferHandle { } } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] impl std::ops::DerefMut for PinnedBufferHandle { fn deref_mut(&mut self) -> &mut Self::Target { self.buffer @@ -53,7 +53,7 @@ impl std::ops::DerefMut for PinnedBufferHandle { } } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] impl Drop for PinnedBufferHandle { fn drop(&mut self) { if let Some(buf) = self.buffer.take() { @@ -65,7 +65,7 @@ impl Drop for PinnedBufferHandle { } /// Pool of pinned host buffers sized for a fixed batch shape. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub struct PinnedBufferPool { free: Mutex>>, available_cv: Condvar, @@ -73,7 +73,7 @@ pub struct PinnedBufferPool { elements_per_buffer: usize, } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] impl PinnedBufferPool { /// Create a pool with `pool_size` pinned buffers, each sized for `elements_per_buffer` values of `T`. pub fn new(pool_size: usize, elements_per_buffer: usize) -> Result> { diff --git a/qdp/qdp-core/src/gpu/cuda_ffi.rs b/qdp/qdp-core/src/gpu/cuda_ffi.rs index 2ed60c311e..3fd69ef834 100644 --- a/qdp/qdp-core/src/gpu/cuda_ffi.rs +++ b/qdp/qdp-core/src/gpu/cuda_ffi.rs @@ -14,11 +14,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Centralized CUDA Runtime API FFI declarations. +//! Centralized GPU runtime API FFI declarations. +//! +//! These are the runtime entry points the pinned-memory pool, OOM guard, and +//! dual-stream pipeline call directly (outside the cudarc/`gpu_rt` slice). The +//! public function names keep their `cuda*` spelling so call sites are +//! unchanged across vendors. On the default `cuda` feature they bind libcudart +//! directly; on the `hip` feature they are thin wrappers over the matching +//! libamdhip64 entry points (which are 1:1, and whose status codes match +//! CUDA's numerically for the codes used here). use std::ffi::c_void; pub(crate) const CUDA_MEMCPY_HOST_TO_DEVICE: u32 = 1; +#[allow(dead_code)] +pub(crate) const CUDA_MEMCPY_DEVICE_TO_HOST: u32 = 2; pub(crate) const CUDA_EVENT_DISABLE_TIMING: u32 = 0x02; pub(crate) const CUDA_EVENT_DEFAULT: u32 = 0x00; #[allow(dead_code)] @@ -37,65 +47,247 @@ pub(crate) struct CudaPointerAttributes { pub allocation_flags: u32, } -// CUDA error codes +// CUDA/HIP error codes (numerically identical for the codes used). pub(crate) const CUDA_SUCCESS: i32 = 0; -// Note: CUDA_ERROR_NOT_READY may be used in future optimizations for non-blocking event checks -// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0028 #[allow(dead_code)] pub(crate) const CUDA_ERROR_NOT_READY: i32 = 34; -unsafe extern "C" { - pub(crate) fn cudaHostAlloc(pHost: *mut *mut c_void, size: usize, flags: u32) -> i32; - pub(crate) fn cudaFreeHost(ptr: *mut c_void) -> i32; +// ---- CUDA backend: bind libcudart directly ---- +#[cfg(all(feature = "cuda", not(feature = "hip")))] +pub(crate) use cuda_rt::*; + +#[cfg(all(feature = "cuda", not(feature = "hip")))] +mod cuda_rt { + use super::CudaPointerAttributes; + use std::ffi::c_void; + + unsafe extern "C" { + pub(crate) fn cudaHostAlloc(pHost: *mut *mut c_void, size: usize, flags: u32) -> i32; + pub(crate) fn cudaFreeHost(ptr: *mut c_void) -> i32; + + #[allow(dead_code)] + pub(crate) fn cudaPointerGetAttributes( + attributes: *mut CudaPointerAttributes, + ptr: *const c_void, + ) -> i32; + + pub(crate) fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32; + + pub(crate) fn cudaMemcpyAsync( + dst: *mut c_void, + src: *const c_void, + count: usize, + kind: u32, + stream: *mut c_void, + ) -> i32; + + #[allow(dead_code)] + pub(crate) fn cudaMemcpy( + dst: *mut c_void, + src: *const c_void, + count: usize, + kind: u32, + ) -> i32; + + pub(crate) fn cudaEventCreateWithFlags(event: *mut *mut c_void, flags: u32) -> i32; + pub(crate) fn cudaEventRecord(event: *mut c_void, stream: *mut c_void) -> i32; + pub(crate) fn cudaEventDestroy(event: *mut c_void) -> i32; + pub(crate) fn cudaStreamWaitEvent( + stream: *mut c_void, + event: *mut c_void, + flags: u32, + ) -> i32; + pub(crate) fn cudaStreamSynchronize(stream: *mut c_void) -> i32; + + pub(crate) fn cudaMemsetAsync( + devPtr: *mut c_void, + value: i32, + count: usize, + stream: *mut c_void, + ) -> i32; + + #[allow(dead_code)] + pub(crate) fn cudaEventQuery(event: *mut c_void) -> i32; + pub(crate) fn cudaEventSynchronize(event: *mut c_void) -> i32; + pub(crate) fn cudaEventElapsedTime( + ms: *mut f32, + start: *mut c_void, + end: *mut c_void, + ) -> i32; + } +} + +// ---- HIP backend: bind libamdhip64, expose the same cuda* names ---- +#[cfg(feature = "hip")] +pub(crate) use hip_rt::*; + +// The wrapper functions deliberately keep the cuda* spelling so call sites are +// vendor-agnostic; suppress the snake_case lint for that intentional naming. +#[cfg(feature = "hip")] +#[allow(non_snake_case)] +mod hip_rt { + use super::{CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, CudaPointerAttributes}; + use std::ffi::c_void; + + // hipMemoryType enum values are NOT guaranteed equal to CUDA's across ROCm + // releases (older HIP used Host=0/Device=1; the hip_runtime_api.h note flags + // this explicitly). So we read the real hipPointerAttribute_t and compare its + // `type` against the named hipMemoryType* constants rather than a magic + // number, then translate to the CUDA convention the caller expects. + const HIP_MEMORY_TYPE_DEVICE: i32 = 2; // hipMemoryTypeDevice + const HIP_MEMORY_TYPE_MANAGED: i32 = 3; // hipMemoryTypeManaged + + // Mirror of hipPointerAttribute_t (ROCm hip_runtime_api.h): the leading + // `type` field is the hipMemoryType enum read by cudaPointerGetAttributes. + #[repr(C)] + struct HipPointerAttributes { + memory_type: i32, + device: i32, + device_pointer: *mut c_void, + host_pointer: *mut c_void, + is_managed: i32, + allocation_flags: u32, + } + + unsafe extern "C" { + fn hipHostMalloc(ptr: *mut *mut c_void, size: usize, flags: u32) -> i32; + fn hipHostFree(ptr: *mut c_void) -> i32; + fn hipPointerGetAttributes(attributes: *mut c_void, ptr: *const c_void) -> i32; + fn hipMemGetInfo(free: *mut usize, total: *mut usize) -> i32; + fn hipMemcpyAsync( + dst: *mut c_void, + src: *const c_void, + count: usize, + kind: u32, + stream: *mut c_void, + ) -> i32; + fn hipMemcpy(dst: *mut c_void, src: *const c_void, count: usize, kind: u32) -> i32; + fn hipEventCreateWithFlags(event: *mut *mut c_void, flags: u32) -> i32; + fn hipEventRecord(event: *mut c_void, stream: *mut c_void) -> i32; + fn hipEventDestroy(event: *mut c_void) -> i32; + fn hipStreamWaitEvent(stream: *mut c_void, event: *mut c_void, flags: u32) -> i32; + fn hipStreamSynchronize(stream: *mut c_void) -> i32; + fn hipMemsetAsync(dst: *mut c_void, value: i32, count: usize, stream: *mut c_void) -> i32; + fn hipEventQuery(event: *mut c_void) -> i32; + fn hipEventSynchronize(event: *mut c_void) -> i32; + fn hipEventElapsedTime(ms: *mut f32, start: *mut c_void, end: *mut c_void) -> i32; + } + + // hipHostMallocDefault == 0, matching cudaHostAllocDefault used by callers. + pub(crate) unsafe fn cudaHostAlloc(p: *mut *mut c_void, size: usize, flags: u32) -> i32 { + unsafe { hipHostMalloc(p, size, flags) } + } + pub(crate) unsafe fn cudaFreeHost(ptr: *mut c_void) -> i32 { + unsafe { hipHostFree(ptr) } + } #[allow(dead_code)] - pub(crate) fn cudaPointerGetAttributes( + pub(crate) unsafe fn cudaPointerGetAttributes( attributes: *mut CudaPointerAttributes, ptr: *const c_void, - ) -> i32; + ) -> i32 { + let mut hip_attrs = HipPointerAttributes { + memory_type: 0, + device: 0, + device_pointer: std::ptr::null_mut(), + host_pointer: std::ptr::null_mut(), + is_managed: 0, + allocation_flags: 0, + }; + let ret = unsafe { hipPointerGetAttributes(&mut hip_attrs as *mut _ as *mut c_void, ptr) }; + if ret != 0 { + return ret; + } + // Translate the hipMemoryType enum to the CUDA convention the caller + // checks against, comparing the named hipMemoryType* values explicitly + // (do not assume the numeric enum equals CUDA's). Anything else stays + // verbatim so the caller's "not device memory" branch still fires. + let memory_type = match hip_attrs.memory_type { + HIP_MEMORY_TYPE_DEVICE => CUDA_MEMORY_TYPE_DEVICE, + HIP_MEMORY_TYPE_MANAGED => CUDA_MEMORY_TYPE_MANAGED, + other => other, + }; + unsafe { + *attributes = CudaPointerAttributes { + memory_type, + device: hip_attrs.device, + device_pointer: hip_attrs.device_pointer, + host_pointer: hip_attrs.host_pointer, + is_managed: hip_attrs.is_managed, + allocation_flags: hip_attrs.allocation_flags, + }; + } + 0 + } - pub(crate) fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32; + pub(crate) unsafe fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32 { + unsafe { hipMemGetInfo(free, total) } + } - pub(crate) fn cudaMemcpyAsync( + // hipMemcpyAsync is the exact 1:1 of cudaMemcpyAsync: it enqueues on the + // stream and returns without blocking the host, preserving the dual-stream + // H2D/compute overlap. (hipMemcpyWithStream would synchronize the stream + // before returning, serializing the pipeline.) + pub(crate) unsafe fn cudaMemcpyAsync( dst: *mut c_void, src: *const c_void, count: usize, kind: u32, stream: *mut c_void, - ) -> i32; + ) -> i32 { + unsafe { hipMemcpyAsync(dst, src, count, kind, stream) } + } - pub(crate) fn cudaEventCreateWithFlags(event: *mut *mut c_void, flags: u32) -> i32; - pub(crate) fn cudaEventRecord(event: *mut c_void, stream: *mut c_void) -> i32; - pub(crate) fn cudaEventDestroy(event: *mut c_void) -> i32; - pub(crate) fn cudaStreamWaitEvent(stream: *mut c_void, event: *mut c_void, flags: u32) -> i32; - pub(crate) fn cudaStreamSynchronize(stream: *mut c_void) -> i32; + #[allow(dead_code)] + pub(crate) unsafe fn cudaMemcpy( + dst: *mut c_void, + src: *const c_void, + count: usize, + kind: u32, + ) -> i32 { + unsafe { hipMemcpy(dst, src, count, kind) } + } - pub(crate) fn cudaMemsetAsync( - devPtr: *mut c_void, + pub(crate) unsafe fn cudaEventCreateWithFlags(event: *mut *mut c_void, flags: u32) -> i32 { + unsafe { hipEventCreateWithFlags(event, flags) } + } + pub(crate) unsafe fn cudaEventRecord(event: *mut c_void, stream: *mut c_void) -> i32 { + unsafe { hipEventRecord(event, stream) } + } + pub(crate) unsafe fn cudaEventDestroy(event: *mut c_void) -> i32 { + unsafe { hipEventDestroy(event) } + } + pub(crate) unsafe fn cudaStreamWaitEvent( + stream: *mut c_void, + event: *mut c_void, + flags: u32, + ) -> i32 { + unsafe { hipStreamWaitEvent(stream, event, flags) } + } + pub(crate) unsafe fn cudaStreamSynchronize(stream: *mut c_void) -> i32 { + unsafe { hipStreamSynchronize(stream) } + } + pub(crate) unsafe fn cudaMemsetAsync( + dev_ptr: *mut c_void, value: i32, count: usize, stream: *mut c_void, - ) -> i32; - - /// Non-blocking event query - /// - /// Returns CUDA_SUCCESS if the event has completed, CUDA_ERROR_NOT_READY if not. - /// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html - /// - /// Note: May be used in future optimizations for non-blocking event checks to reduce - /// synchronization overhead in pipeline operations. + ) -> i32 { + unsafe { hipMemsetAsync(dev_ptr, value, count, stream) } + } + #[allow(dead_code)] - pub(crate) fn cudaEventQuery(event: *mut c_void) -> i32; - - /// Blocking event synchronization - /// - /// Waits until the completion of all work currently captured in the event. - /// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html - pub(crate) fn cudaEventSynchronize(event: *mut c_void) -> i32; - - /// Calculate elapsed time between two events (in milliseconds) - /// - /// Both events must have been created with CUDA_EVENT_DEFAULT flag. - /// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html - pub(crate) fn cudaEventElapsedTime(ms: *mut f32, start: *mut c_void, end: *mut c_void) -> i32; + pub(crate) unsafe fn cudaEventQuery(event: *mut c_void) -> i32 { + unsafe { hipEventQuery(event) } + } + pub(crate) unsafe fn cudaEventSynchronize(event: *mut c_void) -> i32 { + unsafe { hipEventSynchronize(event) } + } + pub(crate) unsafe fn cudaEventElapsedTime( + ms: *mut f32, + start: *mut c_void, + end: *mut c_void, + ) -> i32 { + unsafe { hipEventElapsedTime(ms, start, end) } + } } diff --git a/qdp/qdp-core/src/gpu/cuda_sync.rs b/qdp/qdp-core/src/gpu/cuda_sync.rs index 077da077dc..2a33d8f276 100644 --- a/qdp/qdp-core/src/gpu/cuda_sync.rs +++ b/qdp/qdp-core/src/gpu/cuda_sync.rs @@ -16,7 +16,7 @@ //! Shared CUDA stream synchronization with unified error reporting. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use std::ffi::c_void; use crate::error::{MahoutError, Result, cuda_error_to_string}; @@ -32,7 +32,7 @@ use crate::error::{MahoutError, Result, cuda_error_to_string}; /// /// # Safety /// The stream pointer must be valid for the duration of this call. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub(crate) fn sync_cuda_stream(stream: *mut c_void, context: &str) -> Result<()> { let ret = unsafe { crate::gpu::cuda_ffi::cudaStreamSynchronize(stream) }; if ret != 0 { @@ -46,7 +46,7 @@ pub(crate) fn sync_cuda_stream(stream: *mut c_void, context: &str) -> Result<()> Ok(()) } -#[cfg(all(test, target_os = "linux"))] +#[cfg(all(test, qdp_gpu_platform))] mod tests { use super::*; use std::ffi::c_void; diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs index 7cf70d9ec5..798498b4a7 100644 --- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs +++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs @@ -23,27 +23,27 @@ use std::sync::Arc; use super::QuantumEncoder; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::error::cuda_error_to_string; use crate::error::{MahoutError, Result}; use crate::gpu::memory::{GpuStateVector, Precision}; use crate::gpu::pipeline::run_dual_stream_pipeline; -use cudarc::driver::CudaDevice; +use crate::gpu_rt::CudaDevice; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::cuda_ffi::cudaMemsetAsync; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::cuda_sync::sync_cuda_stream; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error}; -#[cfg(target_os = "linux")] -use cudarc::driver::{DevicePtr, DevicePtrMut}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] +use crate::gpu_rt::{DevicePtr, DevicePtrMut}; +#[cfg(qdp_gpu_platform)] use qdp_kernels::{ launch_amplitude_encode, launch_amplitude_encode_batch, launch_amplitude_encode_batch_f32, launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, launch_l2_norm_f32, }; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use std::ffi::c_void; use crate::preprocessing::Preprocessor; @@ -65,7 +65,7 @@ impl QuantumEncoder for AmplitudeEncoder { Preprocessor::validate_input(host_data, num_qubits)?; let state_len = 1 << num_qubits; - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] { // Allocate GPU state vector let state_vector = { @@ -178,7 +178,7 @@ impl QuantumEncoder for AmplitudeEncoder { Ok(state_vector) } - #[cfg(not(target_os = "linux"))] + #[cfg(not(qdp_gpu_platform))] { Err(MahoutError::Cuda( "CUDA unavailable (non-Linux stub)".to_string(), @@ -187,7 +187,7 @@ impl QuantumEncoder for AmplitudeEncoder { } /// Encode multiple samples in a single GPU allocation and kernel launch - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch( &self, device: &Arc, @@ -299,7 +299,7 @@ impl QuantumEncoder for AmplitudeEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr( &self, device: &Arc, @@ -361,7 +361,7 @@ impl QuantumEncoder for AmplitudeEncoder { Ok(state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr( &self, device: &Arc, @@ -390,7 +390,7 @@ impl QuantumEncoder for AmplitudeEncoder { }; let inv_norms_gpu = { crate::profile_scope!("GPU::BatchNormKernel"); - use cudarc::driver::DevicePtrMut; + use crate::gpu_rt::DevicePtrMut; let mut buffer = device.alloc_zeros::(num_samples).map_err(|e| { MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e)) })?; @@ -414,6 +414,12 @@ impl QuantumEncoder for AmplitudeEncoder { }; { crate::profile_scope!("GPU::NormValidation"); + // The norm kernel ran on the caller's stream, but dtoh_sync_copy reads + // back on the default stream. Synchronize the caller's stream first so + // the result is visible: with a non-blocking stream (which does not + // implicitly order against the default stream) the readback would + // otherwise race and observe the zero-initialized buffer. + sync_cuda_stream(stream, "Norm stream synchronize failed (batch)")?; let host_inv_norms = device .dtoh_sync_copy(&inv_norms_gpu) .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?; @@ -425,7 +431,7 @@ impl QuantumEncoder for AmplitudeEncoder { } { crate::profile_scope!("GPU::BatchKernelLaunch"); - use cudarc::driver::DevicePtr; + use crate::gpu_rt::DevicePtr; let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| { MahoutError::InvalidInput( "Batch state vector precision mismatch (expected float64 buffer)".to_string(), @@ -458,7 +464,7 @@ impl QuantumEncoder for AmplitudeEncoder { } /// Encode multiple samples in a single GPU allocation and kernel launch for f32 inputs - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch_f32( &self, device: &Arc, @@ -514,7 +520,7 @@ impl QuantumEncoder for AmplitudeEncoder { // Compute inverse norms on GPU using warp-reduced kernel let inv_norms_gpu = { crate::profile_scope!("GPU::BatchNormKernel_f32"); - use cudarc::driver::DevicePtrMut; + use crate::gpu_rt::DevicePtrMut; let mut buffer = device.alloc_zeros::(num_samples).map_err(|e| { MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e)) })?; @@ -556,7 +562,7 @@ impl QuantumEncoder for AmplitudeEncoder { // Launch batch kernel { crate::profile_scope!("GPU::BatchKernelLaunch_f32"); - use cudarc::driver::DevicePtr; + use crate::gpu_rt::DevicePtr; let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| { MahoutError::InvalidInput( "Batch state vector precision mismatch (expected float32 buffer)".to_string(), @@ -593,7 +599,7 @@ impl QuantumEncoder for AmplitudeEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr_f32( &self, device: &Arc, @@ -622,7 +628,7 @@ impl QuantumEncoder for AmplitudeEncoder { }; let inv_norms_gpu = { crate::profile_scope!("GPU::BatchNormKernel_f32"); - use cudarc::driver::DevicePtrMut; + use crate::gpu_rt::DevicePtrMut; let mut buffer = device.alloc_zeros::(num_samples).map_err(|e| { MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e)) })?; @@ -646,6 +652,12 @@ impl QuantumEncoder for AmplitudeEncoder { }; { crate::profile_scope!("GPU::NormValidation_f32"); + // The norm kernel ran on the caller's stream, but dtoh_sync_copy reads + // back on the default stream. Synchronize the caller's stream first so + // the result is visible: with a non-blocking stream (which does not + // implicitly order against the default stream) the readback would + // otherwise race and observe the zero-initialized buffer. + sync_cuda_stream(stream, "Norm stream synchronize failed (batch f32)")?; let host_inv_norms = device .dtoh_sync_copy(&inv_norms_gpu) .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?; @@ -657,7 +669,7 @@ impl QuantumEncoder for AmplitudeEncoder { } { crate::profile_scope!("GPU::BatchKernelLaunch_f32"); - use cudarc::driver::DevicePtr; + use crate::gpu_rt::DevicePtr; let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| { MahoutError::InvalidInput( "Batch state vector precision mismatch (expected float32 buffer)".to_string(), @@ -689,7 +701,7 @@ impl QuantumEncoder for AmplitudeEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr_f32( &self, device: &Arc, @@ -726,7 +738,7 @@ impl AmplitudeEncoder { /// data transfer and computation. The pipeline handles all the /// streaming mechanics, while this method focuses on the amplitude /// encoding kernel logic. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub(crate) fn encode_async_pipeline( device: &Arc, host_data: &[f64], @@ -845,7 +857,7 @@ impl AmplitudeEncoder { /// Caller must ensure `input_d` points to at least `input_len` `f32` values in /// GPU-accessible memory on the same device as `device`, and `stream` is either /// null or a valid CUDA stream associated with `device`. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_from_gpu_ptr_f32_with_stream( device: &Arc, input_d: *const f32, @@ -919,7 +931,7 @@ impl AmplitudeEncoder { /// # Safety /// The caller must ensure `input_batch_d` points to valid GPU memory containing /// at least `num_samples * sample_size` f32 elements on the same device as `device`. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream( device: &Arc, input_batch_d: *const f32, @@ -951,7 +963,7 @@ impl AmplitudeEncoder { let inv_norms_gpu = { crate::profile_scope!("GPU::BatchNormKernelF32"); - use cudarc::driver::DevicePtrMut; + use crate::gpu_rt::DevicePtrMut; let mut buffer = device.alloc_zeros::(num_samples).map_err(|e| { MahoutError::MemoryAllocation(format!( @@ -980,6 +992,13 @@ impl AmplitudeEncoder { { crate::profile_scope!("GPU::NormValidationF32"); + // The norm kernel ran on the caller's stream, but dtoh_sync_copy + // reads back on the default stream. Synchronize the caller's stream + // first so the result is visible: with a non-blocking stream (which + // does not implicitly order against the default stream) the readback + // would otherwise race and observe the zero-initialized buffer. This + // mirrors the single-sample path (calculate_inv_norm_gpu_with_stream). + sync_cuda_stream(stream, "Norm stream synchronize failed (batch f32)")?; let host_inv_norms = device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| { MahoutError::Cuda(format!("Failed to copy f32 norms to host: {:?}", e)) })?; @@ -992,7 +1011,7 @@ impl AmplitudeEncoder { { crate::profile_scope!("GPU::BatchKernelLaunchF32"); - use cudarc::driver::DevicePtr; + use crate::gpu_rt::DevicePtr; let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| { MahoutError::InvalidInput( @@ -1040,7 +1059,7 @@ impl AmplitudeEncoder { /// # Safety /// The caller must ensure `input_ptr` points to valid GPU memory containing /// at least `len` f64 elements on the same device as `device`. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub(crate) unsafe fn calculate_inv_norm_gpu( device: &Arc, input_ptr: *const f64, @@ -1051,7 +1070,7 @@ impl AmplitudeEncoder { } } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub(crate) unsafe fn calculate_inv_norm_gpu_with_stream( device: &Arc, input_ptr: *const f64, @@ -1111,7 +1130,7 @@ impl AmplitudeEncoder { /// # Safety /// The caller must ensure `input_ptr` points to valid GPU memory containing /// at least `len` f32 elements on the same device as `device`. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn calculate_inv_norm_gpu_f32( device: &Arc, input_ptr: *const f32, @@ -1132,7 +1151,7 @@ impl AmplitudeEncoder { /// # Safety /// The caller must ensure `input_ptr` points to valid GPU memory containing /// at least `len` f32 elements on the same device as `device`. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn calculate_inv_norm_gpu_f32_with_stream( device: &Arc, input_ptr: *const f32, @@ -1180,7 +1199,7 @@ impl AmplitudeEncoder { } /// Run dual-stream pipeline for amplitude encoding (exposed for Python / benchmark). - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub(crate) fn run_amplitude_dual_stream_pipeline( device: &Arc, host_data: &[f64], diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs index 36e784e035..313489d916 100644 --- a/qdp/qdp-core/src/gpu/encodings/angle.rs +++ b/qdp/qdp-core/src/gpu/encodings/angle.rs @@ -21,20 +21,20 @@ #![allow(unused_unsafe)] use super::{QuantumEncoder, validate_qubit_count}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::error::cuda_error_to_string; use crate::error::{MahoutError, Result}; use crate::gpu::memory::{GpuStateVector, Precision}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::pipeline::run_dual_stream_pipeline_aligned; -use cudarc::driver::CudaDevice; +use crate::gpu_rt::CudaDevice; use std::sync::Arc; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::memory::map_allocation_error; -#[cfg(target_os = "linux")] -use cudarc::driver::DevicePtr; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] +use crate::gpu_rt::DevicePtr; +#[cfg(qdp_gpu_platform)] use std::ffi::c_void; /// Angle encoding: each qubit uses one rotation angle to form a product state. @@ -43,15 +43,15 @@ pub struct AngleEncoder; impl QuantumEncoder for AngleEncoder { fn encode( &self, - #[cfg(target_os = "linux")] device: &Arc, - #[cfg(not(target_os = "linux"))] _device: &Arc, + #[cfg(qdp_gpu_platform)] device: &Arc, + #[cfg(not(qdp_gpu_platform))] _device: &Arc, data: &[f64], num_qubits: usize, ) -> Result { self.validate_input(data, num_qubits)?; let state_len = 1 << num_qubits; - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] { let input_bytes = std::mem::size_of_val(data); let angles_gpu = { @@ -103,7 +103,7 @@ impl QuantumEncoder for AngleEncoder { Ok(state_vector) } - #[cfg(not(target_os = "linux"))] + #[cfg(not(qdp_gpu_platform))] { Err(MahoutError::Cuda( "CUDA unavailable (non-Linux stub)".to_string(), @@ -112,7 +112,7 @@ impl QuantumEncoder for AngleEncoder { } /// Encode multiple angle samples in a single GPU allocation and kernel launch - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch( &self, device: &Arc, @@ -229,7 +229,7 @@ impl QuantumEncoder for AngleEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr( &self, device: &Arc, @@ -294,7 +294,7 @@ impl QuantumEncoder for AngleEncoder { Ok(state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr( &self, device: &Arc, @@ -374,7 +374,7 @@ impl QuantumEncoder for AngleEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch_f32( &self, device: &Arc, @@ -494,7 +494,7 @@ impl QuantumEncoder for AngleEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr_f32( &self, device: &Arc, @@ -595,7 +595,7 @@ impl QuantumEncoder for AngleEncoder { Ok(()) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr_f32( &self, device: &Arc, @@ -636,7 +636,7 @@ impl AngleEncoder { /// The caller must also ensure that `stream` is either null or a valid CUDA stream handle /// associated with `device`, and that no concurrent use of these raw pointers violates Rust's /// aliasing or lifetime rules. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_from_gpu_ptr_f32_with_stream( device: &Arc, input_d: *const f32, @@ -717,7 +717,7 @@ impl AngleEncoder { /// valid for the duration of this call. The caller must also ensure that `stream` is either /// null or a valid CUDA stream handle associated with `device`, and that the memory layout is /// row-major with exactly `sample_size` angles per sample. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream( device: &Arc, input_batch_d: *const f32, @@ -738,7 +738,7 @@ impl AngleEncoder { } } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch_async_pipeline( device: &Arc, batch_data: &[f64], @@ -802,7 +802,7 @@ impl AngleEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch_async_pipeline_f32( device: &Arc, batch_data: &[f32], diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs b/qdp/qdp-core/src/gpu/encodings/basis.rs index 4f2cec15e6..16e09d735a 100644 --- a/qdp/qdp-core/src/gpu/encodings/basis.rs +++ b/qdp/qdp-core/src/gpu/encodings/basis.rs @@ -21,18 +21,18 @@ #![allow(unused_unsafe)] use super::{QuantumEncoder, validate_qubit_count}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::error::cuda_error_to_string; use crate::error::{MahoutError, Result}; use crate::gpu::memory::{GpuStateVector, Precision}; -use cudarc::driver::CudaDevice; +use crate::gpu_rt::CudaDevice; use std::sync::Arc; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::memory::map_allocation_error; -#[cfg(target_os = "linux")] -use cudarc::driver::DevicePtr; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] +use crate::gpu_rt::DevicePtr; +#[cfg(qdp_gpu_platform)] use std::ffi::c_void; /// Basis encoding: maps an integer index to a computational basis state. @@ -51,8 +51,8 @@ pub struct BasisEncoder; impl QuantumEncoder for BasisEncoder { fn encode( &self, - #[cfg(target_os = "linux")] device: &Arc, - #[cfg(not(target_os = "linux"))] _device: &Arc, + #[cfg(qdp_gpu_platform)] device: &Arc, + #[cfg(not(qdp_gpu_platform))] _device: &Arc, data: &[f64], num_qubits: usize, ) -> Result { @@ -69,7 +69,7 @@ impl QuantumEncoder for BasisEncoder { let state_len = 1 << num_qubits; - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] { // Convert and validate the basis index let basis_index = Self::validate_basis_index(data[0], state_len)?; @@ -116,7 +116,7 @@ impl QuantumEncoder for BasisEncoder { Ok(state_vector) } - #[cfg(not(target_os = "linux"))] + #[cfg(not(qdp_gpu_platform))] { Err(MahoutError::Cuda( "CUDA unavailable (non-Linux stub)".to_string(), @@ -125,7 +125,7 @@ impl QuantumEncoder for BasisEncoder { } /// Encode multiple basis indices in a single GPU allocation and kernel launch - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch( &self, device: &Arc, @@ -225,7 +225,7 @@ impl QuantumEncoder for BasisEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr( &self, device: &Arc, @@ -291,7 +291,7 @@ impl QuantumEncoder for BasisEncoder { Ok(state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr( &self, device: &Arc, @@ -374,7 +374,7 @@ impl QuantumEncoder for BasisEncoder { Ok(()) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch_f32( &self, device: &Arc, @@ -466,7 +466,7 @@ impl QuantumEncoder for BasisEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr_f32( &self, device: &Arc, @@ -541,7 +541,7 @@ impl QuantumEncoder for BasisEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr_f32( &self, device: &Arc, @@ -584,7 +584,7 @@ impl BasisEncoder { /// contiguous `f32` values in GPU-accessible memory and remains valid for the /// duration of this call. `stream` must be either null or a valid CUDA stream /// handle associated with `device`. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream( device: &Arc, input_batch_d: *const f32, @@ -613,7 +613,7 @@ impl BasisEncoder { /// `input_d` must point to one valid `f32` in GPU-accessible memory on `device`, /// remain valid for the duration of this call, and `stream` must be either null /// or a valid CUDA stream handle associated with `device`. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_from_gpu_ptr_f32_with_stream( device: &Arc, input_d: *const f32, diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs b/qdp/qdp-core/src/gpu/encodings/iqp.rs index 33d18cfaf0..ce358724c7 100644 --- a/qdp/qdp-core/src/gpu/encodings/iqp.rs +++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs @@ -17,19 +17,19 @@ // IQP (Instantaneous Quantum Polynomial) encoding: entangled quantum states via diagonal phases. use super::{QuantumEncoder, validate_qubit_count}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::error::cuda_error_to_string; use crate::error::{MahoutError, Result}; use crate::gpu::memory::{GpuStateVector, Precision}; -use cudarc::driver::CudaDevice; +use crate::gpu_rt::CudaDevice; use std::sync::Arc; use std::sync::OnceLock; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::memory::map_allocation_error; -#[cfg(target_os = "linux")] -use cudarc::driver::DevicePtr; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] +use crate::gpu_rt::DevicePtr; +#[cfg(qdp_gpu_platform)] use std::ffi::c_void; /// IQP encoding: creates entangled quantum states using diagonal phase gates. @@ -69,15 +69,15 @@ impl IqpEncoder { impl QuantumEncoder for IqpEncoder { fn encode( &self, - #[cfg(target_os = "linux")] device: &Arc, - #[cfg(not(target_os = "linux"))] _device: &Arc, + #[cfg(qdp_gpu_platform)] device: &Arc, + #[cfg(not(qdp_gpu_platform))] _device: &Arc, data: &[f64], num_qubits: usize, ) -> Result { self.validate_input(data, num_qubits)?; let state_len = 1 << num_qubits; - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] { let input_bytes = std::mem::size_of_val(data); let data_gpu = { @@ -130,7 +130,7 @@ impl QuantumEncoder for IqpEncoder { Ok(state_vector) } - #[cfg(not(target_os = "linux"))] + #[cfg(not(qdp_gpu_platform))] { Err(MahoutError::Cuda( "CUDA unavailable (non-Linux stub)".to_string(), @@ -139,7 +139,7 @@ impl QuantumEncoder for IqpEncoder { } /// Encode multiple IQP samples in a single GPU allocation and kernel launch - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch( &self, device: &Arc, @@ -238,7 +238,7 @@ impl QuantumEncoder for IqpEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr( &self, device: &Arc, @@ -302,7 +302,7 @@ impl QuantumEncoder for IqpEncoder { Ok(state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr( &self, device: &Arc, diff --git a/qdp/qdp-core/src/gpu/encodings/mod.rs b/qdp/qdp-core/src/gpu/encodings/mod.rs index 8d0fd5b4cd..208da9423b 100644 --- a/qdp/qdp-core/src/gpu/encodings/mod.rs +++ b/qdp/qdp-core/src/gpu/encodings/mod.rs @@ -20,9 +20,9 @@ use std::sync::Arc; use crate::error::{MahoutError, Result}; use crate::gpu::memory::GpuStateVector; +use crate::gpu_rt::CudaDevice; use crate::preprocessing::Preprocessor; -use cudarc::driver::CudaDevice; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use std::ffi::c_void; /// Maximum number of qubits supported (16GB GPU memory limit) @@ -98,7 +98,7 @@ pub trait QuantumEncoder: Send + Sync + 'static { /// # Safety /// Caller must ensure `input_d` points to valid GPU memory with at least `input_len` elements /// of the expected dtype on the same device as `device`, and `stream` is a valid CUDA stream or null. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr( &self, _device: &Arc, @@ -119,7 +119,7 @@ pub trait QuantumEncoder: Send + Sync + 'static { /// Caller must ensure `input_batch_d` points to valid GPU memory with at least /// `num_samples * sample_size` elements of the expected dtype on the same device as `device`, /// and `stream` is a valid CUDA stream or null. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr( &self, _device: &Arc, @@ -148,7 +148,7 @@ pub trait QuantumEncoder: Send + Sync + 'static { /// Caller must ensure `input_d` points to valid GPU memory with at least `input_len` /// `f32` elements on the same device as `device`, and `stream` is either null or a valid /// CUDA stream associated with `device`. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr_f32( &self, _device: &Arc, @@ -182,7 +182,7 @@ pub trait QuantumEncoder: Send + Sync + 'static { /// /// # Safety /// Caller must ensure `input_batch_d` points to valid GPU memory (f32). - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr_f32( &self, _device: &Arc, diff --git a/qdp/qdp-core/src/gpu/encodings/phase.rs b/qdp/qdp-core/src/gpu/encodings/phase.rs index b329a2429e..bacb4ad0d6 100644 --- a/qdp/qdp-core/src/gpu/encodings/phase.rs +++ b/qdp/qdp-core/src/gpu/encodings/phase.rs @@ -34,20 +34,20 @@ #![allow(unused_unsafe)] use super::{QuantumEncoder, validate_qubit_count}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::error::cuda_error_to_string; use crate::error::{MahoutError, Result}; use crate::gpu::memory::{GpuStateVector, Precision}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::pipeline::run_dual_stream_pipeline_aligned; -use cudarc::driver::CudaDevice; +use crate::gpu_rt::CudaDevice; use std::sync::Arc; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::memory::map_allocation_error; -#[cfg(target_os = "linux")] -use cudarc::driver::DevicePtr; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] +use crate::gpu_rt::DevicePtr; +#[cfg(qdp_gpu_platform)] use std::ffi::c_void; /// Phase encoding: per-qubit P(φ = x_k) gates applied to |+⟩^⊗N. @@ -66,15 +66,15 @@ pub struct PhaseEncoder; impl QuantumEncoder for PhaseEncoder { fn encode( &self, - #[cfg(target_os = "linux")] device: &Arc, - #[cfg(not(target_os = "linux"))] _device: &Arc, + #[cfg(qdp_gpu_platform)] device: &Arc, + #[cfg(not(qdp_gpu_platform))] _device: &Arc, data: &[f64], num_qubits: usize, ) -> Result { self.validate_input(data, num_qubits)?; let state_len = 1 << num_qubits; - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] { let input_bytes = std::mem::size_of_val(data); let phases_gpu = { @@ -126,7 +126,7 @@ impl QuantumEncoder for PhaseEncoder { Ok(state_vector) } - #[cfg(not(target_os = "linux"))] + #[cfg(not(qdp_gpu_platform))] { Err(MahoutError::Cuda( "CUDA unavailable (non-Linux stub)".to_string(), @@ -135,7 +135,7 @@ impl QuantumEncoder for PhaseEncoder { } /// Encode multiple phase samples in a single GPU allocation and kernel launch. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch( &self, device: &Arc, @@ -241,7 +241,7 @@ impl QuantumEncoder for PhaseEncoder { Ok(batch_state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_from_gpu_ptr( &self, device: &Arc, @@ -297,7 +297,7 @@ impl QuantumEncoder for PhaseEncoder { Ok(state_vector) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] unsafe fn encode_batch_from_gpu_ptr( &self, device: &Arc, @@ -325,7 +325,7 @@ impl QuantumEncoder for PhaseEncoder { // through the norm and is caught on the host side. let phase_validation_buffer = { crate::profile_scope!("GPU::PhaseFiniteCheckBatch"); - use cudarc::driver::DevicePtrMut; + use crate::gpu_rt::DevicePtrMut; let mut buffer = device.alloc_zeros::(num_samples).map_err(|e| { MahoutError::MemoryAllocation(format!( "Failed to allocate phase validation buffer: {:?}", @@ -353,6 +353,15 @@ impl QuantumEncoder for PhaseEncoder { { crate::profile_scope!("GPU::PhaseFiniteValidationHostCopy"); + // The norm probe ran on the caller's stream, but dtoh_sync_copy reads + // back on the default stream. Synchronize the caller's stream first so + // the result is visible: with a non-blocking stream (which does not + // implicitly order against the default stream) the readback would + // otherwise race and observe the zero-initialized buffer. + crate::gpu::cuda_sync::sync_cuda_stream( + stream, + "Phase validation norm stream synchronize failed (batch)", + )?; let host_norms = device .dtoh_sync_copy(&phase_validation_buffer) .map_err(|e| { @@ -437,7 +446,7 @@ impl QuantumEncoder for PhaseEncoder { } impl PhaseEncoder { - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn encode_batch_async_pipeline( device: &Arc, batch_data: &[f64], diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs index f68461a6e2..20afd6c97e 100644 --- a/qdp/qdp-core/src/gpu/memory.rs +++ b/qdp/qdp-core/src/gpu/memory.rs @@ -19,13 +19,13 @@ #![allow(unused_unsafe)] use crate::error::{MahoutError, Result}; -use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr}; +use crate::gpu_rt::{CudaDevice, CudaSlice, DevicePtr}; use qdp_kernels::{CuComplex, CuDoubleComplex}; use std::ffi::c_void; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use std::sync::Arc; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::error::cuda_error_to_string; /// Precision of the GPU state vector. @@ -42,15 +42,23 @@ pub enum GpuDeviceType { Rocm, } -#[cfg(target_os = "linux")] +/// The device type of the backend this build targets. Drives the DLPack +/// `device_type` so exported tensors are tagged kDLCUDA on the NVIDIA build and +/// kDLROCM on the HIP build (a ROCm PyTorch's `from_dlpack` rejects a CUDA tag). +#[cfg(not(feature = "hip"))] +pub(crate) const NATIVE_GPU_DEVICE_TYPE: GpuDeviceType = GpuDeviceType::Cuda; +#[cfg(feature = "hip")] +pub(crate) const NATIVE_GPU_DEVICE_TYPE: GpuDeviceType = GpuDeviceType::Rocm; + +#[cfg(qdp_gpu_platform)] use crate::gpu::cuda_ffi::{cudaFreeHost, cudaHostAlloc, cudaMemGetInfo}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn bytes_to_mib(bytes: usize) -> f64 { bytes as f64 / (1024.0 * 1024.0) } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn query_cuda_mem_info() -> Result<(usize, usize)> { unsafe { let mut free_bytes: usize = 0; @@ -72,7 +80,7 @@ fn query_cuda_mem_info() -> Result<(usize, usize)> { } } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn build_oom_message( context: &str, requested_bytes: usize, @@ -96,7 +104,7 @@ fn build_oom_message( /// /// Returns a MemoryAllocation error with a helpful message when the request /// exceeds the currently reported free memory. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub(crate) fn ensure_device_memory_available( requested_bytes: usize, context: &str, @@ -118,7 +126,7 @@ pub(crate) fn ensure_device_memory_available( } /// Wraps CUDA allocation errors with an OOM-aware MahoutError. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub(crate) fn map_allocation_error( requested_bytes: usize, context: &str, @@ -231,7 +239,7 @@ impl GpuStateVector { /// Create GPU state vector for n qubits with the given precision. /// Allocates 2^n complex numbers (Float32 = CuComplex, Float64 = CuDoubleComplex). /// Default for most callers: use `Precision::Float64`. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub fn new(_device: &Arc, qubits: usize, precision: Precision) -> Result { let _size_elements: usize = 1usize << qubits; @@ -299,11 +307,11 @@ impl GpuStateVector { size_elements: _size_elements, num_samples: None, device_id: _device.ordinal(), - device_type: GpuDeviceType::Cuda, + device_type: NATIVE_GPU_DEVICE_TYPE, }) } - #[cfg(not(target_os = "linux"))] + #[cfg(not(qdp_gpu_platform))] pub fn new(_device: &Arc, _qubits: usize, _precision: Precision) -> Result { Err(MahoutError::Cuda( "CUDA is only available on Linux. This build does not support GPU operations." @@ -360,7 +368,7 @@ impl GpuStateVector { )) })?; - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] { let buffer = match precision { Precision::Float32 => { @@ -411,11 +419,11 @@ impl GpuStateVector { size_elements: total_elements, num_samples: Some(num_samples), device_id: _device.ordinal(), - device_type: GpuDeviceType::Cuda, + device_type: NATIVE_GPU_DEVICE_TYPE, }) } - #[cfg(not(target_os = "linux"))] + #[cfg(not(qdp_gpu_platform))] { Err(MahoutError::Cuda( "CUDA is only available on Linux. This build does not support GPU operations." @@ -434,7 +442,7 @@ impl GpuStateVector { match (self.precision(), target) { (Precision::Float32, Precision::Float64) => { - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] { let requested_bytes = self .size_elements @@ -502,7 +510,7 @@ impl GpuStateVector { }) } - #[cfg(not(target_os = "linux"))] + #[cfg(not(qdp_gpu_platform))] { Err(MahoutError::Cuda( "Precision conversion requires CUDA (Linux)".to_string(), @@ -510,7 +518,7 @@ impl GpuStateVector { } } (Precision::Float64, Precision::Float32) => { - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] { let requested_bytes = self .size_elements @@ -578,7 +586,7 @@ impl GpuStateVector { }) } - #[cfg(not(target_os = "linux"))] + #[cfg(not(qdp_gpu_platform))] { Err(MahoutError::Cuda( "Precision conversion requires CUDA (Linux)".to_string(), @@ -597,13 +605,13 @@ impl GpuStateVector { /// Pinned Host Memory Buffer (owned allocation). /// /// Allocates page-locked memory to maximize H2D throughput in streaming IO paths. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub struct PinnedHostBuffer { ptr: *mut T, size_elements: usize, } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] impl PinnedHostBuffer { /// Allocate pinned memory holding `elements` values of type `T`. pub fn new(elements: usize) -> Result { @@ -658,7 +666,7 @@ impl PinnedHostBuffer { } } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] impl Drop for PinnedHostBuffer { fn drop(&mut self) { unsafe { @@ -675,8 +683,8 @@ impl Drop for PinnedHostBuffer { } // Safety: Pinned memory is accessible from any thread -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] unsafe impl Send for PinnedHostBuffer {} -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] unsafe impl Sync for PinnedHostBuffer {} diff --git a/qdp/qdp-core/src/gpu/metrics.rs b/qdp/qdp-core/src/gpu/metrics.rs index a7ce54e312..9482a5c0e9 100644 --- a/qdp/qdp-core/src/gpu/metrics.rs +++ b/qdp/qdp-core/src/gpu/metrics.rs @@ -21,12 +21,12 @@ //! to the host and run on the CPU to produce a single scalar per sample. //! They are intended for **testing and validation**, not the hot path. -#[cfg(target_os = "linux")] -use cudarc::driver::CudaDevice; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] +use crate::gpu_rt::CudaDevice; +#[cfg(qdp_gpu_platform)] use std::sync::Arc; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use qdp_kernels::{CuComplex, CuDoubleComplex}; use crate::error::{MahoutError, Result}; @@ -152,7 +152,7 @@ pub fn trace_distance_cross_precision(state_f32: &[f32], state_f64: &[f64]) -> R /// Download f64 complex GPU data to host as interleaved (re, im) f64 vec. /// /// `gpu_ptr` must point to `num_elements` `CuDoubleComplex` values on device. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub fn download_complex_f64( device: &Arc, gpu_ptr: *const CuDoubleComplex, @@ -168,15 +168,17 @@ pub fn download_complex_f64( let mut host_buf = vec![0.0_f64; num_elements * 2]; // interleaved re, im unsafe { - let ret = cudarc::driver::sys::lib().cuMemcpyDtoH_v2( + let ret = crate::gpu::cuda_ffi::cudaMemcpy( host_buf.as_mut_ptr() as *mut _, - gpu_ptr as u64, + gpu_ptr as *const _, byte_count, + crate::gpu::cuda_ffi::CUDA_MEMCPY_DEVICE_TO_HOST, ); - if ret != cudarc::driver::sys::CUresult::CUDA_SUCCESS { + if ret != crate::gpu::cuda_ffi::CUDA_SUCCESS { return Err(MahoutError::Cuda(format!( - "cuMemcpyDtoH failed during f64 download: {:?}", - ret + "device-to-host copy failed during f64 download: {} ({})", + ret, + crate::error::cuda_error_to_string(ret) ))); } } @@ -185,7 +187,7 @@ pub fn download_complex_f64( } /// Download f32 complex GPU data to host as interleaved (re, im) f32 vec. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub fn download_complex_f32( device: &Arc, gpu_ptr: *const CuComplex, @@ -201,15 +203,17 @@ pub fn download_complex_f32( let mut host_buf = vec![0.0_f32; num_elements * 2]; unsafe { - let ret = cudarc::driver::sys::lib().cuMemcpyDtoH_v2( + let ret = crate::gpu::cuda_ffi::cudaMemcpy( host_buf.as_mut_ptr() as *mut _, - gpu_ptr as u64, + gpu_ptr as *const _, byte_count, + crate::gpu::cuda_ffi::CUDA_MEMCPY_DEVICE_TO_HOST, ); - if ret != cudarc::driver::sys::CUresult::CUDA_SUCCESS { + if ret != crate::gpu::cuda_ffi::CUDA_SUCCESS { return Err(MahoutError::Cuda(format!( - "cuMemcpyDtoH failed during f32 download: {:?}", - ret + "device-to-host copy failed during f32 download: {} ({})", + ret, + crate::error::cuda_error_to_string(ret) ))); } } diff --git a/qdp/qdp-core/src/gpu/mod.rs b/qdp/qdp-core/src/gpu/mod.rs index 75c987c4f6..6fb62757d3 100644 --- a/qdp/qdp-core/src/gpu/mod.rs +++ b/qdp/qdp-core/src/gpu/mod.rs @@ -14,9 +14,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub mod buffer_pool; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub(crate) mod cuda_sync; pub mod encodings; pub mod memory; @@ -24,22 +24,22 @@ pub mod memory; /// `tests/` can use them; not part of the supported runtime API. #[doc(hidden)] pub mod metrics; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub mod overlap_tracker; pub mod pipeline; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub mod pool_metrics; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub(crate) mod validation; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub(crate) mod cuda_ffi; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub use buffer_pool::{PinnedBufferHandle, PinnedBufferPool}; pub use encodings::{AmplitudeEncoder, AngleEncoder, BasisEncoder, QuantumEncoder}; pub use memory::GpuStateVector; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[doc(hidden)] pub use metrics::{download_complex_f32, download_complex_f64}; #[doc(hidden)] @@ -49,9 +49,9 @@ pub use metrics::{ }; pub use pipeline::run_dual_stream_pipeline; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub use overlap_tracker::OverlapTracker; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub use pipeline::PipelineContext; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub use pool_metrics::{PoolMetrics, PoolUtilizationReport}; diff --git a/qdp/qdp-core/src/gpu/overlap_tracker.rs b/qdp/qdp-core/src/gpu/overlap_tracker.rs index c18dcfdfa3..78bcfe9be6 100644 --- a/qdp/qdp-core/src/gpu/overlap_tracker.rs +++ b/qdp/qdp-core/src/gpu/overlap_tracker.rs @@ -24,7 +24,7 @@ use crate::gpu::cuda_ffi::{ CUDA_EVENT_DEFAULT, CUDA_SUCCESS, cudaEventCreateWithFlags, cudaEventDestroy, cudaEventElapsedTime, cudaEventRecord, cudaEventSynchronize, }; -use cudarc::driver::safe::CudaStream; +use crate::gpu_rt::CudaStream; use std::ffi::c_void; /// Tracks overlap between H2D copy and compute operations using CUDA events. diff --git a/qdp/qdp-core/src/gpu/pipeline.rs b/qdp/qdp-core/src/gpu/pipeline.rs index 6d31cbdacd..15f0067f1e 100644 --- a/qdp/qdp-core/src/gpu/pipeline.rs +++ b/qdp/qdp-core/src/gpu/pipeline.rs @@ -24,34 +24,34 @@ #![allow(unused_unsafe)] use crate::error::{MahoutError, Result}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::buffer_pool::{PinnedBufferHandle, PinnedBufferPool}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::cuda_ffi::{ CUDA_EVENT_DISABLE_TIMING, CUDA_MEMCPY_HOST_TO_DEVICE, cudaEventCreateWithFlags, cudaEventDestroy, cudaEventRecord, cudaMemcpyAsync, cudaStreamWaitEvent, }; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::cuda_sync::sync_cuda_stream; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::overlap_tracker::OverlapTracker; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::gpu::pool_metrics::PoolMetrics; -use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr, DeviceRepr, safe::CudaStream}; +use crate::gpu_rt::{CudaDevice, CudaSlice, CudaStream, DevicePtr, DeviceRepr}; use std::ffi::c_void; use std::sync::Arc; /// Dual-stream context coordinating copy/compute with an event. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub struct PipelineContext { pub stream_compute: CudaStream, pub stream_copy: CudaStream, events_copy_done: Vec<*mut c_void>, } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn validate_event_slot(events: &[*mut c_void], slot: usize) -> Result<()> { if slot >= events.len() { return Err(MahoutError::InvalidInput(format!( @@ -63,7 +63,7 @@ fn validate_event_slot(events: &[*mut c_void], slot: usize) -> Result<()> { Ok(()) } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] impl PipelineContext { pub fn new(device: &Arc, event_slots: usize) -> Result { let stream_compute = device @@ -188,7 +188,7 @@ impl PipelineContext { } } -#[cfg(all(test, target_os = "linux"))] +#[cfg(all(test, qdp_gpu_platform))] mod tests { use super::validate_event_slot; @@ -207,7 +207,7 @@ mod tests { } } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] impl Drop for PipelineContext { fn drop(&mut self) { unsafe { @@ -249,7 +249,7 @@ impl Drop for PipelineContext { /// Ok(()) /// })?; /// ``` -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub fn run_dual_stream_pipeline( device: &Arc, host_data: &[f64], @@ -273,7 +273,7 @@ where /// /// `align_elements` must evenly divide the host data length and ensures chunks do not /// split logical records (e.g., per-sample data in batch encoding). -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[allow(clippy::manual_is_multiple_of)] pub fn run_dual_stream_pipeline_aligned( device: &Arc, @@ -293,7 +293,7 @@ where } /// f32 variant of `run_dual_stream_pipeline_aligned`. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[allow(clippy::manual_is_multiple_of)] pub fn run_dual_stream_pipeline_aligned_f32( device: &Arc, @@ -312,7 +312,7 @@ where ) } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[allow(clippy::manual_is_multiple_of)] fn run_dual_stream_pipeline_aligned_typed( device: &Arc, @@ -354,7 +354,7 @@ where ) } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn run_dual_stream_pipeline_with_chunk_size( device: &Arc, host_data: &[T], diff --git a/qdp/qdp-core/src/gpu/validation.rs b/qdp/qdp-core/src/gpu/validation.rs index d7968599a8..ba56fdbbdf 100644 --- a/qdp/qdp-core/src/gpu/validation.rs +++ b/qdp/qdp-core/src/gpu/validation.rs @@ -23,7 +23,7 @@ #![allow(unused_unsafe)] use crate::error::{MahoutError, Result, cuda_error_to_string}; -use cudarc::driver::{CudaDevice, CudaSlice, DevicePtrMut}; +use crate::gpu_rt::{CudaDevice, CudaSlice, DevicePtrMut}; use std::ffi::c_void; use std::sync::Arc; diff --git a/qdp/qdp-core/src/gpu_rt.rs b/qdp/qdp-core/src/gpu_rt.rs new file mode 100644 index 0000000000..7f7f1048aa --- /dev/null +++ b/qdp/qdp-core/src/gpu_rt.rs @@ -0,0 +1,27 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Single import point for the device runtime types, vendor-selected at +//! compile time. `cudarc` has no ROCm backend, so on the `hip` build these +//! names resolve to the HIP shim in `qdp_kernels::device` instead of cudarc. +//! Both expose the same type names and method signatures, so call sites and +//! integration tests use `crate::gpu_rt::{...}` (or `qdp_core::gpu_rt::{...}`) +//! and compile unchanged on either vendor. + +pub use qdp_kernels::device::{ + CudaDevice, CudaSlice, CudaStream, DevicePtr, DevicePtrMut, DeviceRepr, DeviceSlice, + ValidAsZeroBits, +}; diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs index 822fba96e1..4cfb4f079e 100644 --- a/qdp/qdp-core/src/lib.rs +++ b/qdp/qdp-core/src/lib.rs @@ -17,12 +17,18 @@ // Allow unused_unsafe: CUDA FFI and kernel functions are unsafe in CUDA builds but safe stubs in no-CUDA builds. // The compiler can't statically determine which path is taken. #![allow(unused_unsafe)] +// The `stream.stream as *mut c_void` casts at kernel-launch call sites are a +// real conversion on the CUDA backend (cudarc's CUstream is a distinct pointer +// type) but a no-op on HIP, where the stream field is already *mut c_void. Keep +// the cast for backend-agnostic call sites and silence the HIP-only redundancy. +#![cfg_attr(feature = "hip", allow(clippy::unnecessary_cast))] pub mod dlpack; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] mod encoding; pub mod error; pub mod gpu; +pub mod gpu_rt; pub mod io; mod platform; pub mod preprocessing; @@ -41,10 +47,10 @@ pub use reader::{FloatElem, NullHandling, handle_float32_nulls, handle_float64_n pub use types::{Dtype, Encoding}; // Throughput/latency pipeline runner: single path using QdpEngine and encode_batch in Rust. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] mod pipeline_runner; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub use pipeline_runner::{ PipelineConfig, PipelineIterator, PipelineRunResult, run_latency_pipeline, run_throughput_pipeline, @@ -54,9 +60,9 @@ use std::ffi::c_void; use std::sync::Arc; use crate::dlpack::DLManagedTensor; -use cudarc::driver::CudaDevice; +use crate::gpu_rt::CudaDevice; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const c_void) -> Result<()> { use crate::gpu::cuda_ffi::{ CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, CudaPointerAttributes, @@ -179,7 +185,7 @@ impl QdpEngine { /// Block until all GPU work on the default stream has completed. /// Used by the generic pipeline and other callers that need to sync before timing. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub fn synchronize(&self) -> Result<()> { self.device .synchronize() @@ -289,7 +295,7 @@ impl QdpEngine { /// * `host_data` - 1D input data (e.g. single sample for amplitude) /// * `num_qubits` - Number of qubits /// * `encoding_method` - Strategy (currently only "amplitude" supported for this path) - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub fn run_dual_stream_encode( &self, host_data: &[f64], @@ -497,7 +503,7 @@ impl QdpEngine { /// - Point to valid GPU memory on the same device as the engine /// - Contain at least `input_len` elements of the expected dtype /// - Remain valid for the duration of this call - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_from_gpu_ptr( &self, input_d: *const std::ffi::c_void, @@ -524,7 +530,7 @@ impl QdpEngine { /// # Safety /// Same as [`encode_from_gpu_ptr`](Self::encode_from_gpu_ptr). Additionally, `stream` must /// be a valid CUDA stream on the same device as the engine, or null. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_from_gpu_ptr_with_stream( &self, input_d: *const std::ffi::c_void, @@ -575,7 +581,7 @@ impl QdpEngine { /// - Point to valid GPU memory on the same device as the engine /// - Contain at least `input_len` f32 elements /// - Remain valid for the duration of this call - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_from_gpu_ptr_f32( &self, input_d: *const f32, @@ -600,7 +606,7 @@ impl QdpEngine { /// # Safety /// In addition to the `encode_from_gpu_ptr_f32` requirements, the stream pointer /// must remain valid for the duration of this call. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_from_gpu_ptr_f32_with_stream( &self, input_d: *const f32, @@ -640,7 +646,7 @@ impl QdpEngine { /// - Point to valid GPU memory on the same device as the engine /// - Contain at least `input_len` f32 elements /// - Remain valid for the duration of this call - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_angle_from_gpu_ptr_f32( &self, input_d: *const f32, @@ -662,7 +668,7 @@ impl QdpEngine { /// # Safety /// In addition to the `encode_angle_from_gpu_ptr_f32` requirements, the stream pointer /// must remain valid for the duration of this call. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_angle_from_gpu_ptr_f32_with_stream( &self, input_d: *const f32, @@ -702,7 +708,7 @@ impl QdpEngine { /// - Point to valid GPU memory on the same device as the engine /// - Contain at least `num_samples * sample_size` f32 elements /// - Remain valid for the duration of this call - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_batch_from_gpu_ptr_f32( &self, input_batch_d: *const f32, @@ -726,7 +732,7 @@ impl QdpEngine { /// # Safety /// In addition to the `encode_batch_from_gpu_ptr_f32` requirements, the stream pointer /// must remain valid for the duration of this call. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream( &self, input_batch_d: *const f32, @@ -774,7 +780,7 @@ impl QdpEngine { /// - Point to valid GPU memory on the same device as the engine /// - Contain at least `num_samples * sample_size` f32 elements /// - Remain valid for the duration of this call - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_angle_batch_from_gpu_ptr_f32( &self, input_batch_d: *const f32, @@ -798,7 +804,7 @@ impl QdpEngine { /// # Safety /// In addition to the `encode_angle_batch_from_gpu_ptr_f32` requirements, the stream pointer /// must remain valid for the duration of this call. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_angle_batch_from_gpu_ptr_f32_with_stream( &self, input_batch_d: *const f32, @@ -845,7 +851,7 @@ impl QdpEngine { /// The input pointer must: /// - Point to one valid f32 in GPU memory on the same device as the engine /// - Remain valid for the duration of this call - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_basis_from_gpu_ptr_f32( &self, input_d: *const f32, @@ -866,7 +872,7 @@ impl QdpEngine { /// # Safety /// In addition to the `encode_basis_from_gpu_ptr_f32` requirements, the /// stream pointer must remain valid for the duration of this call. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_basis_from_gpu_ptr_f32_with_stream( &self, input_d: *const f32, @@ -903,7 +909,7 @@ impl QdpEngine { /// - Point to valid GPU memory on the same device as the engine /// - Contain at least `num_samples` f32 elements /// - Remain valid for the duration of this call - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_basis_batch_from_gpu_ptr_f32( &self, input_batch_d: *const f32, @@ -928,7 +934,7 @@ impl QdpEngine { /// # Safety /// In addition to the `encode_basis_batch_from_gpu_ptr_f32` requirements, /// the stream pointer must remain valid for the duration of this call. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_basis_batch_from_gpu_ptr_f32_with_stream( &self, input_batch_d: *const f32, @@ -985,7 +991,7 @@ impl QdpEngine { /// - Point to valid GPU memory on the same device as the engine /// - Contain at least `num_samples * sample_size` elements of the expected dtype /// - Remain valid for the duration of this call - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_batch_from_gpu_ptr( &self, input_batch_d: *const std::ffi::c_void, @@ -1014,7 +1020,7 @@ impl QdpEngine { /// # Safety /// Same as [`encode_batch_from_gpu_ptr`](Self::encode_batch_from_gpu_ptr). Additionally, /// `stream` must be a valid CUDA stream on the same device as the engine, or null. - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] pub unsafe fn encode_batch_from_gpu_ptr_with_stream( &self, input_batch_d: *const std::ffi::c_void, diff --git a/qdp/qdp-core/src/pipeline_runner.rs b/qdp/qdp-core/src/pipeline_runner.rs index 71b95d5d15..ef8e130a7e 100644 --- a/qdp/qdp-core/src/pipeline_runner.rs +++ b/qdp/qdp-core/src/pipeline_runner.rs @@ -836,7 +836,7 @@ pub fn run_throughput_pipeline(config: &PipelineConfig) -> Result Option> { CudaDevice::new(0).ok() } /// Returns a QDP engine, or `None` when GPU-backed engine initialization is unavailable. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[allow(dead_code)] pub fn qdp_engine() -> Option { QdpEngine::new(0).ok() } /// Returns a QDP engine with the requested precision, or `None` when unavailable. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[allow(dead_code)] pub fn qdp_engine_with_precision(precision: Precision) -> Option { QdpEngine::new_with_precision(0, precision).ok() } /// Copies f64 host data to the default CUDA device, or returns `None` when unavailable. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[allow(dead_code)] pub fn copy_f64_to_device(data: &[f64]) -> Option<(Arc, CudaSlice)> { let device = cuda_device()?; @@ -112,7 +112,7 @@ pub fn copy_f64_to_device(data: &[f64]) -> Option<(Arc, CudaSlice Option<(Arc, CudaSlice)> { let device = cuda_device()?; @@ -121,7 +121,7 @@ pub fn copy_f32_to_device(data: &[f32]) -> Option<(Arc, CudaSlice Option<(Arc, CudaSlice)> { let device = cuda_device()?; @@ -130,7 +130,7 @@ pub fn copy_usize_to_device(data: &[usize]) -> Option<(Arc, CudaSlic } /// Asserts a DLPack tensor is 2D with the expected shape. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[allow(dead_code)] pub unsafe fn assert_dlpack_shape_2d(dlpack_ptr: *mut DLManagedTensor, dim0: i64, dim1: i64) { assert!(!dlpack_ptr.is_null(), "DLPack pointer should not be null"); @@ -144,7 +144,7 @@ pub unsafe fn assert_dlpack_shape_2d(dlpack_ptr: *mut DLManagedTensor, dim0: i64 } /// Asserts a DLPack tensor is 2D with the expected shape and then frees it via its deleter. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[allow(dead_code)] pub unsafe fn assert_dlpack_shape_2d_and_delete( dlpack_ptr: *mut DLManagedTensor, @@ -157,7 +157,7 @@ pub unsafe fn assert_dlpack_shape_2d_and_delete( } /// Takes the DLPack deleter from the managed tensor and invokes it exactly once. -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[allow(dead_code)] pub unsafe fn take_deleter_and_delete(dlpack_ptr: *mut DLManagedTensor) { assert!(!dlpack_ptr.is_null(), "DLPack pointer should not be null"); diff --git a/qdp/qdp-core/tests/gpu_angle_encoding.rs b/qdp/qdp-core/tests/gpu_angle_encoding.rs index 874e948c9e..4c3d48cc44 100644 --- a/qdp/qdp-core/tests/gpu_angle_encoding.rs +++ b/qdp/qdp-core/tests/gpu_angle_encoding.rs @@ -20,7 +20,7 @@ // tests inside encoding/angle.rs because they cannot be triggered via a Parquet // file. -#![cfg(target_os = "linux")] +#![cfg(qdp_gpu_platform)] use qdp_core::MahoutError; diff --git a/qdp/qdp-core/tests/gpu_api_workflow.rs b/qdp/qdp-core/tests/gpu_api_workflow.rs index 6c8e651669..47d18b79b7 100644 --- a/qdp/qdp-core/tests/gpu_api_workflow.rs +++ b/qdp/qdp-core/tests/gpu_api_workflow.rs @@ -16,16 +16,16 @@ // API workflow tests: Engine initialization and encoding -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use qdp_core::MahoutError; use qdp_core::QdpEngine; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use qdp_core::gpu::pipeline::run_dual_stream_pipeline_aligned; mod common; #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_engine_initialization() { println!("Testing QdpEngine initialization..."); @@ -46,7 +46,7 @@ fn test_engine_initialization() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encoding_workflow() { println!("Testing amplitude encoding workflow..."); @@ -72,7 +72,7 @@ fn test_amplitude_encoding_workflow() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encoding_async_pipeline() { println!("Testing amplitude encoding async pipeline path..."); @@ -98,7 +98,7 @@ fn test_amplitude_encoding_async_pipeline() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_angle_encoding_async_pipeline() { println!("Testing angle encoding async pipeline path..."); @@ -125,7 +125,7 @@ fn test_angle_encoding_async_pipeline() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_angle_async_alignment_error() { println!("Testing angle async pipeline alignment error..."); @@ -153,7 +153,7 @@ fn test_angle_async_alignment_error() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_batch_dlpack_2d_shape() { println!("Testing batch DLPack 2D shape..."); @@ -211,7 +211,7 @@ fn test_batch_dlpack_2d_shape() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_single_encode_dlpack_2d_shape() { println!("Testing single encode returns 2D shape..."); @@ -249,7 +249,7 @@ fn test_single_encode_dlpack_2d_shape() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_dlpack_device_id() { println!("Testing DLPack device_id propagation..."); @@ -275,12 +275,18 @@ fn test_dlpack_device_id() { "device_id should be 0 for device 0" ); - // Verify device_type is CUDA (kDLCUDA = 2) + // Verify device_type matches the build's backend: kDLCUDA (2) on the + // NVIDIA build, kDLROCM (10) on the HIP build (a ROCm PyTorch's + // from_dlpack requires the ROCm tag). use qdp_core::dlpack::DLDeviceType; - match tensor.device.device_type { - DLDeviceType::kDLCUDA => println!("PASS: Device type is CUDA"), - _ => panic!("Expected CUDA device type"), - } + #[cfg(not(feature = "hip"))] + let expected = DLDeviceType::kDLCUDA; + #[cfg(feature = "hip")] + let expected = DLDeviceType::kDLROCM; + assert_eq!( + tensor.device.device_type, expected, + "DLPack device_type should match the build backend" + ); println!( "PASS: DLPack device_id correctly set to {}", diff --git a/qdp/qdp-core/tests/gpu_basis_encoding.rs b/qdp/qdp-core/tests/gpu_basis_encoding.rs index e8e9244ce9..1dfb545477 100644 --- a/qdp/qdp-core/tests/gpu_basis_encoding.rs +++ b/qdp/qdp-core/tests/gpu_basis_encoding.rs @@ -19,7 +19,7 @@ // pipeline path. The needs_staging_copy() unit test lives inside encoding/basis.rs // because BasisEncoder is pub(crate) and not accessible from here. -#![cfg(target_os = "linux")] +#![cfg(qdp_gpu_platform)] use qdp_core::MahoutError; diff --git a/qdp/qdp-core/tests/gpu_dlpack.rs b/qdp/qdp-core/tests/gpu_dlpack.rs index f84bcd38ef..439e3e5cfc 100644 --- a/qdp/qdp-core/tests/gpu_dlpack.rs +++ b/qdp/qdp-core/tests/gpu_dlpack.rs @@ -71,7 +71,7 @@ mod dlpack_tests { } #[test] - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn test_dlpack_single_shape_f32() { let Some(device) = common::cuda_device() else { return; @@ -97,7 +97,7 @@ mod dlpack_tests { } #[test] - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn test_dlpack_batch_shape_f32() { let Some(device) = common::cuda_device() else { return; @@ -142,7 +142,7 @@ mod dlpack_tests { /// synchronize_stream(CUDA_STREAM_LEGACY) syncs the legacy default stream (Linux + CUDA). #[test] - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] fn test_synchronize_stream_legacy() { if common::cuda_device().is_none() { return; diff --git a/qdp/qdp-core/tests/gpu_fidelity.rs b/qdp/qdp-core/tests/gpu_fidelity.rs index cd38a5c97c..8ef39482b1 100644 --- a/qdp/qdp-core/tests/gpu_fidelity.rs +++ b/qdp/qdp-core/tests/gpu_fidelity.rs @@ -17,7 +17,7 @@ //! Tests for fidelity / trace-distance metrics and F32 vs F64 precision //! comparison across different qubit counts. -#![cfg(target_os = "linux")] +#![cfg(qdp_gpu_platform)] use approx::assert_relative_eq; use qdp_core::gpu::metrics::{ diff --git a/qdp/qdp-core/tests/gpu_iqp_encoding.rs b/qdp/qdp-core/tests/gpu_iqp_encoding.rs index 4954ab5b38..c82e292dc1 100644 --- a/qdp/qdp-core/tests/gpu_iqp_encoding.rs +++ b/qdp/qdp-core/tests/gpu_iqp_encoding.rs @@ -36,7 +36,7 @@ fn iqp_z_data_len(num_qubits: usize) -> usize { // ============================================================================= #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_zero_qubits_rejected() { println!("Testing IQP zero qubits rejection..."); @@ -61,7 +61,7 @@ fn test_iqp_zero_qubits_rejected() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_max_qubits_exceeded() { println!("Testing IQP max qubits (>{MAX_QUBITS}) rejection..."); @@ -87,7 +87,7 @@ fn test_iqp_max_qubits_exceeded() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_wrong_data_length() { println!("Testing IQP wrong data length rejection..."); @@ -124,7 +124,7 @@ fn test_iqp_wrong_data_length() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_z_wrong_data_length() { println!("Testing IQP-Z wrong data length rejection..."); @@ -153,7 +153,7 @@ fn test_iqp_z_wrong_data_length() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_nan_value_rejected() { println!("Testing IQP NaN value rejection..."); @@ -181,7 +181,7 @@ fn test_iqp_nan_value_rejected() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_infinity_value_rejected() { println!("Testing IQP infinity value rejection..."); @@ -213,7 +213,7 @@ fn test_iqp_infinity_value_rejected() { // ============================================================================= #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_full_encoding_workflow() { println!("Testing IQP full encoding workflow..."); @@ -259,7 +259,7 @@ fn test_iqp_full_encoding_workflow() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_z_encoding_workflow() { println!("Testing IQP-Z encoding workflow..."); @@ -304,7 +304,7 @@ fn test_iqp_z_encoding_workflow() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_single_qubit() { println!("Testing IQP single qubit encoding..."); @@ -344,7 +344,7 @@ fn test_iqp_single_qubit() { // ============================================================================= #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_batch_encoding() { println!("Testing IQP batch encoding..."); @@ -394,7 +394,7 @@ fn test_iqp_batch_encoding() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_z_batch_encoding() { println!("Testing IQP-Z batch encoding..."); @@ -444,7 +444,7 @@ fn test_iqp_z_batch_encoding() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_batch_wrong_sample_size() { println!("Testing IQP batch wrong sample_size rejection..."); @@ -480,7 +480,7 @@ fn test_iqp_batch_wrong_sample_size() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_batch_data_length_mismatch() { println!("Testing IQP batch data length mismatch rejection..."); @@ -511,7 +511,7 @@ fn test_iqp_batch_data_length_mismatch() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_batch_nan_in_sample() { println!("Testing IQP batch NaN value rejection..."); @@ -571,7 +571,7 @@ fn test_iqp_data_length_calculations() { // ============================================================================= #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_fwt_threshold_boundary() { println!("Testing IQP FWT threshold boundary (n=4, where FWT kicks in)..."); @@ -614,7 +614,7 @@ fn test_iqp_fwt_threshold_boundary() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_fwt_larger_qubit_counts() { println!("Testing IQP FWT with larger qubit counts (n=5,6,7,8)..."); @@ -660,7 +660,7 @@ fn test_iqp_fwt_larger_qubit_counts() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_z_fwt_correctness() { println!("Testing IQP-Z FWT correctness for various qubit counts..."); @@ -702,7 +702,7 @@ fn test_iqp_z_fwt_correctness() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_fwt_batch_various_sizes() { println!("Testing IQP FWT batch encoding with various qubit counts..."); @@ -752,7 +752,7 @@ fn test_iqp_fwt_batch_various_sizes() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_fwt_zero_parameters_identity() { println!("Testing IQP FWT with zero parameters produces |0⟩ state..."); @@ -795,7 +795,7 @@ fn test_iqp_fwt_zero_parameters_identity() { // ============================================================================= #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_encoder_via_factory() { println!("Testing IQP encoder creation via Encoding::from_str_ci / encode..."); @@ -834,7 +834,7 @@ fn test_iqp_encoder_via_factory() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_iqp_z_encoder_via_factory() { println!("Testing IQP-Z encoder creation via encode..."); diff --git a/qdp/qdp-core/tests/gpu_memory_safety.rs b/qdp/qdp-core/tests/gpu_memory_safety.rs index 33e937a37d..fa81972589 100644 --- a/qdp/qdp-core/tests/gpu_memory_safety.rs +++ b/qdp/qdp-core/tests/gpu_memory_safety.rs @@ -21,7 +21,7 @@ use qdp_core::Precision; mod common; #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_memory_pressure() { println!("Testing memory pressure (leak detection)"); println!("Running 100 iterations of encode + free"); @@ -51,7 +51,7 @@ fn test_memory_pressure() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_multiple_concurrent_states() { println!("Testing multiple concurrent state vectors..."); @@ -81,7 +81,7 @@ fn test_multiple_concurrent_states() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_dlpack_tensor_metadata_default() { println!("Testing DLPack tensor metadata..."); @@ -131,7 +131,7 @@ fn test_dlpack_tensor_metadata_default() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_dlpack_tensor_metadata_f64() { println!("Testing DLPack tensor metadata..."); diff --git a/qdp/qdp-core/tests/gpu_norm_f32.rs b/qdp/qdp-core/tests/gpu_norm_f32.rs index af1150881b..294d093bdd 100644 --- a/qdp/qdp-core/tests/gpu_norm_f32.rs +++ b/qdp/qdp-core/tests/gpu_norm_f32.rs @@ -18,11 +18,11 @@ // Tests for GPU-side f32 L2 norm helper in AmplitudeEncoder. // -#![cfg(target_os = "linux")] +#![cfg(qdp_gpu_platform)] use approx::assert_relative_eq; -use cudarc::driver::DevicePtr; use qdp_core::gpu::encodings::amplitude::AmplitudeEncoder; +use qdp_core::gpu_rt::DevicePtr; mod common; diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs index 48ba65f847..b5fed7c7fa 100644 --- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs +++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs @@ -16,9 +16,13 @@ // Unit and integration tests for encode_from_gpu_ptr and encode_batch_from_gpu_ptr. -#![cfg(target_os = "linux")] +#![cfg(qdp_gpu_platform)] +// The `stream.stream as *mut c_void` cast is a real conversion on the CUDA +// backend (cudarc's CUstream is a distinct pointer type) but a no-op on HIP, +// where the stream field is already *mut c_void; silence the HIP-only redundancy. +#![cfg_attr(feature = "hip", allow(clippy::unnecessary_cast))] -use cudarc::driver::{DevicePtr, DeviceSlice}; +use qdp_core::gpu_rt::{DevicePtr, DeviceSlice}; use qdp_core::{MahoutError, Precision, QdpEngine}; use std::ffi::c_void; diff --git a/qdp/qdp-core/tests/gpu_validation.rs b/qdp/qdp-core/tests/gpu_validation.rs index 291f92dce8..4f0d1a879e 100644 --- a/qdp/qdp-core/tests/gpu_validation.rs +++ b/qdp/qdp-core/tests/gpu_validation.rs @@ -22,7 +22,7 @@ use qdp_core::gpu::encodings::MAX_QUBITS; mod common; #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_input_validation_invalid_strategy() { println!("Testing invalid strategy name rejection..."); @@ -48,7 +48,7 @@ fn test_input_validation_invalid_strategy() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_input_validation_qubit_mismatch() { println!("Testing qubit size validation..."); @@ -78,7 +78,7 @@ fn test_input_validation_qubit_mismatch() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_input_validation_zero_qubits() { println!("Testing zero qubits rejection..."); @@ -104,7 +104,7 @@ fn test_input_validation_zero_qubits() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_input_validation_max_qubits() { println!("Testing maximum qubit limit ({MAX_QUBITS})..."); @@ -130,7 +130,7 @@ fn test_input_validation_max_qubits() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_input_validation_batch_zero_samples() { println!("Testing zero num_samples rejection..."); @@ -155,7 +155,7 @@ fn test_input_validation_batch_zero_samples() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_empty_data() { println!("Testing empty data rejection..."); @@ -178,7 +178,7 @@ fn test_empty_data() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_zero_norm_data() { println!("Testing zero-norm data rejection..."); @@ -210,7 +210,7 @@ fn test_error_types() { } #[test] -#[cfg(not(target_os = "linux"))] +#[cfg(not(qdp_gpu_platform))] fn test_non_linux_graceful_failure() { let result = QdpEngine::new(0); assert!(result.is_err()); diff --git a/qdp/qdp-kernels/Cargo.toml b/qdp/qdp-kernels/Cargo.toml index dcc7c0ec05..6da30e0737 100644 --- a/qdp/qdp-kernels/Cargo.toml +++ b/qdp/qdp-kernels/Cargo.toml @@ -4,11 +4,21 @@ version.workspace = true edition.workspace = true [dependencies] -cudarc = { workspace = true } +cudarc = { workspace = true, optional = true } [build-dependencies] cc = { workspace = true } +[features] +# `cuda` and `hip` are mutually exclusive; pick exactly one vendor backend. +# If both end up enabled (e.g. via workspace feature unification), `hip` takes +# precedence: kernels build for HIP and cudarc is compiled but unused. +# Default build: NVIDIA CUDA via cudarc + nvcc-compiled kernels. +default = ["cuda"] +cuda = ["dep:cudarc"] +# AMD build: hipcc-compiled kernels + the in-crate HIP device traits; no cudarc. +hip = [] + [lib] name = "qdp_kernels" crate-type = ["rlib", "staticlib"] diff --git a/qdp/qdp-kernels/build.rs b/qdp/qdp-kernels/build.rs index def59d6935..ae867e6b06 100644 --- a/qdp/qdp-kernels/build.rs +++ b/qdp/qdp-kernels/build.rs @@ -26,6 +26,20 @@ use std::env; use std::process::Command; +const KERNEL_SOURCES: &[&str] = &[ + "src/amplitude.cu", + "src/basis.cu", + "src/angle.cu", + "src/validation.cu", + "src/iqp.cu", + "src/phase.cu", +]; + +/// Default AMD GPU target used only when QDP_HIP_ARCH_LIST is unset. Never hardcode +/// this as the sole arch in a way that overrides the env list: other AMD targets +/// (gfx1100, gfx1151) must build the same source by setting QDP_HIP_ARCH_LIST alone. +const DEFAULT_HIP_ARCH: &str = "gfx90a"; + const DEFAULT_CUBIN_ARCHES: &[&str] = &["75", "80", "86", "89", "90", "100", "120"]; const DEFAULT_PTX_CANDIDATES: &[&str] = &["120", "100", "90", "89", "86", "80", "75"]; const LEGACY_FALLBACK_ARCHES: &[&str] = &["75", "80", "86"]; @@ -154,9 +168,122 @@ fn apply_default_arch_targets(build: &mut cc::Build) { } } +fn qdp_use_hip_env() -> bool { + env::var("QDP_USE_HIP") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true") || v.eq_ignore_ascii_case("yes")) + .unwrap_or(false) +} + +fn hip_requested() -> bool { + cfg!(feature = "hip") || qdp_use_hip_env() +} + +/// Reject a kernel/host backend mismatch before it produces a broken binary. +/// +/// QDP_USE_HIP=1 flips the KERNEL build to hipcc, while the HOST runtime is +/// chosen by the `hip` Cargo feature (cudarc when off, the HIP shim when on). +/// Setting QDP_USE_HIP=1 with the `hip` feature OFF would build AMD device code +/// (hipcc) against the cudarc host backend, which fails or misbehaves at runtime; +/// reject that loudly. The reverse (the `hip` feature on, QDP_USE_HIP unset) is +/// NOT a mismatch: `hip_requested()` already builds the kernels for HIP whenever +/// the feature is on, so the host and kernels agree -- no panic there. +fn check_hip_consistency() { + let env_hip = qdp_use_hip_env(); + let feature_hip = env::var("CARGO_FEATURE_HIP").is_ok(); + if env_hip && !feature_hip { + panic!( + "QDP_USE_HIP is set but the `hip` Cargo feature is off: this would \ + build AMD kernels (hipcc) against the cudarc host backend. \ + Add `--features hip` (and `--no-default-features` to drop cudarc)." + ); + } +} + +/// Compile the kernels with hipcc for AMD GPUs. +/// +/// Mirrors the CUDA branch in spirit (same six .cu sources, same `src/` include +/// for kernel_config.h) but: targets are AMD `--offload-arch` values from +/// QDP_HIP_ARCH_LIST (default gfx90a only when unset), the hip_compat/ shim dir +/// is added to the include path so the sources' `` / +/// `` / `` resolve to HIP equivalents, and the +/// link library is amdhip64 instead of cudart. The CUDA path is untouched. +fn build_hip() { + let hipcc = env::var("QDP_HIPCC").unwrap_or_else(|_| "hipcc".to_string()); + + // Degrade gracefully when the ROCm toolchain is absent, mirroring the + // CUDA branch's nvcc-not-found path. This lets `cargo check`/`clippy` + // (including `--all-features`, which turns the `hip` feature on) succeed in + // a ROCm-less CI runner: emit the qdp_no_cuda stub cfg and skip compilation + // instead of letting hipcc fail the build. + let has_hipcc = Command::new(&hipcc).arg("--version").output().is_ok(); + if !has_hipcc { + println!("cargo:rustc-cfg=qdp_no_cuda"); + println!( + "cargo:warning=ROCm/hipcc not found ('{hipcc}' not runnable). Skipping kernel compilation." + ); + println!( + "cargo:warning=This is expected in environments without the ROCm toolkit installed." + ); + println!( + "cargo:warning=The project will build against host stubs, but GPU functionality will not be available." + ); + return; + } + + let mut build = cc::Build::new(); + build.compiler(&hipcc); + build.cpp(true); + // hip_compat/ first so its cuda_runtime.h / cuComplex.h / vector_types.h win; + // src/ for kernel_config.h and kernel_compat.h. + build.include("hip_compat"); + build.include("src"); + build.flag("-std=c++17"); + build.flag("-x").flag("hip"); + + let arch_list = env::var("QDP_HIP_ARCH_LIST").unwrap_or_else(|_| DEFAULT_HIP_ARCH.to_string()); + let mut saw_arch = false; + for entry in arch_list.split(',') { + let arch = entry.trim(); + if arch.is_empty() { + continue; + } + build.flag(format!("--offload-arch={arch}")); + saw_arch = true; + } + if !saw_arch { + build.flag(format!("--offload-arch={DEFAULT_HIP_ARCH}")); + } + + for src in KERNEL_SOURCES { + build.file(src); + } + build.compile("kernels"); + + // Link the HIP runtime. Honor an explicit ROCM_PATH; otherwise rely on the + // default loader search path (hipcc-built objects pull libamdhip64 there). + if let Ok(rocm) = env::var("ROCM_PATH") { + println!("cargo:rustc-link-search=native={rocm}/lib"); + } + println!("cargo:rustc-link-lib=amdhip64"); +} + fn main() { // Let rustc know about our build-script-defined cfg flags (avoids `unexpected_cfgs` warnings). println!("cargo::rustc-check-cfg=cfg(qdp_no_cuda)"); + // Emit qdp_gpu_platform when building for a GPU-capable OS (Linux always; + // Windows when the hip feature is on via QDP_USE_HIP=1 / TheRock ROCm). + println!("cargo::rustc-check-cfg=cfg(qdp_gpu_platform)"); + + // Reject a kernel/host backend mismatch (QDP_USE_HIP vs the `hip` feature) + // before compiling anything. + check_hip_consistency(); + + let is_linux = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux"); + let hip_feature = std::env::var("CARGO_FEATURE_HIP").is_ok(); + let is_windows = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows"); + if is_linux || (is_windows && hip_feature) { + println!("cargo::rustc-cfg=qdp_gpu_platform"); + } // Tell Cargo to rerun this script if the kernel sources change println!("cargo:rerun-if-changed=src/amplitude.cu"); @@ -167,7 +294,21 @@ fn main() { println!("cargo:rerun-if-changed=src/phase.cu"); println!("cargo:rerun-if-env-changed=QDP_NO_CUDA"); println!("cargo:rerun-if-env-changed=QDP_CUDA_ARCH_LIST"); + println!("cargo:rerun-if-env-changed=QDP_USE_HIP"); + println!("cargo:rerun-if-env-changed=QDP_HIP_ARCH_LIST"); + println!("cargo:rerun-if-env-changed=QDP_HIPCC"); println!("cargo:rerun-if-changed=src/kernel_config.h"); + println!("cargo:rerun-if-changed=src/kernel_compat.h"); + println!("cargo:rerun-if-changed=hip_compat/cuda_runtime.h"); + println!("cargo:rerun-if-changed=hip_compat/cuComplex.h"); + println!("cargo:rerun-if-changed=hip_compat/vector_types.h"); + + // AMD/HIP build path: compile the same .cu sources with hipcc. Gated by the + // `hip` Cargo feature or QDP_USE_HIP=1; the CUDA path below is unchanged when off. + if hip_requested() { + build_hip(); + return; + } // Check if CUDA is available by looking for nvcc let force_no_cuda = env::var("QDP_NO_CUDA") diff --git a/qdp/qdp-kernels/hip_compat/cuComplex.h b/qdp/qdp-kernels/hip_compat/cuComplex.h new file mode 100644 index 0000000000..0859771e4a --- /dev/null +++ b/qdp/qdp-kernels/hip_compat/cuComplex.h @@ -0,0 +1,46 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// HIP forwarding shim for (HIP build path only; see +// cuda_runtime.h in this directory for how it is selected). hipcc does not +// ship a ; HIP's provides the same complex +// layout and helpers under hip* names. The aliases below let the .cu sources +// keep their cuComplex / cuDoubleComplex / make_cu* / cuC* spellings unchanged. + +#pragma once +#include + +typedef hipDoubleComplex cuDoubleComplex; +typedef hipFloatComplex cuComplex; + +#define make_cuDoubleComplex make_hipDoubleComplex +#define make_cuComplex make_hipFloatComplex + +// The kernels call cuCreal/cuCimag/cuCadd/cuCsub only on cuDoubleComplex, so +// alias to HIP's double-precision helpers (hipC*), not the float (hipC*f) set. +#define cuCreal cuCreal_double +#define cuCimag cuCimag_double +#define cuCadd cuCadd_double +#define cuCsub cuCsub_double +#define cuCmul cuCmul_double +#define cuConj cuConj_double + +static __host__ __device__ inline double cuCreal_double(hipDoubleComplex z) { return hipCreal(z); } +static __host__ __device__ inline double cuCimag_double(hipDoubleComplex z) { return hipCimag(z); } +static __host__ __device__ inline hipDoubleComplex cuCadd_double(hipDoubleComplex a, hipDoubleComplex b) { return hipCadd(a, b); } +static __host__ __device__ inline hipDoubleComplex cuCsub_double(hipDoubleComplex a, hipDoubleComplex b) { return hipCsub(a, b); } +static __host__ __device__ inline hipDoubleComplex cuCmul_double(hipDoubleComplex a, hipDoubleComplex b) { return hipCmul(a, b); } +static __host__ __device__ inline hipDoubleComplex cuConj_double(hipDoubleComplex z) { return hipConj(z); } diff --git a/qdp/qdp-kernels/hip_compat/cuda_runtime.h b/qdp/qdp-kernels/hip_compat/cuda_runtime.h new file mode 100644 index 0000000000..144f1deb24 --- /dev/null +++ b/qdp/qdp-kernels/hip_compat/cuda_runtime.h @@ -0,0 +1,45 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// HIP forwarding shim for . +// +// This file exists ONLY on the HIP build path: qdp-kernels/build.rs adds the +// hip_compat/ directory to the include search path exclusively when compiling +// with hipcc, so a CUDA build never sees it and pulls the real toolkit header +// instead. The .cu sources keep their original `#include ` +// spelling; this header maps the small set of cuda* runtime symbols the +// kernels reference to their hip* equivalents (HIP error codes match CUDA's +// numerically for these codes). + +#pragma once +#include + +// MSVC does not define POSIX math constants unless _USE_MATH_DEFINES +// is set before the first system include. Provide the one the kernels use. +#ifndef M_SQRT1_2 +#define M_SQRT1_2 0.7071067811865475244008443621 +#endif + +#define cudaError_t hipError_t +#define cudaSuccess hipSuccess +#define cudaErrorInvalidValue hipErrorInvalidValue +#define cudaStream_t hipStream_t +#define cudaGetLastError hipGetLastError +#define cudaGetDevice hipGetDevice +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#define cudaDevAttrMaxGridDimX hipDeviceAttributeMaxGridDimX +#define cudaMemsetAsync hipMemsetAsync +#define cudaMalloc hipMalloc diff --git a/qdp/qdp-kernels/hip_compat/vector_types.h b/qdp/qdp-kernels/hip_compat/vector_types.h new file mode 100644 index 0000000000..a48bbe8cd3 --- /dev/null +++ b/qdp/qdp-kernels/hip_compat/vector_types.h @@ -0,0 +1,23 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// HIP forwarding shim for (HIP build path only). HIP defines +// double2 / float2 etc. via , which is already pulled in by +// the cuda_runtime.h shim in this directory, so this header only needs to +// exist for the `#include ` line to resolve. + +#pragma once +#include diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu index 57fa4320cf..a944f2c72e 100644 --- a/qdp/qdp-kernels/src/amplitude.cu +++ b/qdp/qdp-kernels/src/amplitude.cu @@ -22,6 +22,7 @@ #include #include #include "kernel_config.h" +#include "kernel_compat.h" __global__ void amplitude_encode_kernel( const double* __restrict__ input, @@ -99,7 +100,7 @@ __global__ void amplitude_encode_kernel_f32( // Warp-level reduction for sum using shuffle instructions __device__ __forceinline__ double warp_reduce_sum(double val) { for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - val += __shfl_down_sync(0xffffffff, val, offset); + val += __shfl_down_sync(QDP_FULL_WARP_MASK, val, offset); } return val; } @@ -107,7 +108,7 @@ __device__ __forceinline__ double warp_reduce_sum(double val) { // Warp-level reduction for sum using shuffle instructions (float32) __device__ __forceinline__ float warp_reduce_sum_f32(float val) { for (int offset = warpSize / 2; offset > 0; offset >>= 1) { - val += __shfl_down_sync(0xffffffff, val, offset); + val += __shfl_down_sync(QDP_FULL_WARP_MASK, val, offset); } return val; } @@ -116,7 +117,10 @@ __device__ __forceinline__ float warp_reduce_sum_f32(float val) { __device__ __forceinline__ double block_reduce_sum(double val) { __shared__ double shared[32]; // supports up to 1024 threads (32 warps) int lane = threadIdx.x & (warpSize - 1); - int warp_id = threadIdx.x >> 5; + // warpSize is 32 on NVIDIA/RDNA and 64 on CDNA (gfx90a); derive the warp id + // from it rather than a hardcoded >> 5 so the per-warp partial lands in the + // slot the final reduction reads on every wave width. + int warp_id = threadIdx.x / warpSize; val = warp_reduce_sum(val); if (lane == 0) { @@ -137,7 +141,10 @@ __device__ __forceinline__ double block_reduce_sum(double val) { __device__ __forceinline__ float block_reduce_sum_f32(float val) { __shared__ float shared[32]; // supports up to 1024 threads (32 warps) int lane = threadIdx.x & (warpSize - 1); - int warp_id = threadIdx.x >> 5; + // warpSize is 32 on NVIDIA/RDNA and 64 on CDNA (gfx90a); derive the warp id + // from it rather than a hardcoded >> 5 so the per-warp partial lands in the + // slot the final reduction reads on every wave width. + int warp_id = threadIdx.x / warpSize; val = warp_reduce_sum_f32(val); if (lane == 0) { diff --git a/qdp/qdp-kernels/src/device.rs b/qdp/qdp-kernels/src/device.rs new file mode 100644 index 0000000000..563e965deb --- /dev/null +++ b/qdp/qdp-kernels/src/device.rs @@ -0,0 +1,443 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Device runtime surface, vendor-selected at compile time. +//! +//! `cudarc` is CUDA-only with no ROCm backend, so the AMD build cannot depend +//! on it. This module is the seam: on the default (`cuda`) feature it simply +//! re-exports the slice of `cudarc::driver` the crates use; on the `hip` +//! feature it provides a thin HIP-runtime shim with the SAME type names and +//! method signatures, so every call site (`device.alloc`, `htod_sync_copy`, +//! `slice.device_ptr()`, ...) compiles unchanged on both vendors. +//! +//! The marker traits `DeviceRepr` / `ValidAsZeroBits` live here (not in +//! qdp-core) because qdp-kernels implements them on its complex structs and is +//! the lowest crate in the workspace. + +#[cfg(not(any(feature = "cuda", feature = "hip")))] +compile_error!("qdp-kernels requires exactly one of the `cuda` or `hip` features"); + +#[cfg(all(feature = "cuda", not(feature = "hip")))] +pub use cudarc::driver::{ + CudaDevice, CudaSlice, DevicePtr, DevicePtrMut, DeviceRepr, DeviceSlice, ValidAsZeroBits, + safe::CudaStream, +}; + +#[cfg(feature = "hip")] +pub use hip::{ + CudaDevice, CudaSlice, CudaStream, DevicePtr, DevicePtrMut, DeviceRepr, DeviceSlice, + DriverError, ValidAsZeroBits, +}; + +#[cfg(feature = "hip")] +mod hip { + use std::ffi::c_void; + use std::marker::PhantomData; + use std::sync::Arc; + + // ---- HIP runtime FFI (subset used by the device abstraction) ---- + // hip* names map 1:1 to the cuda* runtime entry points cudarc wraps; HIP + // error codes match CUDA's numerically for the codes we surface. + #[allow(non_camel_case_types)] + type hipError_t = i32; + + const HIP_SUCCESS: hipError_t = 0; + const HIP_MEMCPY_HOST_TO_DEVICE: u32 = 1; + const HIP_MEMCPY_DEVICE_TO_HOST: u32 = 2; + // hipStreamNonBlocking: the new stream does not implicitly synchronize with + // the NULL/default stream, matching cudarc's fork_default_stream. + const HIP_STREAM_NON_BLOCKING: u32 = 1; + + unsafe extern "C" { + fn hipSetDevice(device: i32) -> hipError_t; + fn hipGetDeviceCount(count: *mut i32) -> hipError_t; + fn hipMalloc(ptr: *mut *mut c_void, size: usize) -> hipError_t; + fn hipFree(ptr: *mut c_void) -> hipError_t; + fn hipMemset(ptr: *mut c_void, value: i32, size: usize) -> hipError_t; + fn hipMemcpy(dst: *mut c_void, src: *const c_void, size: usize, kind: u32) -> hipError_t; + fn hipDeviceSynchronize() -> hipError_t; + fn hipStreamCreateWithFlags(stream: *mut *mut c_void, flags: u32) -> hipError_t; + fn hipStreamDestroy(stream: *mut c_void) -> hipError_t; + fn hipStreamSynchronize(stream: *mut c_void) -> hipError_t; + } + + /// Mirrors the role of `cudarc::driver::DriverError`: an opaque, `Debug`able + /// wrapper over a runtime status code. Call sites only ever `{:?}`-format it. + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + pub struct DriverError(pub hipError_t); + + fn check(code: hipError_t) -> Result<(), DriverError> { + if code == HIP_SUCCESS { + Ok(()) + } else { + Err(DriverError(code)) + } + } + + /// Synchronize the NULL/default stream so its prior work is ordered before any + /// other stream observes the affected buffers. + /// + /// The blocking shim copies (htod/alloc_zeros) issue hipMemcpy/hipMemset on the + /// default (NULL) stream. CUDA's legacy default stream is synchronizing, so on + /// NVIDIA that work is implicitly ordered before a kernel launched on a forked + /// non-blocking stream that reads the same buffer. HIP's default stream is NOT + /// synchronizing relative to a hipStreamNonBlocking stream, so without this an + /// encoder that sets up input/output on the default stream and then launches + /// the norm/encode kernels on the caller's forked stream would race the setup + /// (the kernel reads stale/zero data). A default-stream synchronize after the + /// blocking copy restores the CUDA-equivalent ordering while preserving the + /// dual-stream copy/compute overlap (which uses async copies on explicit + /// streams, not these blocking paths). + fn sync_default_stream() -> Result<(), DriverError> { + unsafe { check(hipStreamSynchronize(std::ptr::null_mut())) } + } + + /// Marker: type is safe to byte-copy to/from the device. Mirrors + /// `cudarc::driver::DeviceRepr`. + /// + /// # Safety + /// Implementor must be `#[repr(C)]`/`#[repr(transparent)]` plain-old-data + /// with no padding that would expose uninitialized bytes. + pub unsafe trait DeviceRepr: Copy {} + unsafe impl DeviceRepr for f32 {} + unsafe impl DeviceRepr for f64 {} + unsafe impl DeviceRepr for i32 {} + unsafe impl DeviceRepr for u32 {} + unsafe impl DeviceRepr for usize {} + + /// Marker: an all-zero bit pattern is a valid value (enables alloc_zeros). + /// Mirrors `cudarc::driver::ValidAsZeroBits`. + /// + /// # Safety + /// All-zero bytes must be a valid inhabitant of the type. + pub unsafe trait ValidAsZeroBits {} + unsafe impl ValidAsZeroBits for f32 {} + unsafe impl ValidAsZeroBits for f64 {} + unsafe impl ValidAsZeroBits for i32 {} + unsafe impl ValidAsZeroBits for u32 {} + unsafe impl ValidAsZeroBits for usize {} + + /// Raw device-pointer accessors, matching cudarc's traits. The returned + /// reference is to the device address stored as `u64`, so the existing + /// `*slice.device_ptr() as *mut T` call sites work verbatim. + pub trait DevicePtr { + fn device_ptr(&self) -> &u64; + } + pub trait DevicePtrMut { + fn device_ptr_mut(&mut self) -> &mut u64; + } + /// Length accessor, matching cudarc's `DeviceSlice`. + pub trait DeviceSlice { + fn len(&self) -> usize; + fn is_empty(&self) -> bool { + self.len() == 0 + } + } + + /// Owned device allocation; frees on drop. Stand-in for `cudarc::CudaSlice`. + pub struct CudaSlice { + ptr: u64, + len: usize, + _device: Arc, + _marker: PhantomData, + } + + // The device address is just an integer; ownership/lifetime is enforced by + // the held Arc. Safe to move across threads like cudarc's slice. + unsafe impl Send for CudaSlice {} + unsafe impl Sync for CudaSlice {} + + impl CudaSlice { + fn raw_ptr(&self) -> *mut c_void { + self.ptr as *mut c_void + } + + /// Mutable sub-view `[range.start, range.end)`. Mirrors + /// `cudarc::CudaSlice::slice_mut`; the returned view borrows this slice + /// and is itself a `DevicePtrMut`/`DeviceSlice` copy target. + pub fn slice_mut(&mut self, range: std::ops::Range) -> CudaViewMut<'_, T> { + assert!( + range.start <= range.end && range.end <= self.len, + "slice_mut out of bounds" + ); + let offset_ptr = self.ptr + (range.start * std::mem::size_of::()) as u64; + CudaViewMut { + ptr: offset_ptr, + len: range.end - range.start, + _parent: PhantomData, + } + } + } + + /// Borrowed mutable view into a `CudaSlice`, returned by `slice_mut`. + pub struct CudaViewMut<'a, T> { + ptr: u64, + len: usize, + _parent: PhantomData<&'a mut T>, + } + + impl DevicePtr for CudaViewMut<'_, T> { + fn device_ptr(&self) -> &u64 { + &self.ptr + } + } + impl DevicePtrMut for CudaViewMut<'_, T> { + fn device_ptr_mut(&mut self) -> &mut u64 { + &mut self.ptr + } + } + impl DeviceSlice for CudaViewMut<'_, T> { + fn len(&self) -> usize { + self.len + } + } + + impl DevicePtr for CudaSlice { + fn device_ptr(&self) -> &u64 { + &self.ptr + } + } + impl DevicePtrMut for CudaSlice { + fn device_ptr_mut(&mut self) -> &mut u64 { + &mut self.ptr + } + } + impl DeviceSlice for CudaSlice { + fn len(&self) -> usize { + self.len + } + } + + impl Drop for CudaSlice { + fn drop(&mut self) { + if self.ptr != 0 { + // hipFree releases on the calling thread's current device, so + // re-bind the owning device first (cudarc does the same in Drop): + // on multi-GPU a different device may be current, which would + // otherwise free against the wrong device. Best-effort -- Drop + // cannot report an error, so a failed bind is swallowed. + let _ = self._device.bind(); + unsafe { + let _ = hipFree(self.raw_ptr()); + } + } + } + } + + /// A HIP stream. The public `stream` field mirrors cudarc's + /// `CudaStream { stream: sys::CUstream, .. }` so existing call sites that do + /// `ctx.stream_compute.stream as *mut c_void` keep working. + pub struct CudaStream { + pub stream: *mut c_void, + _device: Arc, + } + + unsafe impl Send for CudaStream {} + unsafe impl Sync for CudaStream {} + + impl Drop for CudaStream { + fn drop(&mut self) { + if !self.stream.is_null() { + unsafe { + let _ = hipStreamDestroy(self.stream); + } + } + } + } + + /// HIP device handle. Stand-in for `cudarc::CudaDevice`; created via + /// `CudaDevice::new(ordinal)` and shared as `Arc` exactly like + /// cudarc (whose `new` already returns the `Arc`). + pub struct CudaDevice { + ordinal: usize, + } + + impl CudaDevice { + /// Select device `ordinal` and return a shared handle, or an error if no + /// such device exists. Matches `cudarc::CudaDevice::new`. + pub fn new(ordinal: usize) -> Result, DriverError> { + unsafe { + let mut count: i32 = 0; + check(hipGetDeviceCount(&mut count))?; + if ordinal as i32 >= count { + return Err(DriverError(101)); // hipErrorInvalidDevice + } + check(hipSetDevice(ordinal as i32))?; + } + Ok(Arc::new(Self { ordinal })) + } + + pub fn ordinal(&self) -> usize { + self.ordinal + } + + fn bind(&self) -> Result<(), DriverError> { + unsafe { check(hipSetDevice(self.ordinal as i32)) } + } + + /// Allocate `len` uninitialized elements of `T` on the device. + /// + /// # Safety + /// Contents are uninitialized until written, mirroring `cudarc`'s + /// `unsafe fn alloc`. + pub unsafe fn alloc(self: &Arc, len: usize) -> Result, DriverError> { + self.bind()?; + let bytes = len.saturating_mul(std::mem::size_of::()); + let mut ptr: *mut c_void = std::ptr::null_mut(); + unsafe { + check(hipMalloc(&mut ptr, bytes.max(1)))?; + } + Ok(CudaSlice { + ptr: ptr as u64, + len, + _device: Arc::clone(self), + _marker: PhantomData, + }) + } + + /// Allocate `len` zero-initialized elements of `T` on the device. + pub fn alloc_zeros( + self: &Arc, + len: usize, + ) -> Result, DriverError> { + let slice = unsafe { self.alloc::(len)? }; + let bytes = len.saturating_mul(std::mem::size_of::()); + if bytes > 0 { + unsafe { + check(hipMemset(slice.raw_ptr(), 0, bytes))?; + sync_default_stream()?; + } + } + Ok(slice) + } + + /// Copy a host slice to a freshly allocated device buffer (blocking). + pub fn htod_sync_copy( + self: &Arc, + src: &[T], + ) -> Result, DriverError> { + let mut slice = unsafe { self.alloc::(src.len())? }; + self.htod_sync_copy_into(src, &mut slice)?; + Ok(slice) + } + + /// Copy an owned host Vec to a freshly allocated device buffer. cudarc's + /// `htod_copy` keeps the Vec alive until an async copy completes; our + /// copy is synchronous (blocking hipMemcpy), so the Vec can be dropped + /// on return with identical observable behavior. + pub fn htod_copy( + self: &Arc, + src: Vec, + ) -> Result, DriverError> { + self.htod_sync_copy(&src) + } + + /// Copy a host slice into an existing device buffer or sub-view + /// (blocking). Accepts any `DevicePtrMut` target so both `CudaSlice` and + /// the `slice_mut` view work, matching cudarc's generic destination. + pub fn htod_sync_copy_into + DeviceSlice>( + self: &Arc, + src: &[T], + dst: &mut D, + ) -> Result<(), DriverError> { + assert_eq!( + dst.len(), + src.len(), + "htod_sync_copy_into: dst.len() != src.len()" + ); + self.bind()?; + let bytes = std::mem::size_of_val(src); + if bytes > 0 { + unsafe { + check(hipMemcpy( + (*dst.device_ptr_mut()) as *mut c_void, + src.as_ptr() as *const c_void, + bytes, + HIP_MEMCPY_HOST_TO_DEVICE, + ))?; + sync_default_stream()?; + } + } + Ok(()) + } + + /// Copy a device buffer back to a freshly allocated host Vec (blocking). + /// + /// Matches cudarc's bound of just `DeviceRepr` (no `Default`): the Vec is + /// allocated uninitialized and every byte is written by the copy before + /// its length is set, which is sound because `DeviceRepr` is plain data. + pub fn dtoh_sync_copy( + self: &Arc, + src: &CudaSlice, + ) -> Result, DriverError> { + self.bind()?; + let len = src.len; + let mut out: Vec = Vec::with_capacity(len); + let bytes = len.saturating_mul(std::mem::size_of::()); + if bytes > 0 { + unsafe { + check(hipMemcpy( + out.as_mut_ptr() as *mut c_void, + src.raw_ptr() as *const c_void, + bytes, + HIP_MEMCPY_DEVICE_TO_HOST, + ))?; + } + } + unsafe { + out.set_len(len); + } + Ok(out) + } + + /// Block until all work on the device's default stream completes. + pub fn synchronize(&self) -> Result<(), DriverError> { + self.bind()?; + unsafe { check(hipDeviceSynchronize()) } + } + + /// Create a new stream tied to this device. Mirrors + /// `cudarc::CudaDevice::fork_default_stream`. + pub fn fork_default_stream(self: &Arc) -> Result { + self.bind()?; + let mut stream: *mut c_void = std::ptr::null_mut(); + unsafe { + // Non-blocking stream (matches cudarc on both Linux and Windows): a + // default/blocking stream would serialize H2D copies against the NULL + // stream and defeat the dual-stream copy/compute overlap the pipeline + // relies on. Because a non-blocking stream does not implicitly order + // against the default stream, any host-visible readback (a default/NULL + // stream dtoh copy) of results produced on this stream MUST first + // synchronize this stream (wait_for / sync_cuda_stream); the encoders + // already do so before every readback. + check(hipStreamCreateWithFlags( + &mut stream, + HIP_STREAM_NON_BLOCKING, + ))?; + } + Ok(CudaStream { + stream, + _device: Arc::clone(self), + }) + } + + /// Block until all work on `stream` completes. Mirrors + /// `cudarc::CudaDevice::wait_for`. + pub fn wait_for(&self, stream: &CudaStream) -> Result<(), DriverError> { + unsafe { check(hipStreamSynchronize(stream.stream)) } + } + } +} diff --git a/qdp/qdp-kernels/src/kernel_compat.h b/qdp/qdp-kernels/src/kernel_compat.h new file mode 100644 index 0000000000..6720705d06 --- /dev/null +++ b/qdp/qdp-kernels/src/kernel_compat.h @@ -0,0 +1,35 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Cross-vendor kernel compatibility shims. +// +// Included by the kernel TUs that use warp intrinsics (amplitude.cu). On CUDA +// it is inert. On HIP it supplies the one warp-intrinsic difference that does +// not translate 1:1: the full-warp lane mask for __shfl_*_sync. + +#ifndef KERNEL_COMPAT_H +#define KERNEL_COMPAT_H + +#if defined(__HIP_PLATFORM_AMD__) +// ROCm's __shfl_*_sync static_asserts a 64-bit mask (sizeof(MaskT) == 8): the +// 32-bit literal 0xffffffff every CUDA warp-sync uses fails to COMPILE, +// independent of the active wave width. Use an all-lanes 64-bit mask. +#define QDP_FULL_WARP_MASK 0xffffffffffffffffULL +#else +#define QDP_FULL_WARP_MASK 0xffffffffu +#endif + +#endif // KERNEL_COMPAT_H diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs index 9a1f832d85..f44a15ce4e 100644 --- a/qdp/qdp-kernels/src/lib.rs +++ b/qdp/qdp-kernels/src/lib.rs @@ -20,6 +20,9 @@ use std::ffi::c_void; +pub mod device; +use device::{DeviceRepr, ValidAsZeroBits}; + // Complex number (matches CUDA's cuDoubleComplex) #[repr(C)] #[derive(Debug, Clone, Copy)] @@ -28,13 +31,12 @@ pub struct CuDoubleComplex { pub y: f64, // Imaginary part } -// Implement DeviceRepr for cudarc compatibility -#[cfg(target_os = "linux")] -unsafe impl cudarc::driver::DeviceRepr for CuDoubleComplex {} - -// Also implement ValidAsZeroBits for alloc_zeros support -#[cfg(target_os = "linux")] -unsafe impl cudarc::driver::ValidAsZeroBits for CuDoubleComplex {} +// Device-transferable + zero-initializable markers. On CUDA these resolve to +// cudarc's traits; on HIP they resolve to the in-crate `device` shim's traits. +#[cfg(qdp_gpu_platform)] +unsafe impl DeviceRepr for CuDoubleComplex {} +#[cfg(qdp_gpu_platform)] +unsafe impl ValidAsZeroBits for CuDoubleComplex {} // Complex number (matches CUDA's cuComplex / cuFloatComplex) #[repr(C)] @@ -44,16 +46,13 @@ pub struct CuComplex { pub y: f32, // Imaginary part } -// Implement DeviceRepr for cudarc compatibility -#[cfg(target_os = "linux")] -unsafe impl cudarc::driver::DeviceRepr for CuComplex {} - -// Also implement ValidAsZeroBits for alloc_zeros support -#[cfg(target_os = "linux")] -unsafe impl cudarc::driver::ValidAsZeroBits for CuComplex {} +#[cfg(qdp_gpu_platform)] +unsafe impl DeviceRepr for CuComplex {} +#[cfg(qdp_gpu_platform)] +unsafe impl ValidAsZeroBits for CuComplex {} // CUDA kernel FFI (Linux only; stubbed when built without nvcc/CUDA) -#[cfg(all(target_os = "linux", not(qdp_no_cuda)))] +#[cfg(all(qdp_gpu_platform, not(qdp_no_cuda)))] unsafe extern "C" { /// Launch amplitude encoding kernel /// Returns CUDA error code (0 = success) @@ -405,7 +404,7 @@ unsafe extern "C" { } // Dummy implementation for non-Linux and Linux builds without CUDA (allows linking) -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_amplitude_encode( _input_d: *const f64, @@ -418,7 +417,7 @@ pub extern "C" fn launch_amplitude_encode( 999 // Error: CUDA unavailable } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_amplitude_encode_f32( _input_d: *const f32, @@ -431,7 +430,7 @@ pub extern "C" fn launch_amplitude_encode_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_amplitude_encode_batch( _input_batch_d: *const f64, @@ -445,7 +444,7 @@ pub extern "C" fn launch_amplitude_encode_batch( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_amplitude_encode_batch_f32( _input_batch_d: *const f32, @@ -459,7 +458,7 @@ pub extern "C" fn launch_amplitude_encode_batch_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_l2_norm( _input_d: *const f64, @@ -470,7 +469,7 @@ pub extern "C" fn launch_l2_norm( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_l2_norm_batch( _input_batch_d: *const f64, @@ -482,7 +481,7 @@ pub extern "C" fn launch_l2_norm_batch( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_l2_norm_f32( _input_d: *const f32, @@ -493,7 +492,7 @@ pub extern "C" fn launch_l2_norm_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_l2_norm_batch_f32( _input_batch_d: *const f32, @@ -505,7 +504,7 @@ pub extern "C" fn launch_l2_norm_batch_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn convert_state_to_float( _input_state_d: *const CuDoubleComplex, @@ -516,7 +515,7 @@ pub extern "C" fn convert_state_to_float( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn convert_state_to_double( _input_state_d: *const CuComplex, @@ -527,7 +526,7 @@ pub extern "C" fn convert_state_to_double( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_basis_encode( _basis_index: usize, @@ -538,7 +537,7 @@ pub extern "C" fn launch_basis_encode( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_basis_encode_batch( _basis_indices_d: *const usize, @@ -551,7 +550,7 @@ pub extern "C" fn launch_basis_encode_batch( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_basis_encode_f32( _basis_index: usize, @@ -562,7 +561,7 @@ pub extern "C" fn launch_basis_encode_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_basis_encode_batch_f32( _basis_indices_d: *const usize, @@ -575,7 +574,7 @@ pub extern "C" fn launch_basis_encode_batch_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_angle_encode( _angles_d: *const f64, @@ -587,7 +586,7 @@ pub extern "C" fn launch_angle_encode( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_angle_encode_batch( _angles_batch_d: *const f64, @@ -600,7 +599,7 @@ pub extern "C" fn launch_angle_encode_batch( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_angle_encode_f32( _angles_d: *const f32, @@ -612,7 +611,7 @@ pub extern "C" fn launch_angle_encode_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_angle_encode_batch_f32( _angles_batch_d: *const f32, @@ -625,7 +624,7 @@ pub extern "C" fn launch_angle_encode_batch_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_check_finite_batch_f32( _input_batch_d: *const f32, @@ -636,7 +635,7 @@ pub extern "C" fn launch_check_finite_batch_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_check_finite_batch_f64( _input_batch_d: *const f64, @@ -647,7 +646,7 @@ pub extern "C" fn launch_check_finite_batch_f64( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_validate_and_cast_basis_indices_f32( _input_batch_d: *const f32, @@ -660,7 +659,7 @@ pub extern "C" fn launch_validate_and_cast_basis_indices_f32( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_check_basis_indices_usize( _indices_d: *const usize, @@ -672,7 +671,7 @@ pub extern "C" fn launch_check_basis_indices_usize( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_iqp_encode( _data_d: *const f64, @@ -685,7 +684,7 @@ pub extern "C" fn launch_iqp_encode( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_iqp_encode_batch( _data_batch_d: *const f64, @@ -700,7 +699,7 @@ pub extern "C" fn launch_iqp_encode_batch( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_phase_encode( _phases_d: *const f64, @@ -712,7 +711,7 @@ pub extern "C" fn launch_phase_encode( 999 } -#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_phase_encode_batch( _phases_batch_d: *const f64, diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs index 53f91505f5..8a413fa9e8 100644 --- a/qdp/qdp-kernels/tests/amplitude_encode.rs +++ b/qdp/qdp-kernels/tests/amplitude_encode.rs @@ -20,10 +20,14 @@ // but in CUDA builds, the extern "C" functions require unsafe blocks. // The compiler can't statically determine which path is taken. #![allow(unused_unsafe)] - -#[cfg(target_os = "linux")] -use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut}; -#[cfg(target_os = "linux")] +// The `stream.stream as *mut c_void` cast is a real conversion on the CUDA +// backend (cudarc's CUstream is a distinct pointer type) but a no-op on HIP, +// where the stream field is already *mut c_void; silence the HIP-only redundancy. +#![cfg_attr(feature = "hip", allow(clippy::unnecessary_cast))] + +#[cfg(qdp_gpu_platform)] +use qdp_kernels::device::{CudaDevice, DevicePtr, DevicePtrMut}; +#[cfg(qdp_gpu_platform)] use qdp_kernels::{ CuComplex, CuDoubleComplex, launch_amplitude_encode, launch_amplitude_encode_batch, launch_amplitude_encode_f32, launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, @@ -33,7 +37,7 @@ use qdp_kernels::{ const EPSILON: f64 = 1e-10; const EPSILON_F32: f32 = 1e-5; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn assert_batch_state_matches_f64( state_h: &[CuDoubleComplex], input: &[f64], @@ -70,7 +74,7 @@ fn assert_batch_state_matches_f64( } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_basic() { println!("Testing basic amplitude encoding..."); @@ -141,7 +145,7 @@ fn test_amplitude_encode_basic() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_basic_f32() { println!("Testing basic amplitude encoding (float32)..."); @@ -211,7 +215,7 @@ fn test_amplitude_encode_basic_f32() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_power_of_two() { println!("Testing amplitude encoding with power-of-two input..."); @@ -271,7 +275,7 @@ fn test_amplitude_encode_power_of_two() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_odd_input_length() { println!("Testing amplitude encoding with odd input length..."); @@ -320,7 +324,7 @@ fn test_amplitude_encode_odd_input_length() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_odd_input_length_f32() { println!("Testing amplitude encoding with odd input length (float32)..."); @@ -369,7 +373,7 @@ fn test_amplitude_encode_odd_input_length_f32() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_large_state() { println!("Testing amplitude encoding with large state vector..."); @@ -427,7 +431,7 @@ fn test_amplitude_encode_large_state() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_zero_norm_error() { println!("Testing amplitude encoding with zero norm (error case)..."); @@ -467,7 +471,7 @@ fn test_amplitude_encode_zero_norm_error() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_negative_norm_error() { println!("Testing amplitude encoding with negative norm (error case)..."); @@ -507,7 +511,7 @@ fn test_amplitude_encode_negative_norm_error() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_vectorized_load() { println!("Testing vectorized double2 memory access optimization..."); @@ -559,7 +563,7 @@ fn test_amplitude_encode_vectorized_load() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_small_input_large_state() { println!("Testing small input with large state vector..."); @@ -612,7 +616,7 @@ fn test_amplitude_encode_small_input_large_state() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_amplitude_encode_batch_odd_sample_length_handles_misaligned_samples() { println!("Testing batch amplitude encoding with odd sample length (float64)..."); @@ -658,7 +662,7 @@ fn test_amplitude_encode_batch_odd_sample_length_handles_misaligned_samples() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_single_kernel() { println!("Testing single-vector GPU norm reduction..."); @@ -698,7 +702,7 @@ fn test_l2_norm_single_kernel() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_batch_kernel_stream() { println!("Testing batched norm reduction on async stream..."); @@ -755,7 +759,7 @@ fn test_l2_norm_batch_kernel_stream() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_batch_kernel_odd_sample_len() { println!("Testing batched L2 norm reduction with odd sample length (float64)..."); @@ -804,7 +808,7 @@ fn test_l2_norm_batch_kernel_odd_sample_len() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_batch_kernel_zero_num_samples() { println!("Testing batched L2 norm rejection when num_samples==0 (float64)..."); @@ -838,7 +842,7 @@ fn test_l2_norm_batch_kernel_zero_num_samples() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_batch_kernel_zero_sample_len() { println!("Testing batched L2 norm rejection when sample_len==0 (float64)..."); @@ -872,7 +876,7 @@ fn test_l2_norm_batch_kernel_zero_sample_len() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_single_kernel_f32() { println!("Testing L2 norm reduction kernel (float32)..."); @@ -918,7 +922,7 @@ fn test_l2_norm_single_kernel_f32() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_batch_kernel_f32() { println!("Testing batched L2 norm reduction kernel (float32)..."); @@ -973,7 +977,7 @@ fn test_l2_norm_batch_kernel_f32() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_batch_kernel_f32_odd_sample_len() { println!("Testing batched L2 norm reduction with odd sample length (float32)..."); @@ -1022,7 +1026,7 @@ fn test_l2_norm_batch_kernel_f32_odd_sample_len() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_batch_kernel_zero_num_samples_f32() { println!("Testing batched L2 norm rejection when num_samples==0 (float32)..."); @@ -1056,7 +1060,7 @@ fn test_l2_norm_batch_kernel_zero_num_samples_f32() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_l2_norm_batch_kernel_zero_sample_len_f32() { println!("Testing batched L2 norm rejection when sample_len==0 (float32)..."); @@ -1090,7 +1094,7 @@ fn test_l2_norm_batch_kernel_zero_sample_len_f32() { } #[test] -#[cfg(not(target_os = "linux"))] +#[cfg(not(qdp_gpu_platform))] fn test_amplitude_encode_dummy_non_linux() { println!("Testing dummy implementation on non-Linux platform..."); diff --git a/qdp/qdp-kernels/tests/angle_encode.rs b/qdp/qdp-kernels/tests/angle_encode.rs index a4af609af1..72a659f581 100644 --- a/qdp/qdp-kernels/tests/angle_encode.rs +++ b/qdp/qdp-kernels/tests/angle_encode.rs @@ -18,9 +18,9 @@ #![allow(unused_unsafe)] -#[cfg(target_os = "linux")] -use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] +use qdp_kernels::device::{CudaDevice, DevicePtr, DevicePtrMut}; +#[cfg(qdp_gpu_platform)] use qdp_kernels::{ CuComplex, launch_angle_encode_batch_f32, launch_angle_encode_f32, launch_check_finite_batch_f32, @@ -28,7 +28,7 @@ use qdp_kernels::{ const EPSILON_F32: f32 = 1e-5; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn expected_amplitude_f32(angles: &[f32], basis_idx: usize) -> f32 { angles.iter().enumerate().fold(1.0f32, |acc, (bit, angle)| { let factor = if ((basis_idx >> bit) & 1) == 1 { @@ -41,7 +41,7 @@ fn expected_amplitude_f32(angles: &[f32], basis_idx: usize) -> f32 { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_angle_encode_basic_f32() { let device = match CudaDevice::new(0) { Ok(d) => d, @@ -86,7 +86,7 @@ fn test_angle_encode_basic_f32() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_angle_encode_matches_expected_product_state_f32() { let device = match CudaDevice::new(0) { Ok(d) => d, @@ -136,7 +136,7 @@ fn test_angle_encode_matches_expected_product_state_f32() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_angle_encode_f32_rejects_zero_qubits() { let device = match CudaDevice::new(0) { Ok(d) => d, @@ -164,7 +164,7 @@ fn test_angle_encode_f32_rejects_zero_qubits() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_angle_encode_batch_f32_matches_expected_product_states() { let device = match CudaDevice::new(0) { Ok(d) => d, @@ -224,7 +224,7 @@ fn test_angle_encode_batch_f32_matches_expected_product_states() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_angle_encode_batch_f32_rejects_zero_samples() { let device = match CudaDevice::new(0) { Ok(d) => d, @@ -252,7 +252,7 @@ fn test_angle_encode_batch_f32_rejects_zero_samples() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_check_finite_batch_f32_reports_non_finite() { let device = match CudaDevice::new(0) { Ok(d) => d, @@ -282,7 +282,7 @@ fn test_check_finite_batch_f32_reports_non_finite() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_check_finite_batch_f32_reports_nan() { let device = match CudaDevice::new(0) { Ok(d) => d, @@ -312,7 +312,7 @@ fn test_check_finite_batch_f32_reports_nan() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_check_finite_batch_f32_all_finite_stays_clear() { let device = match CudaDevice::new(0) { Ok(d) => d, @@ -342,7 +342,7 @@ fn test_check_finite_batch_f32_all_finite_stays_clear() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_angle_encode_batch_f32_rejects_zero_state_len() { let device = match CudaDevice::new(0) { Ok(d) => d, @@ -370,7 +370,7 @@ fn test_angle_encode_batch_f32_rejects_zero_state_len() { } #[test] -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] fn test_angle_encode_batch_f32_rejects_zero_qubits() { let device = match CudaDevice::new(0) { Ok(d) => d, diff --git a/qdp/qdp-python/Cargo.toml b/qdp/qdp-python/Cargo.toml index fbe46a0198..2dbf659ec2 100644 --- a/qdp/qdp-python/Cargo.toml +++ b/qdp/qdp-python/Cargo.toml @@ -10,11 +10,15 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.27" } numpy = "0.27" -qdp-core = { path = "../qdp-core" } +qdp-core = { path = "../qdp-core", default-features = false } env_logger = "0.11" [features] -default = [] +# NVIDIA CUDA backend (default); pass through to qdp-core. +default = ["cuda"] +cuda = ["qdp-core/cuda"] +# AMD HIP backend; pass through to qdp-core. +hip = ["qdp-core/hip"] observability = ["qdp-core/observability"] pytorch = ["qdp-core/pytorch"] remote-io = ["qdp-core/remote-io"] diff --git a/qdp/qdp-python/README.md b/qdp/qdp-python/README.md index a81f95324d..d770cc9774 100644 --- a/qdp/qdp-python/README.md +++ b/qdp/qdp-python/README.md @@ -64,6 +64,13 @@ The public `QdpEngine` is a unified Python facade with explicit backend selectio - `backend="cuda"` routes to the Rust `_qdp.QdpEngine` - `backend="amd"` routes to the Triton AMD engine directly +The native Rust `_qdp.QdpEngine` (the `backend="cuda"` route) also runs on AMD +GPUs when the extension is built from source with the `hip` Cargo feature, which +compiles the same kernels with `hipcc` and binds the AMD HIP runtime. This gives +the native pipeline (pinned-buffer pool, dual-stream overlap, in-Rust DLPack +ownership) on AMD without the Triton path. See `qdp/DEVELOPMENT.md` for the +ROCm/HIP build steps. + See `qdp/qdp-python/TRITON_AMD_BACKEND.md` for Triton AMD setup and validation details. ## Encoding Methods diff --git a/qdp/qdp-python/build.rs b/qdp/qdp-python/build.rs new file mode 100644 index 0000000000..8a133e62ab --- /dev/null +++ b/qdp/qdp-python/build.rs @@ -0,0 +1,27 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +fn main() { + // Emit qdp_gpu_platform when building for a GPU-capable OS (Linux always; + // Windows when the hip feature is on via QDP_USE_HIP=1 / TheRock ROCm). + println!("cargo::rustc-check-cfg=cfg(qdp_gpu_platform)"); + let is_linux = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux"); + let hip_feature = std::env::var("CARGO_FEATURE_HIP").is_ok(); + let is_windows = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows"); + if is_linux || (is_windows && hip_feature) { + println!("cargo::rustc-cfg=qdp_gpu_platform"); + } +} diff --git a/qdp/qdp-python/src/engine.rs b/qdp/qdp-python/src/engine.rs index 8297bf5e78..2c44608f99 100644 --- a/qdp/qdp-python/src/engine.rs +++ b/qdp/qdp-python/src/engine.rs @@ -24,7 +24,7 @@ use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; use qdp_core::{Dtype, Encoding, QdpEngine as CoreEngine}; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use crate::loader::{PyQuantumLoader, config_from_args, parse_null_handling, path_from_py}; /// PyO3 wrapper for QdpEngine @@ -522,7 +522,7 @@ impl QdpEngine { } // --- Loader factory methods (Linux only) --- - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] /// Create a synthetic-data pipeline iterator (for QuantumDataLoader.source_synthetic()). #[pyo3(signature = (total_batches, batch_size, num_qubits, encoding_method, seed=None, null_handling=None))] fn create_synthetic_loader( @@ -551,7 +551,7 @@ impl QdpEngine { Ok(PyQuantumLoader::new(Some(iter))) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] /// Create a file-backed pipeline iterator (full read then batch; for QuantumDataLoader.source_file(path)). #[allow(clippy::too_many_arguments)] #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, batch_limit=None, null_handling=None))] @@ -600,7 +600,7 @@ impl QdpEngine { Ok(PyQuantumLoader::new(Some(iter))) } - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] /// Create a streaming Parquet pipeline iterator (for QuantumDataLoader.source_file(path, streaming=True)). #[allow(clippy::too_many_arguments)] #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, batch_limit=None, null_handling=None))] diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs index 04d772a906..cd7f19200f 100644 --- a/qdp/qdp-python/src/lib.rs +++ b/qdp/qdp-python/src/lib.rs @@ -26,10 +26,10 @@ use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; use tensor::QuantumTensor; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] use loader::PyQuantumLoader; -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] #[pyfunction] #[pyo3(signature = (device_id, num_qubits, batch_size, total_batches, encoding_method, warmup_batches=0, seed=None, dtype="f64"))] #[allow(clippy::too_many_arguments)] @@ -78,9 +78,9 @@ fn _qdp(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] m.add_class::()?; - #[cfg(target_os = "linux")] + #[cfg(qdp_gpu_platform)] m.add_function(wrap_pyfunction!(run_throughput_pipeline_py, m)?)?; Ok(()) } diff --git a/qdp/qdp-python/src/loader.rs b/qdp/qdp-python/src/loader.rs index a43f947944..46e5b14642 100644 --- a/qdp/qdp-python/src/loader.rs +++ b/qdp/qdp-python/src/loader.rs @@ -15,7 +15,7 @@ // limitations under the License. // Loader bindings (Linux only; qdp-core pipeline types only built on Linux) -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] mod loader_impl { use crate::tensor::QuantumTensor; use pyo3::exceptions::PyRuntimeError; @@ -120,5 +120,5 @@ mod loader_impl { } } -#[cfg(target_os = "linux")] +#[cfg(qdp_gpu_platform)] pub use loader_impl::{PyQuantumLoader, config_from_args, parse_null_handling, path_from_py}; diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py index eb5d92ce9b..0c8a985b88 100644 --- a/testing/qdp/test_bindings.py +++ b/testing/qdp/test_bindings.py @@ -122,7 +122,14 @@ def test_dlpack_device(): qtensor = engine.encode(data, 2, "amplitude") device_info = qtensor.__dlpack_device__() - assert device_info == (2, 0), "Expected (2, 0) for CUDA device 0" + # DLPack device_type: kDLCUDA=2 (NVIDIA), kDLROCM=10 (AMD HIP). The native + # engine tags exported tensors to match the backend it was built for, so a + # ROCm build must report kDLROCM (which a ROCm PyTorch's from_dlpack + # requires); a CUDA build reports kDLCUDA. + expected_device_type = 10 if getattr(torch.version, "hip", None) else 2 + assert device_info == (expected_device_type, 0), ( + f"Expected ({expected_device_type}, 0) for device 0" + ) @requires_qdp @@ -142,10 +149,12 @@ def test_dlpack_device_id_non_zero(): qtensor = engine.encode(data, 2, "amplitude") device_info = qtensor.__dlpack_device__() + # DLPack device_type: kDLCUDA=2 (NVIDIA), kDLROCM=10 (AMD HIP). + expected_device_type = 10 if getattr(torch.version, "hip", None) else 2 assert device_info == ( - 2, + expected_device_type, device_id, - ), f"Expected (2, {device_id}) for CUDA device {device_id}" + ), f"Expected ({expected_device_type}, {device_id}) for device {device_id}" # Verify PyTorch integration works with non-zero device_id torch_tensor = torch.from_dlpack(qtensor)