apache · jeffdaily · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
@@ -3,3 +3,8 @@ Copyright 2009-2026 The Apache Software Foundation
 
 This product includes software developed at
 The Apache Software Foundation (https://www.apache.org/).
+
+This product includes software developed by
+Advanced Micro Devices, Inc. (https://www.amd.com/).
+Copyright (c) 2026 Advanced Micro Devices, Inc.
+The AMD/HIP GPU build of the QDP module is an AMD contribution.
diff --git a/qdp/DEVELOPMENT.md b/qdp/DEVELOPMENT.md
@@ -114,6 +114,46 @@ cd ..
 The first command is what `maturin develop --release` runs on CI; the
 second verifies tests type-check in the CUDA build.
 
+### AMD GPU build (ROCm / HIP)
+
+The native engine also builds for AMD GPUs by compiling the same six `.cu`
+kernels with `hipcc` and binding the AMD HIP runtime instead of CUDA. This is
+opt-in behind the `hip` Cargo feature; the default build is the unchanged CUDA
+path, so nothing here affects an NVIDIA build.
+
+Prerequisites:
+
+- Linux or Windows, with an AMD GPU (CDNA gfx90a or RDNA gfx11xx/gfx12xx)
+- ROCm >= 6.0 with `hipcc` and the AMD HIP runtime (`amdhip64`); on Windows a
+  TheRock-based ROCm from the `rocm-sdk` pip wheels also works. ROCm 6.0 is the
+  floor because the device-pointer check in `qdp-core/src/gpu/cuda_ffi.rs` uses
+  the ROCm 6+ `hipMemoryType` numbering; ROCm 5.x numbered the device type as 1,
+  so the check would reject valid device pointers there. Tested on ROCm 7.2.1.
+- a ROCm build of PyTorch in the venv for the DLPack interop tests
+
+Build the Rust core and kernels for AMD. `QDP_USE_HIP=1` selects the HIP branch
+in `build.rs`, and `QDP_HIP_ARCH_LIST` picks the target arch(es); when unset it
+defaults to `gfx90a` alone, so set it to match your GPU (e.g. `gfx1100`):
+
+```bash
+cd qdp
+export QDP_USE_HIP=1 QDP_HIP_ARCH_LIST=gfx90a ROCM_PATH=/opt/rocm
+cargo build -p qdp-core -p qdp-kernels --no-default-features --features hip
+cargo test  -p qdp-core -p qdp-kernels --no-default-features --features hip -- --test-threads=1
+cd ..
+```
+
+Build the Python extension with the `hip` feature. Use `--profile dev`, because
+the release profile's `lto = "fat"` produces a bitcode-only cdylib under the HIP
+toolchain, and install the wheel with `--no-deps` so that a working ROCm PyTorch
+already in the venv is not replaced:
+
+```bash
+maturin build --features hip --profile dev \
+  --manifest-path qdp/qdp-python/Cargo.toml --out dist/
+pip install --no-deps --force-reinstall dist/qumat_qdp-*.whl
+```
+
 ## 4. Benchmarks
 
 From the repo root, set up and prepare benchmarks:

@@ -4,8 +4,8 @@ version.workspace = true
 edition.workspace = true
 
 [dependencies]
-cudarc = { workspace = true }
-qdp-kernels = { path = "../qdp-kernels" }
+cudarc = { workspace = true, optional = true }
+qdp-kernels = { path = "../qdp-kernels", default-features = false }
 thiserror = { workspace = true }
 rayon = { workspace = true }
 nvtx = { version = "1.3", optional = true }
@@ -30,7 +30,14 @@ protoc-bin-vendored = { workspace = true }
 name = "qdp_core"
 
 [features]
-default = []
+# `cuda` and `hip` are mutually exclusive; pick exactly one vendor backend.
+# If both end up enabled (e.g. via workspace feature unification), `hip` takes
+# precedence: kernels build for HIP and cudarc is compiled but unused.
+# NVIDIA CUDA via cudarc + nvcc kernels (default, unchanged behavior).
+default = ["cuda"]
+cuda = ["dep:cudarc", "qdp-kernels/cuda"]
+# AMD HIP: hipcc kernels + the qdp-kernels device shim; no cudarc.
+hip = ["qdp-kernels/hip"]
 observability = ["nvtx"]
 pytorch = ["tch"]
 remote-io = ["object_store", "tokio", "tempfile", "futures"]

@@ -15,6 +15,19 @@
 // limitations under the License.
 
 fn main() {
+    // Emit the `qdp_gpu_platform` cfg on every OS where the GPU stack is compiled.
+    // Linux always gets it (the original target). Windows gets it only when the
+    // `hip` Cargo feature is enabled (TheRock-based ROCm), read as CARGO_FEATURE_HIP.
+    // Source code that was gated on `#[cfg(target_os = "linux")]` should use
+    // `#[cfg(qdp_gpu_platform)]` instead so that it compiles on both.
+    println!("cargo::rustc-check-cfg=cfg(qdp_gpu_platform)");
+    let is_linux = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux");
+    let hip_feature = std::env::var("CARGO_FEATURE_HIP").is_ok();
+    let is_windows = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows");
+    if is_linux || (is_windows && hip_feature) {
+        println!("cargo::rustc-cfg=qdp_gpu_platform");
+    }
+
     // Use vendored protoc to avoid missing protoc in CI/dev environments
     unsafe {
         std::env::set_var("PROTOC", protoc_bin_vendored::protoc_bin_path().unwrap());

@@ -16,14 +16,14 @@
 
 // DLPack protocol for zero-copy GPU memory sharing with PyTorch
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::{BufferStorage, GpuDeviceType, GpuStateVector, Precision};
 use std::os::raw::{c_int, c_void};
 use std::sync::Arc;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::cuda_ffi::{
     CUDA_EVENT_DISABLE_TIMING, cudaEventCreateWithFlags, cudaEventDestroy, cudaEventRecord,
     cudaStreamWaitEvent,
@@ -45,7 +45,7 @@ pub fn dlpack_stream_to_cuda(stream: i64) -> *mut c_void {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 /// # Safety
 /// `stream` must be a valid CUDA stream pointer or one of the CUDA sentinel
 /// values (legacy/per-thread default). Passing any other pointer is undefined.
@@ -96,7 +96,7 @@ pub unsafe fn synchronize_stream(stream: *mut c_void) -> Result<()> {
     Ok(())
 }
 
-#[cfg(not(target_os = "linux"))]
+#[cfg(not(qdp_gpu_platform))]
 /// # Safety
 /// No-op on non-Linux targets, kept unsafe to match the Linux signature.
 pub unsafe fn synchronize_stream(_stream: *mut c_void) -> Result<()> {

@@ -22,7 +22,7 @@
 
 use std::ffi::c_void;
 
-use cudarc::driver::{CudaSlice, DevicePtrMut};
+use crate::gpu_rt::{CudaSlice, DevicePtrMut};
 use qdp_kernels::{launch_amplitude_encode_batch, launch_l2_norm_batch};
 
 use super::{ChunkEncoder, STAGE_SIZE_ELEMENTS};
@@ -135,7 +135,7 @@ impl ChunkEncoder for AmplitudeEncoder {
 mod tests {
     use super::*;
     use crate::MahoutError;
-    use cudarc::driver::DeviceSlice;
+    use crate::gpu_rt::DeviceSlice;
 
     #[test]
     fn reject_sample_size_zero() {

@@ -130,9 +130,9 @@ mod tests {
     use crate::encoding::STAGE_SIZE_ELEMENTS;
     // chunk-size overflow checks
     #[test]
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn test_encode_chunk_overflow() {
-        use cudarc::driver::CudaDevice;
+        use crate::gpu_rt::CudaDevice;
         use std::sync::Arc;
 
         let device: Arc<CudaDevice> = match CudaDevice::new(0) {

@@ -22,7 +22,7 @@
 
 use std::ffi::c_void;
 
-use cudarc::driver::{CudaSlice, DevicePtr};
+use crate::gpu_rt::{CudaSlice, DevicePtr};
 use qdp_kernels::launch_basis_encode_batch;
 
 use super::{ChunkEncoder, STAGE_SIZE_ELEMENTS};

@@ -25,7 +25,7 @@ use std::sync::Arc;
 use std::sync::mpsc::{Receiver, SyncSender, sync_channel};
 use std::thread::{self, JoinHandle};
 
-use cudarc::driver::{CudaDevice, DevicePtr};
+use crate::gpu_rt::{CudaDevice, DevicePtr};
 
 /// Guard that ensures GPU synchronization and IO thread cleanup on drop.
 /// Used to handle early returns in `stream_encode`.

@@ -53,7 +53,7 @@ pub enum MahoutError {
 pub type Result<T> = std::result::Result<T, MahoutError>;
 
 /// Convert CUDA error code to human-readable string
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub fn cuda_error_to_string(code: i32) -> &'static str {
     match code {
         0 => "cudaSuccess",

@@ -23,17 +23,17 @@ use std::time::Instant;
 
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::PinnedHostBuffer;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::pool_metrics::PoolMetrics;
 
 /// Handle that automatically returns a buffer to the pool on drop.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub struct PinnedBufferHandle<T: Copy = f64> {
     buffer: Option<PinnedHostBuffer<T>>,
     pool: Arc<PinnedBufferPool<T>>,
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> std::ops::Deref for PinnedBufferHandle<T> {
     type Target = PinnedHostBuffer<T>;
 
@@ -44,7 +44,7 @@ impl<T: Copy> std::ops::Deref for PinnedBufferHandle<T> {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> std::ops::DerefMut for PinnedBufferHandle<T> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         self.buffer
@@ -53,7 +53,7 @@ impl<T: Copy> std::ops::DerefMut for PinnedBufferHandle<T> {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> Drop for PinnedBufferHandle<T> {
     fn drop(&mut self) {
         if let Some(buf) = self.buffer.take() {
@@ -65,15 +65,15 @@ impl<T: Copy> Drop for PinnedBufferHandle<T> {
 }
 
 /// Pool of pinned host buffers sized for a fixed batch shape.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub struct PinnedBufferPool<T: Copy = f64> {
     free: Mutex<Vec<PinnedHostBuffer<T>>>,
     available_cv: Condvar,
     capacity: usize,
     elements_per_buffer: usize,
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> PinnedBufferPool<T> {
     /// Create a pool with `pool_size` pinned buffers, each sized for `elements_per_buffer` values of `T`.
     pub fn new(pool_size: usize, elements_per_buffer: usize) -> Result<Arc<Self>> {