diff --git a/NOTICE b/NOTICE
index 7018a3bab2..809f20f16f 100644
--- a/NOTICE
+++ b/NOTICE
@@ -3,3 +3,8 @@ Copyright 2009-2026 The Apache Software Foundation
 
 This product includes software developed at
 The Apache Software Foundation (https://www.apache.org/).
+
+This product includes software developed by
+Advanced Micro Devices, Inc. (https://www.amd.com/).
+Copyright (c) 2026 Advanced Micro Devices, Inc.
+The AMD/HIP GPU build of the QDP module is an AMD contribution.
diff --git a/qdp/DEVELOPMENT.md b/qdp/DEVELOPMENT.md
index d6c3e6ee98..8cfe2cd077 100644
--- a/qdp/DEVELOPMENT.md
+++ b/qdp/DEVELOPMENT.md
@@ -114,6 +114,46 @@ cd ..
 The first command is what `maturin develop --release` runs on CI; the
 second verifies tests type-check in the CUDA build.
 
+### AMD GPU build (ROCm / HIP)
+
+The native engine also builds for AMD GPUs by compiling the same six `.cu`
+kernels with `hipcc` and binding the AMD HIP runtime instead of CUDA. This is
+opt-in behind the `hip` Cargo feature; the default build is the unchanged CUDA
+path, so nothing here affects an NVIDIA build.
+
+Prerequisites:
+
+- Linux or Windows + an AMD GPU (CDNA gfx90a or RDNA gfx11xx/gfx12xx)
+- ROCm >= 6.0 with `hipcc` and the AMD HIP runtime (`amdhip64`); on Windows a
+  TheRock-based ROCm from the `rocm-sdk` pip wheels also works. ROCm 6.0 is the
+  floor because the device-pointer check in `qdp-core/src/gpu/cuda_ffi.rs` uses
+  the ROCm 6+ `hipMemoryType` device convention (ROCm 5.x numbered Device=1 and
+  would reject valid device pointers); tested on ROCm 7.2.1
+- a ROCm build of PyTorch in the venv for the DLPack interop tests
+
+Build the Rust core and kernels for AMD. `QDP_USE_HIP=1` selects the HIP branch
+in `build.rs`, and `QDP_HIP_ARCH_LIST` picks the target arch(es) (defaults to
+`gfx90a` only when unset; set it to your GPU, e.g. `gfx1100`):
+
+```bash
+cd qdp
+export QDP_USE_HIP=1 QDP_HIP_ARCH_LIST=gfx90a ROCM_PATH=/opt/rocm
+cargo build -p qdp-core -p qdp-kernels --no-default-features --features hip
+cargo test  -p qdp-core -p qdp-kernels --no-default-features --features hip -- --test-threads=1
+cd ..
+```
+
+Build the Python extension with the `hip` feature. Use `--profile dev` (the
+release `lto = "fat"` profile produces a bitcode-only cdylib under the HIP
+toolchain), and install only the extension so a working ROCm PyTorch in the venv
+is not replaced:
+
+```bash
+maturin build --features hip --profile dev \
+  --manifest-path qdp/qdp-python/Cargo.toml --out dist/
+pip install --no-deps --force-reinstall dist/qumat_qdp-*.whl
+```
+
 ## 4. Benchmarks
 
 From the repo root, set up and prepare benchmarks:
diff --git a/qdp/qdp-core/Cargo.toml b/qdp/qdp-core/Cargo.toml
index c09f6339e1..21051f1ffb 100644
--- a/qdp/qdp-core/Cargo.toml
+++ b/qdp/qdp-core/Cargo.toml
@@ -4,8 +4,8 @@ version.workspace = true
 edition.workspace = true
 
 [dependencies]
-cudarc = { workspace = true }
-qdp-kernels = { path = "../qdp-kernels" }
+cudarc = { workspace = true, optional = true }
+qdp-kernels = { path = "../qdp-kernels", default-features = false }
 thiserror = { workspace = true }
 rayon = { workspace = true }
 nvtx = { version = "1.3", optional = true }
@@ -30,7 +30,14 @@ protoc-bin-vendored = { workspace = true }
 name = "qdp_core"
 
 [features]
-default = []
+# `cuda` and `hip` are mutually exclusive; pick exactly one vendor backend.
+# If both end up enabled (e.g. via workspace feature unification), `hip` takes
+# precedence: kernels build for HIP and cudarc is compiled but unused.
+# NVIDIA CUDA via cudarc + nvcc kernels (default, unchanged behavior).
+default = ["cuda"]
+cuda = ["dep:cudarc", "qdp-kernels/cuda"]
+# AMD HIP: hipcc kernels + the qdp-kernels device shim; no cudarc.
+hip = ["qdp-kernels/hip"]
 observability = ["nvtx"]
 pytorch = ["tch"]
 remote-io = ["object_store", "tokio", "tempfile", "futures"]
diff --git a/qdp/qdp-core/build.rs b/qdp/qdp-core/build.rs
index 311ea139dd..e416fe8257 100644
--- a/qdp/qdp-core/build.rs
+++ b/qdp/qdp-core/build.rs
@@ -15,6 +15,19 @@
 // limitations under the License.
 
 fn main() {
+    // Emit qdp_gpu_platform cfg on any OS where the GPU stack is compiled.
+    // Linux always has it (original target). Windows gets it when the `hip`
+    // feature is active (TheRock-based ROCm; the feature is set by QDP_USE_HIP=1).
+    // Source code that was `#[cfg(target_os = "linux")]` should use
+    // `#[cfg(qdp_gpu_platform)]` so it compiles on both.
+    println!("cargo::rustc-check-cfg=cfg(qdp_gpu_platform)");
+    let is_linux = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux");
+    let hip_feature = std::env::var("CARGO_FEATURE_HIP").is_ok();
+    let is_windows = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows");
+    if is_linux || (is_windows && hip_feature) {
+        println!("cargo::rustc-cfg=qdp_gpu_platform");
+    }
+
     // Use vendored protoc to avoid missing protoc in CI/dev environments
     unsafe {
         std::env::set_var("PROTOC", protoc_bin_vendored::protoc_bin_path().unwrap());
diff --git a/qdp/qdp-core/src/dlpack.rs b/qdp/qdp-core/src/dlpack.rs
index a15344ee89..cea99b68a5 100644
--- a/qdp/qdp-core/src/dlpack.rs
+++ b/qdp/qdp-core/src/dlpack.rs
@@ -16,14 +16,14 @@
 
 // DLPack protocol for zero-copy GPU memory sharing with PyTorch
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::{BufferStorage, GpuDeviceType, GpuStateVector, Precision};
 use std::os::raw::{c_int, c_void};
 use std::sync::Arc;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::cuda_ffi::{
     CUDA_EVENT_DISABLE_TIMING, cudaEventCreateWithFlags, cudaEventDestroy, cudaEventRecord,
     cudaStreamWaitEvent,
@@ -45,7 +45,7 @@ pub fn dlpack_stream_to_cuda(stream: i64) -> *mut c_void {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 /// # Safety
 /// `stream` must be a valid CUDA stream pointer or one of the CUDA sentinel
 /// values (legacy/per-thread default). Passing any other pointer is undefined.
@@ -96,7 +96,7 @@ pub unsafe fn synchronize_stream(stream: *mut c_void) -> Result<()> {
     Ok(())
 }
 
-#[cfg(not(target_os = "linux"))]
+#[cfg(not(qdp_gpu_platform))]
 /// # Safety
 /// No-op on non-Linux targets, kept unsafe to match the Linux signature.
 pub unsafe fn synchronize_stream(_stream: *mut c_void) -> Result<()> {
diff --git a/qdp/qdp-core/src/encoding/amplitude.rs b/qdp/qdp-core/src/encoding/amplitude.rs
index fe51140e2d..fada74770d 100644
--- a/qdp/qdp-core/src/encoding/amplitude.rs
+++ b/qdp/qdp-core/src/encoding/amplitude.rs
@@ -22,7 +22,7 @@
 
 use std::ffi::c_void;
 
-use cudarc::driver::{CudaSlice, DevicePtrMut};
+use crate::gpu_rt::{CudaSlice, DevicePtrMut};
 use qdp_kernels::{launch_amplitude_encode_batch, launch_l2_norm_batch};
 
 use super::{ChunkEncoder, STAGE_SIZE_ELEMENTS};
@@ -135,7 +135,7 @@ impl ChunkEncoder for AmplitudeEncoder {
 mod tests {
     use super::*;
     use crate::MahoutError;
-    use cudarc::driver::DeviceSlice;
+    use crate::gpu_rt::DeviceSlice;
 
     #[test]
     fn reject_sample_size_zero() {
diff --git a/qdp/qdp-core/src/encoding/angle.rs b/qdp/qdp-core/src/encoding/angle.rs
index b2141d104e..7cd8c371a9 100644
--- a/qdp/qdp-core/src/encoding/angle.rs
+++ b/qdp/qdp-core/src/encoding/angle.rs
@@ -130,9 +130,9 @@ mod tests {
     use crate::encoding::STAGE_SIZE_ELEMENTS;
     // chunk-size overflow checks
     #[test]
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn test_encode_chunk_overflow() {
-        use cudarc::driver::CudaDevice;
+        use crate::gpu_rt::CudaDevice;
         use std::sync::Arc;
 
         let device: Arc<CudaDevice> = match CudaDevice::new(0) {
diff --git a/qdp/qdp-core/src/encoding/basis.rs b/qdp/qdp-core/src/encoding/basis.rs
index 801e077688..8bc2f19baf 100644
--- a/qdp/qdp-core/src/encoding/basis.rs
+++ b/qdp/qdp-core/src/encoding/basis.rs
@@ -22,7 +22,7 @@
 
 use std::ffi::c_void;
 
-use cudarc::driver::{CudaSlice, DevicePtr};
+use crate::gpu_rt::{CudaSlice, DevicePtr};
 use qdp_kernels::launch_basis_encode_batch;
 
 use super::{ChunkEncoder, STAGE_SIZE_ELEMENTS};
diff --git a/qdp/qdp-core/src/encoding/mod.rs b/qdp/qdp-core/src/encoding/mod.rs
index 2d09b51460..fab09ddc70 100644
--- a/qdp/qdp-core/src/encoding/mod.rs
+++ b/qdp/qdp-core/src/encoding/mod.rs
@@ -25,7 +25,7 @@ use std::sync::Arc;
 use std::sync::mpsc::{Receiver, SyncSender, sync_channel};
 use std::thread::{self, JoinHandle};
 
-use cudarc::driver::{CudaDevice, DevicePtr};
+use crate::gpu_rt::{CudaDevice, DevicePtr};
 
 /// Guard that ensures GPU synchronization and IO thread cleanup on drop.
 /// Used to handle early returns in `stream_encode`.
diff --git a/qdp/qdp-core/src/error.rs b/qdp/qdp-core/src/error.rs
index 09392bf552..3a03b82690 100644
--- a/qdp/qdp-core/src/error.rs
+++ b/qdp/qdp-core/src/error.rs
@@ -53,7 +53,7 @@ pub enum MahoutError {
 pub type Result<T> = std::result::Result<T, MahoutError>;
 
 /// Convert CUDA error code to human-readable string
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub fn cuda_error_to_string(code: i32) -> &'static str {
     match code {
         0 => "cudaSuccess",
diff --git a/qdp/qdp-core/src/gpu/buffer_pool.rs b/qdp/qdp-core/src/gpu/buffer_pool.rs
index 0b0ec24010..5a337cc215 100644
--- a/qdp/qdp-core/src/gpu/buffer_pool.rs
+++ b/qdp/qdp-core/src/gpu/buffer_pool.rs
@@ -23,17 +23,17 @@ use std::time::Instant;
 
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::PinnedHostBuffer;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::pool_metrics::PoolMetrics;
 
 /// Handle that automatically returns a buffer to the pool on drop.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub struct PinnedBufferHandle<T: Copy = f64> {
     buffer: Option<PinnedHostBuffer<T>>,
     pool: Arc<PinnedBufferPool<T>>,
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> std::ops::Deref for PinnedBufferHandle<T> {
     type Target = PinnedHostBuffer<T>;
 
@@ -44,7 +44,7 @@ impl<T: Copy> std::ops::Deref for PinnedBufferHandle<T> {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> std::ops::DerefMut for PinnedBufferHandle<T> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         self.buffer
@@ -53,7 +53,7 @@ impl<T: Copy> std::ops::DerefMut for PinnedBufferHandle<T> {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> Drop for PinnedBufferHandle<T> {
     fn drop(&mut self) {
         if let Some(buf) = self.buffer.take() {
@@ -65,7 +65,7 @@ impl<T: Copy> Drop for PinnedBufferHandle<T> {
 }
 
 /// Pool of pinned host buffers sized for a fixed batch shape.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub struct PinnedBufferPool<T: Copy = f64> {
     free: Mutex<Vec<PinnedHostBuffer<T>>>,
     available_cv: Condvar,
@@ -73,7 +73,7 @@ pub struct PinnedBufferPool<T: Copy = f64> {
     elements_per_buffer: usize,
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> PinnedBufferPool<T> {
     /// Create a pool with `pool_size` pinned buffers, each sized for `elements_per_buffer` values of `T`.
     pub fn new(pool_size: usize, elements_per_buffer: usize) -> Result<Arc<Self>> {
diff --git a/qdp/qdp-core/src/gpu/cuda_ffi.rs b/qdp/qdp-core/src/gpu/cuda_ffi.rs
index 2ed60c311e..3fd69ef834 100644
--- a/qdp/qdp-core/src/gpu/cuda_ffi.rs
+++ b/qdp/qdp-core/src/gpu/cuda_ffi.rs
@@ -14,11 +14,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//! Centralized CUDA Runtime API FFI declarations.
+//! Centralized GPU runtime API FFI declarations.
+//!
+//! These are the runtime entry points the pinned-memory pool, OOM guard, and
+//! dual-stream pipeline call directly (outside the cudarc/`gpu_rt` slice). The
+//! public function names keep their `cuda*` spelling so call sites are
+//! unchanged across vendors. On the default `cuda` feature they bind libcudart
+//! directly; on the `hip` feature they are thin wrappers over the matching
+//! libamdhip64 entry points (which are 1:1, and whose status codes match
+//! CUDA's numerically for the codes used here).
 
 use std::ffi::c_void;
 
 pub(crate) const CUDA_MEMCPY_HOST_TO_DEVICE: u32 = 1;
+#[allow(dead_code)]
+pub(crate) const CUDA_MEMCPY_DEVICE_TO_HOST: u32 = 2;
 pub(crate) const CUDA_EVENT_DISABLE_TIMING: u32 = 0x02;
 pub(crate) const CUDA_EVENT_DEFAULT: u32 = 0x00;
 #[allow(dead_code)]
@@ -37,65 +47,247 @@ pub(crate) struct CudaPointerAttributes {
     pub allocation_flags: u32,
 }
 
-// CUDA error codes
+// CUDA/HIP error codes (numerically identical for the codes used).
 pub(crate) const CUDA_SUCCESS: i32 = 0;
-// Note: CUDA_ERROR_NOT_READY may be used in future optimizations for non-blocking event checks
-// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0028
 #[allow(dead_code)]
 pub(crate) const CUDA_ERROR_NOT_READY: i32 = 34;
 
-unsafe extern "C" {
-    pub(crate) fn cudaHostAlloc(pHost: *mut *mut c_void, size: usize, flags: u32) -> i32;
-    pub(crate) fn cudaFreeHost(ptr: *mut c_void) -> i32;
+// ---- CUDA backend: bind libcudart directly ----
+#[cfg(all(feature = "cuda", not(feature = "hip")))]
+pub(crate) use cuda_rt::*;
+
+#[cfg(all(feature = "cuda", not(feature = "hip")))]
+mod cuda_rt {
+    use super::CudaPointerAttributes;
+    use std::ffi::c_void;
+
+    unsafe extern "C" {
+        pub(crate) fn cudaHostAlloc(pHost: *mut *mut c_void, size: usize, flags: u32) -> i32;
+        pub(crate) fn cudaFreeHost(ptr: *mut c_void) -> i32;
+
+        #[allow(dead_code)]
+        pub(crate) fn cudaPointerGetAttributes(
+            attributes: *mut CudaPointerAttributes,
+            ptr: *const c_void,
+        ) -> i32;
+
+        pub(crate) fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32;
+
+        pub(crate) fn cudaMemcpyAsync(
+            dst: *mut c_void,
+            src: *const c_void,
+            count: usize,
+            kind: u32,
+            stream: *mut c_void,
+        ) -> i32;
+
+        #[allow(dead_code)]
+        pub(crate) fn cudaMemcpy(
+            dst: *mut c_void,
+            src: *const c_void,
+            count: usize,
+            kind: u32,
+        ) -> i32;
+
+        pub(crate) fn cudaEventCreateWithFlags(event: *mut *mut c_void, flags: u32) -> i32;
+        pub(crate) fn cudaEventRecord(event: *mut c_void, stream: *mut c_void) -> i32;
+        pub(crate) fn cudaEventDestroy(event: *mut c_void) -> i32;
+        pub(crate) fn cudaStreamWaitEvent(
+            stream: *mut c_void,
+            event: *mut c_void,
+            flags: u32,
+        ) -> i32;
+        pub(crate) fn cudaStreamSynchronize(stream: *mut c_void) -> i32;
+
+        pub(crate) fn cudaMemsetAsync(
+            devPtr: *mut c_void,
+            value: i32,
+            count: usize,
+            stream: *mut c_void,
+        ) -> i32;
+
+        #[allow(dead_code)]
+        pub(crate) fn cudaEventQuery(event: *mut c_void) -> i32;
+        pub(crate) fn cudaEventSynchronize(event: *mut c_void) -> i32;
+        pub(crate) fn cudaEventElapsedTime(
+            ms: *mut f32,
+            start: *mut c_void,
+            end: *mut c_void,
+        ) -> i32;
+    }
+}
+
+// ---- HIP backend: bind libamdhip64, expose the same cuda* names ----
+#[cfg(feature = "hip")]
+pub(crate) use hip_rt::*;
+
+// The wrapper functions deliberately keep the cuda* spelling so call sites are
+// vendor-agnostic; suppress the snake_case lint for that intentional naming.
+#[cfg(feature = "hip")]
+#[allow(non_snake_case)]
+mod hip_rt {
+    use super::{CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, CudaPointerAttributes};
+    use std::ffi::c_void;
+
+    // hipMemoryType enum values are NOT guaranteed equal to CUDA's across ROCm
+    // releases (older HIP used Host=0/Device=1; the hip_runtime_api.h note flags
+    // this explicitly). So we read the real hipPointerAttribute_t and compare its
+    // `type` against the named hipMemoryType* constants rather than a magic
+    // number, then translate to the CUDA convention the caller expects.
+    const HIP_MEMORY_TYPE_DEVICE: i32 = 2; // hipMemoryTypeDevice
+    const HIP_MEMORY_TYPE_MANAGED: i32 = 3; // hipMemoryTypeManaged
+
+    // Mirror of hipPointerAttribute_t (ROCm hip_runtime_api.h): the leading
+    // `type` field is the hipMemoryType enum read by cudaPointerGetAttributes.
+    #[repr(C)]
+    struct HipPointerAttributes {
+        memory_type: i32,
+        device: i32,
+        device_pointer: *mut c_void,
+        host_pointer: *mut c_void,
+        is_managed: i32,
+        allocation_flags: u32,
+    }
+
+    unsafe extern "C" {
+        fn hipHostMalloc(ptr: *mut *mut c_void, size: usize, flags: u32) -> i32;
+        fn hipHostFree(ptr: *mut c_void) -> i32;
+        fn hipPointerGetAttributes(attributes: *mut c_void, ptr: *const c_void) -> i32;
+        fn hipMemGetInfo(free: *mut usize, total: *mut usize) -> i32;
+        fn hipMemcpyAsync(
+            dst: *mut c_void,
+            src: *const c_void,
+            count: usize,
+            kind: u32,
+            stream: *mut c_void,
+        ) -> i32;
+        fn hipMemcpy(dst: *mut c_void, src: *const c_void, count: usize, kind: u32) -> i32;
+        fn hipEventCreateWithFlags(event: *mut *mut c_void, flags: u32) -> i32;
+        fn hipEventRecord(event: *mut c_void, stream: *mut c_void) -> i32;
+        fn hipEventDestroy(event: *mut c_void) -> i32;
+        fn hipStreamWaitEvent(stream: *mut c_void, event: *mut c_void, flags: u32) -> i32;
+        fn hipStreamSynchronize(stream: *mut c_void) -> i32;
+        fn hipMemsetAsync(dst: *mut c_void, value: i32, count: usize, stream: *mut c_void) -> i32;
+        fn hipEventQuery(event: *mut c_void) -> i32;
+        fn hipEventSynchronize(event: *mut c_void) -> i32;
+        fn hipEventElapsedTime(ms: *mut f32, start: *mut c_void, end: *mut c_void) -> i32;
+    }
+
+    // hipHostMallocDefault == 0, matching cudaHostAllocDefault used by callers.
+    pub(crate) unsafe fn cudaHostAlloc(p: *mut *mut c_void, size: usize, flags: u32) -> i32 {
+        unsafe { hipHostMalloc(p, size, flags) }
+    }
+    pub(crate) unsafe fn cudaFreeHost(ptr: *mut c_void) -> i32 {
+        unsafe { hipHostFree(ptr) }
+    }
 
     #[allow(dead_code)]
-    pub(crate) fn cudaPointerGetAttributes(
+    pub(crate) unsafe fn cudaPointerGetAttributes(
         attributes: *mut CudaPointerAttributes,
         ptr: *const c_void,
-    ) -> i32;
+    ) -> i32 {
+        let mut hip_attrs = HipPointerAttributes {
+            memory_type: 0,
+            device: 0,
+            device_pointer: std::ptr::null_mut(),
+            host_pointer: std::ptr::null_mut(),
+            is_managed: 0,
+            allocation_flags: 0,
+        };
+        let ret = unsafe { hipPointerGetAttributes(&mut hip_attrs as *mut _ as *mut c_void, ptr) };
+        if ret != 0 {
+            return ret;
+        }
+        // Translate the hipMemoryType enum to the CUDA convention the caller
+        // checks against, comparing the named hipMemoryType* values explicitly
+        // (do not assume the numeric enum equals CUDA's). Anything else stays
+        // verbatim so the caller's "not device memory" branch still fires.
+        let memory_type = match hip_attrs.memory_type {
+            HIP_MEMORY_TYPE_DEVICE => CUDA_MEMORY_TYPE_DEVICE,
+            HIP_MEMORY_TYPE_MANAGED => CUDA_MEMORY_TYPE_MANAGED,
+            other => other,
+        };
+        unsafe {
+            *attributes = CudaPointerAttributes {
+                memory_type,
+                device: hip_attrs.device,
+                device_pointer: hip_attrs.device_pointer,
+                host_pointer: hip_attrs.host_pointer,
+                is_managed: hip_attrs.is_managed,
+                allocation_flags: hip_attrs.allocation_flags,
+            };
+        }
+        0
+    }
 
-    pub(crate) fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32;
+    pub(crate) unsafe fn cudaMemGetInfo(free: *mut usize, total: *mut usize) -> i32 {
+        unsafe { hipMemGetInfo(free, total) }
+    }
 
-    pub(crate) fn cudaMemcpyAsync(
+    // hipMemcpyAsync is the exact 1:1 of cudaMemcpyAsync: it enqueues on the
+    // stream and returns without blocking the host, preserving the dual-stream
+    // H2D/compute overlap. (hipMemcpyWithStream would synchronize the stream
+    // before returning, serializing the pipeline.)
+    pub(crate) unsafe fn cudaMemcpyAsync(
         dst: *mut c_void,
         src: *const c_void,
         count: usize,
         kind: u32,
         stream: *mut c_void,
-    ) -> i32;
+    ) -> i32 {
+        unsafe { hipMemcpyAsync(dst, src, count, kind, stream) }
+    }
 
-    pub(crate) fn cudaEventCreateWithFlags(event: *mut *mut c_void, flags: u32) -> i32;
-    pub(crate) fn cudaEventRecord(event: *mut c_void, stream: *mut c_void) -> i32;
-    pub(crate) fn cudaEventDestroy(event: *mut c_void) -> i32;
-    pub(crate) fn cudaStreamWaitEvent(stream: *mut c_void, event: *mut c_void, flags: u32) -> i32;
-    pub(crate) fn cudaStreamSynchronize(stream: *mut c_void) -> i32;
+    #[allow(dead_code)]
+    pub(crate) unsafe fn cudaMemcpy(
+        dst: *mut c_void,
+        src: *const c_void,
+        count: usize,
+        kind: u32,
+    ) -> i32 {
+        unsafe { hipMemcpy(dst, src, count, kind) }
+    }
 
-    pub(crate) fn cudaMemsetAsync(
-        devPtr: *mut c_void,
+    pub(crate) unsafe fn cudaEventCreateWithFlags(event: *mut *mut c_void, flags: u32) -> i32 {
+        unsafe { hipEventCreateWithFlags(event, flags) }
+    }
+    pub(crate) unsafe fn cudaEventRecord(event: *mut c_void, stream: *mut c_void) -> i32 {
+        unsafe { hipEventRecord(event, stream) }
+    }
+    pub(crate) unsafe fn cudaEventDestroy(event: *mut c_void) -> i32 {
+        unsafe { hipEventDestroy(event) }
+    }
+    pub(crate) unsafe fn cudaStreamWaitEvent(
+        stream: *mut c_void,
+        event: *mut c_void,
+        flags: u32,
+    ) -> i32 {
+        unsafe { hipStreamWaitEvent(stream, event, flags) }
+    }
+    pub(crate) unsafe fn cudaStreamSynchronize(stream: *mut c_void) -> i32 {
+        unsafe { hipStreamSynchronize(stream) }
+    }
+    pub(crate) unsafe fn cudaMemsetAsync(
+        dev_ptr: *mut c_void,
         value: i32,
         count: usize,
         stream: *mut c_void,
-    ) -> i32;
-
-    /// Non-blocking event query
-    ///
-    /// Returns CUDA_SUCCESS if the event has completed, CUDA_ERROR_NOT_READY if not.
-    /// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html
-    ///
-    /// Note: May be used in future optimizations for non-blocking event checks to reduce
-    /// synchronization overhead in pipeline operations.
+    ) -> i32 {
+        unsafe { hipMemsetAsync(dev_ptr, value, count, stream) }
+    }
+
     #[allow(dead_code)]
-    pub(crate) fn cudaEventQuery(event: *mut c_void) -> i32;
-
-    /// Blocking event synchronization
-    ///
-    /// Waits until the completion of all work currently captured in the event.
-    /// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html
-    pub(crate) fn cudaEventSynchronize(event: *mut c_void) -> i32;
-
-    /// Calculate elapsed time between two events (in milliseconds)
-    ///
-    /// Both events must have been created with CUDA_EVENT_DEFAULT flag.
-    /// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html
-    pub(crate) fn cudaEventElapsedTime(ms: *mut f32, start: *mut c_void, end: *mut c_void) -> i32;
+    pub(crate) unsafe fn cudaEventQuery(event: *mut c_void) -> i32 {
+        unsafe { hipEventQuery(event) }
+    }
+    pub(crate) unsafe fn cudaEventSynchronize(event: *mut c_void) -> i32 {
+        unsafe { hipEventSynchronize(event) }
+    }
+    pub(crate) unsafe fn cudaEventElapsedTime(
+        ms: *mut f32,
+        start: *mut c_void,
+        end: *mut c_void,
+    ) -> i32 {
+        unsafe { hipEventElapsedTime(ms, start, end) }
+    }
 }
diff --git a/qdp/qdp-core/src/gpu/cuda_sync.rs b/qdp/qdp-core/src/gpu/cuda_sync.rs
index 077da077dc..2a33d8f276 100644
--- a/qdp/qdp-core/src/gpu/cuda_sync.rs
+++ b/qdp/qdp-core/src/gpu/cuda_sync.rs
@@ -16,7 +16,7 @@
 
 //! Shared CUDA stream synchronization with unified error reporting.
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use std::ffi::c_void;
 
 use crate::error::{MahoutError, Result, cuda_error_to_string};
@@ -32,7 +32,7 @@ use crate::error::{MahoutError, Result, cuda_error_to_string};
 ///
 /// # Safety
 /// The stream pointer must be valid for the duration of this call.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub(crate) fn sync_cuda_stream(stream: *mut c_void, context: &str) -> Result<()> {
     let ret = unsafe { crate::gpu::cuda_ffi::cudaStreamSynchronize(stream) };
     if ret != 0 {
@@ -46,7 +46,7 @@ pub(crate) fn sync_cuda_stream(stream: *mut c_void, context: &str) -> Result<()>
     Ok(())
 }
 
-#[cfg(all(test, target_os = "linux"))]
+#[cfg(all(test, qdp_gpu_platform))]
 mod tests {
     use super::*;
     use std::ffi::c_void;
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
index 7cf70d9ec5..798498b4a7 100644
--- a/qdp/qdp-core/src/gpu/encodings/amplitude.rs
+++ b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
@@ -23,27 +23,27 @@
 use std::sync::Arc;
 
 use super::QuantumEncoder;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::{GpuStateVector, Precision};
 use crate::gpu::pipeline::run_dual_stream_pipeline;
-use cudarc::driver::CudaDevice;
+use crate::gpu_rt::CudaDevice;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::cuda_ffi::cudaMemsetAsync;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::cuda_sync::sync_cuda_stream;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
-#[cfg(target_os = "linux")]
-use cudarc::driver::{DevicePtr, DevicePtrMut};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
+use crate::gpu_rt::{DevicePtr, DevicePtrMut};
+#[cfg(qdp_gpu_platform)]
 use qdp_kernels::{
     launch_amplitude_encode, launch_amplitude_encode_batch, launch_amplitude_encode_batch_f32,
     launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32, launch_l2_norm_f32,
 };
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use std::ffi::c_void;
 
 use crate::preprocessing::Preprocessor;
@@ -65,7 +65,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         Preprocessor::validate_input(host_data, num_qubits)?;
         let state_len = 1 << num_qubits;
 
-        #[cfg(target_os = "linux")]
+        #[cfg(qdp_gpu_platform)]
         {
             // Allocate GPU state vector
             let state_vector = {
@@ -178,7 +178,7 @@ impl QuantumEncoder for AmplitudeEncoder {
             Ok(state_vector)
         }
 
-        #[cfg(not(target_os = "linux"))]
+        #[cfg(not(qdp_gpu_platform))]
         {
             Err(MahoutError::Cuda(
                 "CUDA unavailable (non-Linux stub)".to_string(),
@@ -187,7 +187,7 @@ impl QuantumEncoder for AmplitudeEncoder {
     }
 
     /// Encode multiple samples in a single GPU allocation and kernel launch
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch(
         &self,
         device: &Arc<CudaDevice>,
@@ -299,7 +299,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
@@ -361,7 +361,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         Ok(state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
@@ -390,7 +390,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         };
         let inv_norms_gpu = {
             crate::profile_scope!("GPU::BatchNormKernel");
-            use cudarc::driver::DevicePtrMut;
+            use crate::gpu_rt::DevicePtrMut;
             let mut buffer = device.alloc_zeros::<f64>(num_samples).map_err(|e| {
                 MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
             })?;
@@ -414,6 +414,12 @@ impl QuantumEncoder for AmplitudeEncoder {
         };
         {
             crate::profile_scope!("GPU::NormValidation");
+            // The norm kernel ran on the caller's stream, but dtoh_sync_copy reads
+            // back on the default stream. Synchronize the caller's stream first so
+            // the result is visible: with a non-blocking stream (which does not
+            // implicitly order against the default stream) the readback would
+            // otherwise race and observe the zero-initialized buffer.
+            sync_cuda_stream(stream, "Norm stream synchronize failed (batch)")?;
             let host_inv_norms = device
                 .dtoh_sync_copy(&inv_norms_gpu)
                 .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
@@ -425,7 +431,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         }
         {
             crate::profile_scope!("GPU::BatchKernelLaunch");
-            use cudarc::driver::DevicePtr;
+            use crate::gpu_rt::DevicePtr;
             let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
                 MahoutError::InvalidInput(
                     "Batch state vector precision mismatch (expected float64 buffer)".to_string(),
@@ -458,7 +464,7 @@ impl QuantumEncoder for AmplitudeEncoder {
     }
 
     /// Encode multiple samples in a single GPU allocation and kernel launch for f32 inputs
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch_f32(
         &self,
         device: &Arc<CudaDevice>,
@@ -514,7 +520,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         // Compute inverse norms on GPU using warp-reduced kernel
         let inv_norms_gpu = {
             crate::profile_scope!("GPU::BatchNormKernel_f32");
-            use cudarc::driver::DevicePtrMut;
+            use crate::gpu_rt::DevicePtrMut;
             let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
                 MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
             })?;
@@ -556,7 +562,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         // Launch batch kernel
         {
             crate::profile_scope!("GPU::BatchKernelLaunch_f32");
-            use cudarc::driver::DevicePtr;
+            use crate::gpu_rt::DevicePtr;
             let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
                 MahoutError::InvalidInput(
                     "Batch state vector precision mismatch (expected float32 buffer)".to_string(),
@@ -593,7 +599,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr_f32(
         &self,
         device: &Arc<CudaDevice>,
@@ -622,7 +628,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         };
         let inv_norms_gpu = {
             crate::profile_scope!("GPU::BatchNormKernel_f32");
-            use cudarc::driver::DevicePtrMut;
+            use crate::gpu_rt::DevicePtrMut;
             let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
                 MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
             })?;
@@ -646,6 +652,12 @@ impl QuantumEncoder for AmplitudeEncoder {
         };
         {
             crate::profile_scope!("GPU::NormValidation_f32");
+            // The norm kernel ran on the caller's stream, but dtoh_sync_copy reads
+            // back on the default stream. Synchronize the caller's stream first so
+            // the result is visible: with a non-blocking stream (which does not
+            // implicitly order against the default stream) the readback would
+            // otherwise race and observe the zero-initialized buffer.
+            sync_cuda_stream(stream, "Norm stream synchronize failed (batch f32)")?;
             let host_inv_norms = device
                 .dtoh_sync_copy(&inv_norms_gpu)
                 .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
@@ -657,7 +669,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         }
         {
             crate::profile_scope!("GPU::BatchKernelLaunch_f32");
-            use cudarc::driver::DevicePtr;
+            use crate::gpu_rt::DevicePtr;
             let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
                 MahoutError::InvalidInput(
                     "Batch state vector precision mismatch (expected float32 buffer)".to_string(),
@@ -689,7 +701,7 @@ impl QuantumEncoder for AmplitudeEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr_f32(
         &self,
         device: &Arc<CudaDevice>,
@@ -726,7 +738,7 @@ impl AmplitudeEncoder {
     /// data transfer and computation. The pipeline handles all the
     /// streaming mechanics, while this method focuses on the amplitude
     /// encoding kernel logic.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub(crate) fn encode_async_pipeline(
         device: &Arc<CudaDevice>,
         host_data: &[f64],
@@ -845,7 +857,7 @@ impl AmplitudeEncoder {
     /// Caller must ensure `input_d` points to at least `input_len` `f32` values in
     /// GPU-accessible memory on the same device as `device`, and `stream` is either
     /// null or a valid CUDA stream associated with `device`.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
         device: &Arc<CudaDevice>,
         input_d: *const f32,
@@ -919,7 +931,7 @@ impl AmplitudeEncoder {
     /// # Safety
     /// The caller must ensure `input_batch_d` points to valid GPU memory containing
     /// at least `num_samples * sample_size` f32 elements on the same device as `device`.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
         device: &Arc<CudaDevice>,
         input_batch_d: *const f32,
@@ -951,7 +963,7 @@ impl AmplitudeEncoder {
 
         let inv_norms_gpu = {
             crate::profile_scope!("GPU::BatchNormKernelF32");
-            use cudarc::driver::DevicePtrMut;
+            use crate::gpu_rt::DevicePtrMut;
 
             let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
                 MahoutError::MemoryAllocation(format!(
@@ -980,6 +992,13 @@ impl AmplitudeEncoder {
 
         {
             crate::profile_scope!("GPU::NormValidationF32");
+            // The norm kernel ran on the caller's stream, but dtoh_sync_copy
+            // reads back on the default stream. Synchronize the caller's stream
+            // first so the result is visible: with a non-blocking stream (which
+            // does not implicitly order against the default stream) the readback
+            // would otherwise race and observe the zero-initialized buffer. This
+            // mirrors the single-sample path (calculate_inv_norm_gpu_with_stream).
+            sync_cuda_stream(stream, "Norm stream synchronize failed (batch f32)")?;
             let host_inv_norms = device.dtoh_sync_copy(&inv_norms_gpu).map_err(|e| {
                 MahoutError::Cuda(format!("Failed to copy f32 norms to host: {:?}", e))
             })?;
@@ -992,7 +1011,7 @@ impl AmplitudeEncoder {
 
         {
             crate::profile_scope!("GPU::BatchKernelLaunchF32");
-            use cudarc::driver::DevicePtr;
+            use crate::gpu_rt::DevicePtr;
 
             let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
                 MahoutError::InvalidInput(
@@ -1040,7 +1059,7 @@ impl AmplitudeEncoder {
     /// # Safety
     /// The caller must ensure `input_ptr` points to valid GPU memory containing
     /// at least `len` f64 elements on the same device as `device`.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub(crate) unsafe fn calculate_inv_norm_gpu(
         device: &Arc<CudaDevice>,
         input_ptr: *const f64,
@@ -1051,7 +1070,7 @@ impl AmplitudeEncoder {
         }
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub(crate) unsafe fn calculate_inv_norm_gpu_with_stream(
         device: &Arc<CudaDevice>,
         input_ptr: *const f64,
@@ -1111,7 +1130,7 @@ impl AmplitudeEncoder {
     /// # Safety
     /// The caller must ensure `input_ptr` points to valid GPU memory containing
     /// at least `len` f32 elements on the same device as `device`.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn calculate_inv_norm_gpu_f32(
         device: &Arc<CudaDevice>,
         input_ptr: *const f32,
@@ -1132,7 +1151,7 @@ impl AmplitudeEncoder {
     /// # Safety
     /// The caller must ensure `input_ptr` points to valid GPU memory containing
     /// at least `len` f32 elements on the same device as `device`.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn calculate_inv_norm_gpu_f32_with_stream(
         device: &Arc<CudaDevice>,
         input_ptr: *const f32,
@@ -1180,7 +1199,7 @@ impl AmplitudeEncoder {
     }
 
     /// Run dual-stream pipeline for amplitude encoding (exposed for Python / benchmark).
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub(crate) fn run_amplitude_dual_stream_pipeline(
         device: &Arc<CudaDevice>,
         host_data: &[f64],
diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 36e784e035..313489d916 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -21,20 +21,20 @@
 #![allow(unused_unsafe)]
 
 use super::{QuantumEncoder, validate_qubit_count};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::{GpuStateVector, Precision};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::pipeline::run_dual_stream_pipeline_aligned;
-use cudarc::driver::CudaDevice;
+use crate::gpu_rt::CudaDevice;
 use std::sync::Arc;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::memory::map_allocation_error;
-#[cfg(target_os = "linux")]
-use cudarc::driver::DevicePtr;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
+use crate::gpu_rt::DevicePtr;
+#[cfg(qdp_gpu_platform)]
 use std::ffi::c_void;
 
 /// Angle encoding: each qubit uses one rotation angle to form a product state.
@@ -43,15 +43,15 @@ pub struct AngleEncoder;
 impl QuantumEncoder for AngleEncoder {
     fn encode(
         &self,
-        #[cfg(target_os = "linux")] device: &Arc<CudaDevice>,
-        #[cfg(not(target_os = "linux"))] _device: &Arc<CudaDevice>,
+        #[cfg(qdp_gpu_platform)] device: &Arc<CudaDevice>,
+        #[cfg(not(qdp_gpu_platform))] _device: &Arc<CudaDevice>,
         data: &[f64],
         num_qubits: usize,
     ) -> Result<GpuStateVector> {
         self.validate_input(data, num_qubits)?;
         let state_len = 1 << num_qubits;
 
-        #[cfg(target_os = "linux")]
+        #[cfg(qdp_gpu_platform)]
         {
             let input_bytes = std::mem::size_of_val(data);
             let angles_gpu = {
@@ -103,7 +103,7 @@ impl QuantumEncoder for AngleEncoder {
             Ok(state_vector)
         }
 
-        #[cfg(not(target_os = "linux"))]
+        #[cfg(not(qdp_gpu_platform))]
         {
             Err(MahoutError::Cuda(
                 "CUDA unavailable (non-Linux stub)".to_string(),
@@ -112,7 +112,7 @@ impl QuantumEncoder for AngleEncoder {
     }
 
     /// Encode multiple angle samples in a single GPU allocation and kernel launch
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch(
         &self,
         device: &Arc<CudaDevice>,
@@ -229,7 +229,7 @@ impl QuantumEncoder for AngleEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
@@ -294,7 +294,7 @@ impl QuantumEncoder for AngleEncoder {
         Ok(state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
@@ -374,7 +374,7 @@ impl QuantumEncoder for AngleEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch_f32(
         &self,
         device: &Arc<CudaDevice>,
@@ -494,7 +494,7 @@ impl QuantumEncoder for AngleEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr_f32(
         &self,
         device: &Arc<CudaDevice>,
@@ -595,7 +595,7 @@ impl QuantumEncoder for AngleEncoder {
         Ok(())
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr_f32(
         &self,
         device: &Arc<CudaDevice>,
@@ -636,7 +636,7 @@ impl AngleEncoder {
     /// The caller must also ensure that `stream` is either null or a valid CUDA stream handle
     /// associated with `device`, and that no concurrent use of these raw pointers violates Rust's
     /// aliasing or lifetime rules.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
         device: &Arc<CudaDevice>,
         input_d: *const f32,
@@ -717,7 +717,7 @@ impl AngleEncoder {
     /// valid for the duration of this call. The caller must also ensure that `stream` is either
     /// null or a valid CUDA stream handle associated with `device`, and that the memory layout is
     /// row-major with exactly `sample_size` angles per sample.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
         device: &Arc<CudaDevice>,
         input_batch_d: *const f32,
@@ -738,7 +738,7 @@ impl AngleEncoder {
         }
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch_async_pipeline(
         device: &Arc<CudaDevice>,
         batch_data: &[f64],
@@ -802,7 +802,7 @@ impl AngleEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch_async_pipeline_f32(
         device: &Arc<CudaDevice>,
         batch_data: &[f32],
diff --git a/qdp/qdp-core/src/gpu/encodings/basis.rs b/qdp/qdp-core/src/gpu/encodings/basis.rs
index 4f2cec15e6..16e09d735a 100644
--- a/qdp/qdp-core/src/gpu/encodings/basis.rs
+++ b/qdp/qdp-core/src/gpu/encodings/basis.rs
@@ -21,18 +21,18 @@
 #![allow(unused_unsafe)]
 
 use super::{QuantumEncoder, validate_qubit_count};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::{GpuStateVector, Precision};
-use cudarc::driver::CudaDevice;
+use crate::gpu_rt::CudaDevice;
 use std::sync::Arc;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::memory::map_allocation_error;
-#[cfg(target_os = "linux")]
-use cudarc::driver::DevicePtr;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
+use crate::gpu_rt::DevicePtr;
+#[cfg(qdp_gpu_platform)]
 use std::ffi::c_void;
 
 /// Basis encoding: maps an integer index to a computational basis state.
@@ -51,8 +51,8 @@ pub struct BasisEncoder;
 impl QuantumEncoder for BasisEncoder {
     fn encode(
         &self,
-        #[cfg(target_os = "linux")] device: &Arc<CudaDevice>,
-        #[cfg(not(target_os = "linux"))] _device: &Arc<CudaDevice>,
+        #[cfg(qdp_gpu_platform)] device: &Arc<CudaDevice>,
+        #[cfg(not(qdp_gpu_platform))] _device: &Arc<CudaDevice>,
         data: &[f64],
         num_qubits: usize,
     ) -> Result<GpuStateVector> {
@@ -69,7 +69,7 @@ impl QuantumEncoder for BasisEncoder {
 
         let state_len = 1 << num_qubits;
 
-        #[cfg(target_os = "linux")]
+        #[cfg(qdp_gpu_platform)]
         {
             // Convert and validate the basis index
             let basis_index = Self::validate_basis_index(data[0], state_len)?;
@@ -116,7 +116,7 @@ impl QuantumEncoder for BasisEncoder {
             Ok(state_vector)
         }
 
-        #[cfg(not(target_os = "linux"))]
+        #[cfg(not(qdp_gpu_platform))]
         {
             Err(MahoutError::Cuda(
                 "CUDA unavailable (non-Linux stub)".to_string(),
@@ -125,7 +125,7 @@ impl QuantumEncoder for BasisEncoder {
     }
 
     /// Encode multiple basis indices in a single GPU allocation and kernel launch
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch(
         &self,
         device: &Arc<CudaDevice>,
@@ -225,7 +225,7 @@ impl QuantumEncoder for BasisEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
@@ -291,7 +291,7 @@ impl QuantumEncoder for BasisEncoder {
         Ok(state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
@@ -374,7 +374,7 @@ impl QuantumEncoder for BasisEncoder {
         Ok(())
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch_f32(
         &self,
         device: &Arc<CudaDevice>,
@@ -466,7 +466,7 @@ impl QuantumEncoder for BasisEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr_f32(
         &self,
         device: &Arc<CudaDevice>,
@@ -541,7 +541,7 @@ impl QuantumEncoder for BasisEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr_f32(
         &self,
         device: &Arc<CudaDevice>,
@@ -584,7 +584,7 @@ impl BasisEncoder {
     /// contiguous `f32` values in GPU-accessible memory and remains valid for the
     /// duration of this call. `stream` must be either null or a valid CUDA stream
     /// handle associated with `device`.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
         device: &Arc<CudaDevice>,
         input_batch_d: *const f32,
@@ -613,7 +613,7 @@ impl BasisEncoder {
     /// `input_d` must point to one valid `f32` in GPU-accessible memory on `device`,
     /// remain valid for the duration of this call, and `stream` must be either null
     /// or a valid CUDA stream handle associated with `device`.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
         device: &Arc<CudaDevice>,
         input_d: *const f32,
diff --git a/qdp/qdp-core/src/gpu/encodings/iqp.rs b/qdp/qdp-core/src/gpu/encodings/iqp.rs
index 33d18cfaf0..ce358724c7 100644
--- a/qdp/qdp-core/src/gpu/encodings/iqp.rs
+++ b/qdp/qdp-core/src/gpu/encodings/iqp.rs
@@ -17,19 +17,19 @@
 // IQP (Instantaneous Quantum Polynomial) encoding: entangled quantum states via diagonal phases.
 
 use super::{QuantumEncoder, validate_qubit_count};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::{GpuStateVector, Precision};
-use cudarc::driver::CudaDevice;
+use crate::gpu_rt::CudaDevice;
 use std::sync::Arc;
 use std::sync::OnceLock;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::memory::map_allocation_error;
-#[cfg(target_os = "linux")]
-use cudarc::driver::DevicePtr;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
+use crate::gpu_rt::DevicePtr;
+#[cfg(qdp_gpu_platform)]
 use std::ffi::c_void;
 
 /// IQP encoding: creates entangled quantum states using diagonal phase gates.
@@ -69,15 +69,15 @@ impl IqpEncoder {
 impl QuantumEncoder for IqpEncoder {
     fn encode(
         &self,
-        #[cfg(target_os = "linux")] device: &Arc<CudaDevice>,
-        #[cfg(not(target_os = "linux"))] _device: &Arc<CudaDevice>,
+        #[cfg(qdp_gpu_platform)] device: &Arc<CudaDevice>,
+        #[cfg(not(qdp_gpu_platform))] _device: &Arc<CudaDevice>,
         data: &[f64],
         num_qubits: usize,
     ) -> Result<GpuStateVector> {
         self.validate_input(data, num_qubits)?;
         let state_len = 1 << num_qubits;
 
-        #[cfg(target_os = "linux")]
+        #[cfg(qdp_gpu_platform)]
         {
             let input_bytes = std::mem::size_of_val(data);
             let data_gpu = {
@@ -130,7 +130,7 @@ impl QuantumEncoder for IqpEncoder {
             Ok(state_vector)
         }
 
-        #[cfg(not(target_os = "linux"))]
+        #[cfg(not(qdp_gpu_platform))]
         {
             Err(MahoutError::Cuda(
                 "CUDA unavailable (non-Linux stub)".to_string(),
@@ -139,7 +139,7 @@ impl QuantumEncoder for IqpEncoder {
     }
 
     /// Encode multiple IQP samples in a single GPU allocation and kernel launch
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch(
         &self,
         device: &Arc<CudaDevice>,
@@ -238,7 +238,7 @@ impl QuantumEncoder for IqpEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
@@ -302,7 +302,7 @@ impl QuantumEncoder for IqpEncoder {
         Ok(state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
diff --git a/qdp/qdp-core/src/gpu/encodings/mod.rs b/qdp/qdp-core/src/gpu/encodings/mod.rs
index 8d0fd5b4cd..208da9423b 100644
--- a/qdp/qdp-core/src/gpu/encodings/mod.rs
+++ b/qdp/qdp-core/src/gpu/encodings/mod.rs
@@ -20,9 +20,9 @@ use std::sync::Arc;
 
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::GpuStateVector;
+use crate::gpu_rt::CudaDevice;
 use crate::preprocessing::Preprocessor;
-use cudarc::driver::CudaDevice;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use std::ffi::c_void;
 
 /// Maximum number of qubits supported (16GB GPU memory limit)
@@ -98,7 +98,7 @@ pub trait QuantumEncoder: Send + Sync + 'static {
     /// # Safety
     /// Caller must ensure `input_d` points to valid GPU memory with at least `input_len` elements
     /// of the expected dtype on the same device as `device`, and `stream` is a valid CUDA stream or null.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr(
         &self,
         _device: &Arc<CudaDevice>,
@@ -119,7 +119,7 @@ pub trait QuantumEncoder: Send + Sync + 'static {
     /// Caller must ensure `input_batch_d` points to valid GPU memory with at least
     /// `num_samples * sample_size` elements of the expected dtype on the same device as `device`,
     /// and `stream` is a valid CUDA stream or null.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr(
         &self,
         _device: &Arc<CudaDevice>,
@@ -148,7 +148,7 @@ pub trait QuantumEncoder: Send + Sync + 'static {
     /// Caller must ensure `input_d` points to valid GPU memory with at least `input_len`
     /// `f32` elements on the same device as `device`, and `stream` is either null or a valid
     /// CUDA stream associated with `device`.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr_f32(
         &self,
         _device: &Arc<CudaDevice>,
@@ -182,7 +182,7 @@ pub trait QuantumEncoder: Send + Sync + 'static {
     ///
     /// # Safety
     /// Caller must ensure `input_batch_d` points to valid GPU memory (f32).
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr_f32(
         &self,
         _device: &Arc<CudaDevice>,
diff --git a/qdp/qdp-core/src/gpu/encodings/phase.rs b/qdp/qdp-core/src/gpu/encodings/phase.rs
index b329a2429e..bacb4ad0d6 100644
--- a/qdp/qdp-core/src/gpu/encodings/phase.rs
+++ b/qdp/qdp-core/src/gpu/encodings/phase.rs
@@ -34,20 +34,20 @@
 #![allow(unused_unsafe)]
 
 use super::{QuantumEncoder, validate_qubit_count};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::error::cuda_error_to_string;
 use crate::error::{MahoutError, Result};
 use crate::gpu::memory::{GpuStateVector, Precision};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::pipeline::run_dual_stream_pipeline_aligned;
-use cudarc::driver::CudaDevice;
+use crate::gpu_rt::CudaDevice;
 use std::sync::Arc;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::memory::map_allocation_error;
-#[cfg(target_os = "linux")]
-use cudarc::driver::DevicePtr;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
+use crate::gpu_rt::DevicePtr;
+#[cfg(qdp_gpu_platform)]
 use std::ffi::c_void;
 
 /// Phase encoding: per-qubit P(φ = x_k) gates applied to |+⟩^⊗N.
@@ -66,15 +66,15 @@ pub struct PhaseEncoder;
 impl QuantumEncoder for PhaseEncoder {
     fn encode(
         &self,
-        #[cfg(target_os = "linux")] device: &Arc<CudaDevice>,
-        #[cfg(not(target_os = "linux"))] _device: &Arc<CudaDevice>,
+        #[cfg(qdp_gpu_platform)] device: &Arc<CudaDevice>,
+        #[cfg(not(qdp_gpu_platform))] _device: &Arc<CudaDevice>,
         data: &[f64],
         num_qubits: usize,
     ) -> Result<GpuStateVector> {
         self.validate_input(data, num_qubits)?;
         let state_len = 1 << num_qubits;
 
-        #[cfg(target_os = "linux")]
+        #[cfg(qdp_gpu_platform)]
         {
             let input_bytes = std::mem::size_of_val(data);
             let phases_gpu = {
@@ -126,7 +126,7 @@ impl QuantumEncoder for PhaseEncoder {
             Ok(state_vector)
         }
 
-        #[cfg(not(target_os = "linux"))]
+        #[cfg(not(qdp_gpu_platform))]
         {
             Err(MahoutError::Cuda(
                 "CUDA unavailable (non-Linux stub)".to_string(),
@@ -135,7 +135,7 @@ impl QuantumEncoder for PhaseEncoder {
     }
 
     /// Encode multiple phase samples in a single GPU allocation and kernel launch.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch(
         &self,
         device: &Arc<CudaDevice>,
@@ -241,7 +241,7 @@ impl QuantumEncoder for PhaseEncoder {
         Ok(batch_state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
@@ -297,7 +297,7 @@ impl QuantumEncoder for PhaseEncoder {
         Ok(state_vector)
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     unsafe fn encode_batch_from_gpu_ptr(
         &self,
         device: &Arc<CudaDevice>,
@@ -325,7 +325,7 @@ impl QuantumEncoder for PhaseEncoder {
         // through the norm and is caught on the host side.
         let phase_validation_buffer = {
             crate::profile_scope!("GPU::PhaseFiniteCheckBatch");
-            use cudarc::driver::DevicePtrMut;
+            use crate::gpu_rt::DevicePtrMut;
             let mut buffer = device.alloc_zeros::<f64>(num_samples).map_err(|e| {
                 MahoutError::MemoryAllocation(format!(
                     "Failed to allocate phase validation buffer: {:?}",
@@ -353,6 +353,15 @@ impl QuantumEncoder for PhaseEncoder {
 
         {
             crate::profile_scope!("GPU::PhaseFiniteValidationHostCopy");
+            // The norm probe ran on the caller's stream, but dtoh_sync_copy reads
+            // back on the default stream. Synchronize the caller's stream first so
+            // the result is visible: with a non-blocking stream (which does not
+            // implicitly order against the default stream) the readback would
+            // otherwise race and observe the zero-initialized buffer.
+            crate::gpu::cuda_sync::sync_cuda_stream(
+                stream,
+                "Phase validation norm stream synchronize failed (batch)",
+            )?;
             let host_norms = device
                 .dtoh_sync_copy(&phase_validation_buffer)
                 .map_err(|e| {
@@ -437,7 +446,7 @@ impl QuantumEncoder for PhaseEncoder {
 }
 
 impl PhaseEncoder {
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn encode_batch_async_pipeline(
         device: &Arc<CudaDevice>,
         batch_data: &[f64],
diff --git a/qdp/qdp-core/src/gpu/memory.rs b/qdp/qdp-core/src/gpu/memory.rs
index f68461a6e2..20afd6c97e 100644
--- a/qdp/qdp-core/src/gpu/memory.rs
+++ b/qdp/qdp-core/src/gpu/memory.rs
@@ -19,13 +19,13 @@
 #![allow(unused_unsafe)]
 
 use crate::error::{MahoutError, Result};
-use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr};
+use crate::gpu_rt::{CudaDevice, CudaSlice, DevicePtr};
 use qdp_kernels::{CuComplex, CuDoubleComplex};
 use std::ffi::c_void;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use std::sync::Arc;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::error::cuda_error_to_string;
 
 /// Precision of the GPU state vector.
@@ -42,15 +42,23 @@ pub enum GpuDeviceType {
     Rocm,
 }
 
-#[cfg(target_os = "linux")]
+/// The device type of the backend this build targets. Drives the DLPack
+/// `device_type` so exported tensors are tagged kDLCUDA on the NVIDIA build and
+/// kDLROCM on the HIP build (a ROCm PyTorch's `from_dlpack` rejects a CUDA tag).
+#[cfg(not(feature = "hip"))]
+pub(crate) const NATIVE_GPU_DEVICE_TYPE: GpuDeviceType = GpuDeviceType::Cuda;
+#[cfg(feature = "hip")]
+pub(crate) const NATIVE_GPU_DEVICE_TYPE: GpuDeviceType = GpuDeviceType::Rocm;
+
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::cuda_ffi::{cudaFreeHost, cudaHostAlloc, cudaMemGetInfo};
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn bytes_to_mib(bytes: usize) -> f64 {
     bytes as f64 / (1024.0 * 1024.0)
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn query_cuda_mem_info() -> Result<(usize, usize)> {
     unsafe {
         let mut free_bytes: usize = 0;
@@ -72,7 +80,7 @@ fn query_cuda_mem_info() -> Result<(usize, usize)> {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn build_oom_message(
     context: &str,
     requested_bytes: usize,
@@ -96,7 +104,7 @@ fn build_oom_message(
 ///
 /// Returns a MemoryAllocation error with a helpful message when the request
 /// exceeds the currently reported free memory.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub(crate) fn ensure_device_memory_available(
     requested_bytes: usize,
     context: &str,
@@ -118,7 +126,7 @@ pub(crate) fn ensure_device_memory_available(
 }
 
 /// Wraps CUDA allocation errors with an OOM-aware MahoutError.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub(crate) fn map_allocation_error(
     requested_bytes: usize,
     context: &str,
@@ -231,7 +239,7 @@ impl GpuStateVector {
     /// Create GPU state vector for n qubits with the given precision.
     /// Allocates 2^n complex numbers (Float32 = CuComplex, Float64 = CuDoubleComplex).
     /// Default for most callers: use `Precision::Float64`.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub fn new(_device: &Arc<CudaDevice>, qubits: usize, precision: Precision) -> Result<Self> {
         let _size_elements: usize = 1usize << qubits;
 
@@ -299,11 +307,11 @@ impl GpuStateVector {
             size_elements: _size_elements,
             num_samples: None,
             device_id: _device.ordinal(),
-            device_type: GpuDeviceType::Cuda,
+            device_type: NATIVE_GPU_DEVICE_TYPE,
         })
     }
 
-    #[cfg(not(target_os = "linux"))]
+    #[cfg(not(qdp_gpu_platform))]
     pub fn new(_device: &Arc<CudaDevice>, _qubits: usize, _precision: Precision) -> Result<Self> {
         Err(MahoutError::Cuda(
             "CUDA is only available on Linux. This build does not support GPU operations."
@@ -360,7 +368,7 @@ impl GpuStateVector {
             ))
         })?;
 
-        #[cfg(target_os = "linux")]
+        #[cfg(qdp_gpu_platform)]
         {
             let buffer = match precision {
                 Precision::Float32 => {
@@ -411,11 +419,11 @@ impl GpuStateVector {
                 size_elements: total_elements,
                 num_samples: Some(num_samples),
                 device_id: _device.ordinal(),
-                device_type: GpuDeviceType::Cuda,
+                device_type: NATIVE_GPU_DEVICE_TYPE,
             })
         }
 
-        #[cfg(not(target_os = "linux"))]
+        #[cfg(not(qdp_gpu_platform))]
         {
             Err(MahoutError::Cuda(
                 "CUDA is only available on Linux. This build does not support GPU operations."
@@ -434,7 +442,7 @@ impl GpuStateVector {
 
         match (self.precision(), target) {
             (Precision::Float32, Precision::Float64) => {
-                #[cfg(target_os = "linux")]
+                #[cfg(qdp_gpu_platform)]
                 {
                     let requested_bytes = self
                         .size_elements
@@ -502,7 +510,7 @@ impl GpuStateVector {
                     })
                 }
 
-                #[cfg(not(target_os = "linux"))]
+                #[cfg(not(qdp_gpu_platform))]
                 {
                     Err(MahoutError::Cuda(
                         "Precision conversion requires CUDA (Linux)".to_string(),
@@ -510,7 +518,7 @@ impl GpuStateVector {
                 }
             }
             (Precision::Float64, Precision::Float32) => {
-                #[cfg(target_os = "linux")]
+                #[cfg(qdp_gpu_platform)]
                 {
                     let requested_bytes = self
                         .size_elements
@@ -578,7 +586,7 @@ impl GpuStateVector {
                     })
                 }
 
-                #[cfg(not(target_os = "linux"))]
+                #[cfg(not(qdp_gpu_platform))]
                 {
                     Err(MahoutError::Cuda(
                         "Precision conversion requires CUDA (Linux)".to_string(),
@@ -597,13 +605,13 @@ impl GpuStateVector {
 /// Pinned Host Memory Buffer (owned allocation).
 ///
 /// Allocates page-locked memory to maximize H2D throughput in streaming IO paths.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub struct PinnedHostBuffer<T: Copy = f64> {
     ptr: *mut T,
     size_elements: usize,
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> PinnedHostBuffer<T> {
     /// Allocate pinned memory holding `elements` values of type `T`.
     pub fn new(elements: usize) -> Result<Self> {
@@ -658,7 +666,7 @@ impl<T: Copy> PinnedHostBuffer<T> {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl<T: Copy> Drop for PinnedHostBuffer<T> {
     fn drop(&mut self) {
         unsafe {
@@ -675,8 +683,8 @@ impl<T: Copy> Drop for PinnedHostBuffer<T> {
 }
 
 // Safety: Pinned memory is accessible from any thread
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 unsafe impl<T: Copy + Send> Send for PinnedHostBuffer<T> {}
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 unsafe impl<T: Copy + Sync> Sync for PinnedHostBuffer<T> {}
diff --git a/qdp/qdp-core/src/gpu/metrics.rs b/qdp/qdp-core/src/gpu/metrics.rs
index a7ce54e312..9482a5c0e9 100644
--- a/qdp/qdp-core/src/gpu/metrics.rs
+++ b/qdp/qdp-core/src/gpu/metrics.rs
@@ -21,12 +21,12 @@
 //! to the host and run on the CPU to produce a single scalar per sample.
 //! They are intended for **testing and validation**, not the hot path.
 
-#[cfg(target_os = "linux")]
-use cudarc::driver::CudaDevice;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
+use crate::gpu_rt::CudaDevice;
+#[cfg(qdp_gpu_platform)]
 use std::sync::Arc;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use qdp_kernels::{CuComplex, CuDoubleComplex};
 
 use crate::error::{MahoutError, Result};
@@ -152,7 +152,7 @@ pub fn trace_distance_cross_precision(state_f32: &[f32], state_f64: &[f64]) -> R
 /// Download f64 complex GPU data to host as interleaved (re, im) f64 vec.
 ///
 /// `gpu_ptr` must point to `num_elements` `CuDoubleComplex` values on device.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub fn download_complex_f64(
     device: &Arc<CudaDevice>,
     gpu_ptr: *const CuDoubleComplex,
@@ -168,15 +168,17 @@ pub fn download_complex_f64(
     let mut host_buf = vec![0.0_f64; num_elements * 2]; // interleaved re, im
 
     unsafe {
-        let ret = cudarc::driver::sys::lib().cuMemcpyDtoH_v2(
+        let ret = crate::gpu::cuda_ffi::cudaMemcpy(
             host_buf.as_mut_ptr() as *mut _,
-            gpu_ptr as u64,
+            gpu_ptr as *const _,
             byte_count,
+            crate::gpu::cuda_ffi::CUDA_MEMCPY_DEVICE_TO_HOST,
         );
-        if ret != cudarc::driver::sys::CUresult::CUDA_SUCCESS {
+        if ret != crate::gpu::cuda_ffi::CUDA_SUCCESS {
             return Err(MahoutError::Cuda(format!(
-                "cuMemcpyDtoH failed during f64 download: {:?}",
-                ret
+                "device-to-host copy failed during f64 download: {} ({})",
+                ret,
+                crate::error::cuda_error_to_string(ret)
             )));
         }
     }
@@ -185,7 +187,7 @@ pub fn download_complex_f64(
 }
 
 /// Download f32 complex GPU data to host as interleaved (re, im) f32 vec.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub fn download_complex_f32(
     device: &Arc<CudaDevice>,
     gpu_ptr: *const CuComplex,
@@ -201,15 +203,17 @@ pub fn download_complex_f32(
     let mut host_buf = vec![0.0_f32; num_elements * 2];
 
     unsafe {
-        let ret = cudarc::driver::sys::lib().cuMemcpyDtoH_v2(
+        let ret = crate::gpu::cuda_ffi::cudaMemcpy(
             host_buf.as_mut_ptr() as *mut _,
-            gpu_ptr as u64,
+            gpu_ptr as *const _,
             byte_count,
+            crate::gpu::cuda_ffi::CUDA_MEMCPY_DEVICE_TO_HOST,
         );
-        if ret != cudarc::driver::sys::CUresult::CUDA_SUCCESS {
+        if ret != crate::gpu::cuda_ffi::CUDA_SUCCESS {
             return Err(MahoutError::Cuda(format!(
-                "cuMemcpyDtoH failed during f32 download: {:?}",
-                ret
+                "device-to-host copy failed during f32 download: {} ({})",
+                ret,
+                crate::error::cuda_error_to_string(ret)
             )));
         }
     }
diff --git a/qdp/qdp-core/src/gpu/mod.rs b/qdp/qdp-core/src/gpu/mod.rs
index 75c987c4f6..6fb62757d3 100644
--- a/qdp/qdp-core/src/gpu/mod.rs
+++ b/qdp/qdp-core/src/gpu/mod.rs
@@ -14,9 +14,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub mod buffer_pool;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub(crate) mod cuda_sync;
 pub mod encodings;
 pub mod memory;
@@ -24,22 +24,22 @@ pub mod memory;
 /// `tests/` can use them; not part of the supported runtime API.
 #[doc(hidden)]
 pub mod metrics;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub mod overlap_tracker;
 pub mod pipeline;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub mod pool_metrics;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub(crate) mod validation;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub(crate) mod cuda_ffi;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub use buffer_pool::{PinnedBufferHandle, PinnedBufferPool};
 pub use encodings::{AmplitudeEncoder, AngleEncoder, BasisEncoder, QuantumEncoder};
 pub use memory::GpuStateVector;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[doc(hidden)]
 pub use metrics::{download_complex_f32, download_complex_f64};
 #[doc(hidden)]
@@ -49,9 +49,9 @@ pub use metrics::{
 };
 pub use pipeline::run_dual_stream_pipeline;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub use overlap_tracker::OverlapTracker;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub use pipeline::PipelineContext;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub use pool_metrics::{PoolMetrics, PoolUtilizationReport};
diff --git a/qdp/qdp-core/src/gpu/overlap_tracker.rs b/qdp/qdp-core/src/gpu/overlap_tracker.rs
index c18dcfdfa3..78bcfe9be6 100644
--- a/qdp/qdp-core/src/gpu/overlap_tracker.rs
+++ b/qdp/qdp-core/src/gpu/overlap_tracker.rs
@@ -24,7 +24,7 @@ use crate::gpu::cuda_ffi::{
     CUDA_EVENT_DEFAULT, CUDA_SUCCESS, cudaEventCreateWithFlags, cudaEventDestroy,
     cudaEventElapsedTime, cudaEventRecord, cudaEventSynchronize,
 };
-use cudarc::driver::safe::CudaStream;
+use crate::gpu_rt::CudaStream;
 use std::ffi::c_void;
 
 /// Tracks overlap between H2D copy and compute operations using CUDA events.
diff --git a/qdp/qdp-core/src/gpu/pipeline.rs b/qdp/qdp-core/src/gpu/pipeline.rs
index 6d31cbdacd..15f0067f1e 100644
--- a/qdp/qdp-core/src/gpu/pipeline.rs
+++ b/qdp/qdp-core/src/gpu/pipeline.rs
@@ -24,34 +24,34 @@
 #![allow(unused_unsafe)]
 
 use crate::error::{MahoutError, Result};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::buffer_pool::{PinnedBufferHandle, PinnedBufferPool};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::cuda_ffi::{
     CUDA_EVENT_DISABLE_TIMING, CUDA_MEMCPY_HOST_TO_DEVICE, cudaEventCreateWithFlags,
     cudaEventDestroy, cudaEventRecord, cudaMemcpyAsync, cudaStreamWaitEvent,
 };
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::cuda_sync::sync_cuda_stream;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::memory::{ensure_device_memory_available, map_allocation_error};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::overlap_tracker::OverlapTracker;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::gpu::pool_metrics::PoolMetrics;
-use cudarc::driver::{CudaDevice, CudaSlice, DevicePtr, DeviceRepr, safe::CudaStream};
+use crate::gpu_rt::{CudaDevice, CudaSlice, CudaStream, DevicePtr, DeviceRepr};
 use std::ffi::c_void;
 use std::sync::Arc;
 
 /// Dual-stream context coordinating copy/compute with an event.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub struct PipelineContext {
     pub stream_compute: CudaStream,
     pub stream_copy: CudaStream,
     events_copy_done: Vec<*mut c_void>,
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn validate_event_slot(events: &[*mut c_void], slot: usize) -> Result<()> {
     if slot >= events.len() {
         return Err(MahoutError::InvalidInput(format!(
@@ -63,7 +63,7 @@ fn validate_event_slot(events: &[*mut c_void], slot: usize) -> Result<()> {
     Ok(())
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl PipelineContext {
     pub fn new(device: &Arc<CudaDevice>, event_slots: usize) -> Result<Self> {
         let stream_compute = device
@@ -188,7 +188,7 @@ impl PipelineContext {
     }
 }
 
-#[cfg(all(test, target_os = "linux"))]
+#[cfg(all(test, qdp_gpu_platform))]
 mod tests {
     use super::validate_event_slot;
 
@@ -207,7 +207,7 @@ mod tests {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 impl Drop for PipelineContext {
     fn drop(&mut self) {
         unsafe {
@@ -249,7 +249,7 @@ impl Drop for PipelineContext {
 ///     Ok(())
 /// })?;
 /// ```
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub fn run_dual_stream_pipeline<F>(
     device: &Arc<CudaDevice>,
     host_data: &[f64],
@@ -273,7 +273,7 @@ where
 ///
 /// `align_elements` must evenly divide the host data length and ensures chunks do not
 /// split logical records (e.g., per-sample data in batch encoding).
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(clippy::manual_is_multiple_of)]
 pub fn run_dual_stream_pipeline_aligned<F>(
     device: &Arc<CudaDevice>,
@@ -293,7 +293,7 @@ where
 }
 
 /// f32 variant of `run_dual_stream_pipeline_aligned`.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(clippy::manual_is_multiple_of)]
 pub fn run_dual_stream_pipeline_aligned_f32<F>(
     device: &Arc<CudaDevice>,
@@ -312,7 +312,7 @@ where
     )
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(clippy::manual_is_multiple_of)]
 fn run_dual_stream_pipeline_aligned_typed<T, F>(
     device: &Arc<CudaDevice>,
@@ -354,7 +354,7 @@ where
     )
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn run_dual_stream_pipeline_with_chunk_size<T, F>(
     device: &Arc<CudaDevice>,
     host_data: &[T],
diff --git a/qdp/qdp-core/src/gpu/validation.rs b/qdp/qdp-core/src/gpu/validation.rs
index d7968599a8..ba56fdbbdf 100644
--- a/qdp/qdp-core/src/gpu/validation.rs
+++ b/qdp/qdp-core/src/gpu/validation.rs
@@ -23,7 +23,7 @@
 #![allow(unused_unsafe)]
 
 use crate::error::{MahoutError, Result, cuda_error_to_string};
-use cudarc::driver::{CudaDevice, CudaSlice, DevicePtrMut};
+use crate::gpu_rt::{CudaDevice, CudaSlice, DevicePtrMut};
 use std::ffi::c_void;
 use std::sync::Arc;
 
diff --git a/qdp/qdp-core/src/gpu_rt.rs b/qdp/qdp-core/src/gpu_rt.rs
new file mode 100644
index 0000000000..7f7f1048aa
--- /dev/null
+++ b/qdp/qdp-core/src/gpu_rt.rs
@@ -0,0 +1,27 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Single import point for the device runtime types, vendor-selected at
+//! compile time. `cudarc` has no ROCm backend, so on the `hip` build these
+//! names resolve to the HIP shim in `qdp_kernels::device` instead of cudarc.
+//! Both expose the same type names and method signatures, so call sites and
+//! integration tests use `crate::gpu_rt::{...}` (or `qdp_core::gpu_rt::{...}`)
+//! and compile unchanged on either vendor.
+
+pub use qdp_kernels::device::{
+    CudaDevice, CudaSlice, CudaStream, DevicePtr, DevicePtrMut, DeviceRepr, DeviceSlice,
+    ValidAsZeroBits,
+};
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 822fba96e1..4cfb4f079e 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -17,12 +17,18 @@
 // Allow unused_unsafe: CUDA FFI and kernel functions are unsafe in CUDA builds but safe stubs in no-CUDA builds.
 // The compiler can't statically determine which path is taken.
 #![allow(unused_unsafe)]
+// The `stream.stream as *mut c_void` casts at kernel-launch call sites are a
+// real conversion on the CUDA backend (cudarc's CUstream is a distinct pointer
+// type) but a no-op on HIP, where the stream field is already *mut c_void. Keep
+// the cast for backend-agnostic call sites and silence the HIP-only redundancy.
+#![cfg_attr(feature = "hip", allow(clippy::unnecessary_cast))]
 
 pub mod dlpack;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 mod encoding;
 pub mod error;
 pub mod gpu;
+pub mod gpu_rt;
 pub mod io;
 mod platform;
 pub mod preprocessing;
@@ -41,10 +47,10 @@ pub use reader::{FloatElem, NullHandling, handle_float32_nulls, handle_float64_n
 pub use types::{Dtype, Encoding};
 
 // Throughput/latency pipeline runner: single path using QdpEngine and encode_batch in Rust.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 mod pipeline_runner;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub use pipeline_runner::{
     PipelineConfig, PipelineIterator, PipelineRunResult, run_latency_pipeline,
     run_throughput_pipeline,
@@ -54,9 +60,9 @@ use std::ffi::c_void;
 use std::sync::Arc;
 
 use crate::dlpack::DLManagedTensor;
-use cudarc::driver::CudaDevice;
+use crate::gpu_rt::CudaDevice;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn validate_cuda_input_ptr(device: &CudaDevice, ptr: *const c_void) -> Result<()> {
     use crate::gpu::cuda_ffi::{
         CUDA_MEMORY_TYPE_DEVICE, CUDA_MEMORY_TYPE_MANAGED, CudaPointerAttributes,
@@ -179,7 +185,7 @@ impl QdpEngine {
 
     /// Block until all GPU work on the default stream has completed.
     /// Used by the generic pipeline and other callers that need to sync before timing.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub fn synchronize(&self) -> Result<()> {
         self.device
             .synchronize()
@@ -289,7 +295,7 @@ impl QdpEngine {
     /// * `host_data` - 1D input data (e.g. single sample for amplitude)
     /// * `num_qubits` - Number of qubits
     /// * `encoding_method` - Strategy (currently only "amplitude" supported for this path)
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub fn run_dual_stream_encode(
         &self,
         host_data: &[f64],
@@ -497,7 +503,7 @@ impl QdpEngine {
     /// - Point to valid GPU memory on the same device as the engine
     /// - Contain at least `input_len` elements of the expected dtype
     /// - Remain valid for the duration of this call
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_from_gpu_ptr(
         &self,
         input_d: *const std::ffi::c_void,
@@ -524,7 +530,7 @@ impl QdpEngine {
     /// # Safety
     /// Same as [`encode_from_gpu_ptr`](Self::encode_from_gpu_ptr). Additionally, `stream` must
     /// be a valid CUDA stream on the same device as the engine, or null.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_from_gpu_ptr_with_stream(
         &self,
         input_d: *const std::ffi::c_void,
@@ -575,7 +581,7 @@ impl QdpEngine {
     /// - Point to valid GPU memory on the same device as the engine
     /// - Contain at least `input_len` f32 elements
     /// - Remain valid for the duration of this call
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_from_gpu_ptr_f32(
         &self,
         input_d: *const f32,
@@ -600,7 +606,7 @@ impl QdpEngine {
     /// # Safety
     /// In addition to the `encode_from_gpu_ptr_f32` requirements, the stream pointer
     /// must remain valid for the duration of this call.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
         &self,
         input_d: *const f32,
@@ -640,7 +646,7 @@ impl QdpEngine {
     /// - Point to valid GPU memory on the same device as the engine
     /// - Contain at least `input_len` f32 elements
     /// - Remain valid for the duration of this call
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_angle_from_gpu_ptr_f32(
         &self,
         input_d: *const f32,
@@ -662,7 +668,7 @@ impl QdpEngine {
     /// # Safety
     /// In addition to the `encode_angle_from_gpu_ptr_f32` requirements, the stream pointer
     /// must remain valid for the duration of this call.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_angle_from_gpu_ptr_f32_with_stream(
         &self,
         input_d: *const f32,
@@ -702,7 +708,7 @@ impl QdpEngine {
     /// - Point to valid GPU memory on the same device as the engine
     /// - Contain at least `num_samples * sample_size` f32 elements
     /// - Remain valid for the duration of this call
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_batch_from_gpu_ptr_f32(
         &self,
         input_batch_d: *const f32,
@@ -726,7 +732,7 @@ impl QdpEngine {
     /// # Safety
     /// In addition to the `encode_batch_from_gpu_ptr_f32` requirements, the stream pointer
     /// must remain valid for the duration of this call.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
         &self,
         input_batch_d: *const f32,
@@ -774,7 +780,7 @@ impl QdpEngine {
     /// - Point to valid GPU memory on the same device as the engine
     /// - Contain at least `num_samples * sample_size` f32 elements
     /// - Remain valid for the duration of this call
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_angle_batch_from_gpu_ptr_f32(
         &self,
         input_batch_d: *const f32,
@@ -798,7 +804,7 @@ impl QdpEngine {
     /// # Safety
     /// In addition to the `encode_angle_batch_from_gpu_ptr_f32` requirements, the stream pointer
     /// must remain valid for the duration of this call.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_angle_batch_from_gpu_ptr_f32_with_stream(
         &self,
         input_batch_d: *const f32,
@@ -845,7 +851,7 @@ impl QdpEngine {
     /// The input pointer must:
     /// - Point to one valid f32 in GPU memory on the same device as the engine
     /// - Remain valid for the duration of this call
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_basis_from_gpu_ptr_f32(
         &self,
         input_d: *const f32,
@@ -866,7 +872,7 @@ impl QdpEngine {
     /// # Safety
     /// In addition to the `encode_basis_from_gpu_ptr_f32` requirements, the
     /// stream pointer must remain valid for the duration of this call.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_basis_from_gpu_ptr_f32_with_stream(
         &self,
         input_d: *const f32,
@@ -903,7 +909,7 @@ impl QdpEngine {
     /// - Point to valid GPU memory on the same device as the engine
     /// - Contain at least `num_samples` f32 elements
     /// - Remain valid for the duration of this call
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_basis_batch_from_gpu_ptr_f32(
         &self,
         input_batch_d: *const f32,
@@ -928,7 +934,7 @@ impl QdpEngine {
     /// # Safety
     /// In addition to the `encode_basis_batch_from_gpu_ptr_f32` requirements,
     /// the stream pointer must remain valid for the duration of this call.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_basis_batch_from_gpu_ptr_f32_with_stream(
         &self,
         input_batch_d: *const f32,
@@ -985,7 +991,7 @@ impl QdpEngine {
     /// - Point to valid GPU memory on the same device as the engine
     /// - Contain at least `num_samples * sample_size` elements of the expected dtype
     /// - Remain valid for the duration of this call
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_batch_from_gpu_ptr(
         &self,
         input_batch_d: *const std::ffi::c_void,
@@ -1014,7 +1020,7 @@ impl QdpEngine {
     /// # Safety
     /// Same as [`encode_batch_from_gpu_ptr`](Self::encode_batch_from_gpu_ptr). Additionally,
     /// `stream` must be a valid CUDA stream on the same device as the engine, or null.
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     pub unsafe fn encode_batch_from_gpu_ptr_with_stream(
         &self,
         input_batch_d: *const std::ffi::c_void,
diff --git a/qdp/qdp-core/src/pipeline_runner.rs b/qdp/qdp-core/src/pipeline_runner.rs
index 71b95d5d15..ef8e130a7e 100644
--- a/qdp/qdp-core/src/pipeline_runner.rs
+++ b/qdp/qdp-core/src/pipeline_runner.rs
@@ -836,7 +836,7 @@ pub fn run_throughput_pipeline(config: &PipelineConfig) -> Result<PipelineRunRes
 
     let _ = producer_handle.join();
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     engine.synchronize()?;
 
     let duration_sec = start.elapsed().as_secs_f64().max(1e-9);
diff --git a/qdp/qdp-core/src/platform/mod.rs b/qdp/qdp-core/src/platform/mod.rs
index 85c123571a..53c949f632 100644
--- a/qdp/qdp-core/src/platform/mod.rs
+++ b/qdp/qdp-core/src/platform/mod.rs
@@ -14,18 +14,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#[cfg(not(target_os = "linux"))]
+#[cfg(not(qdp_gpu_platform))]
 mod fallback;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 mod linux;
-#[cfg(not(any(target_os = "linux", target_os = "windows")))]
+#[cfg(not(any(qdp_gpu_platform, target_os = "windows")))]
 mod other;
-#[cfg(target_os = "windows")]
+// Windows non-GPU stub: used only on Windows without the hip feature.
+// When qdp_gpu_platform is set (Windows+hip), the linux module is used instead.
+#[cfg(all(target_os = "windows", not(qdp_gpu_platform)))]
 mod windows;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub(crate) use linux::encode_from_parquet;
-#[cfg(not(any(target_os = "linux", target_os = "windows")))]
+#[cfg(not(any(qdp_gpu_platform, target_os = "windows")))]
 pub(crate) use other::encode_from_parquet;
-#[cfg(target_os = "windows")]
+#[cfg(all(target_os = "windows", not(qdp_gpu_platform)))]
 pub(crate) use windows::encode_from_parquet;
diff --git a/qdp/qdp-core/tests/common/mod.rs b/qdp/qdp-core/tests/common/mod.rs
index 7114ad613a..9580de5fb5 100644
--- a/qdp/qdp-core/tests/common/mod.rs
+++ b/qdp/qdp-core/tests/common/mod.rs
@@ -14,7 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use std::sync::Arc;
 
 use arrow::array::{FixedSizeListArray, Float64Array};
@@ -23,11 +23,11 @@ use arrow::record_batch::RecordBatch;
 use parquet::arrow::ArrowWriter;
 use parquet::file::properties::WriterProperties;
 
-#[cfg(target_os = "linux")]
-use cudarc::driver::{CudaDevice, CudaSlice};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use qdp_core::dlpack::DLManagedTensor;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
+use qdp_core::gpu_rt::{CudaDevice, CudaSlice};
+#[cfg(qdp_gpu_platform)]
 use qdp_core::{Precision, QdpEngine};
 
 /// Creates normalized test data (f64)
@@ -82,28 +82,28 @@ pub fn write_fixed_size_list_parquet(path: &str, data: &[f64], sample_size: usiz
 }
 
 /// Returns a CUDA device handle, or `None` when CUDA is unavailable for the test environment.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(dead_code)]
 pub fn cuda_device() -> Option<Arc<CudaDevice>> {
     CudaDevice::new(0).ok()
 }
 
 /// Returns a QDP engine, or `None` when GPU-backed engine initialization is unavailable.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(dead_code)]
 pub fn qdp_engine() -> Option<QdpEngine> {
     QdpEngine::new(0).ok()
 }
 
 /// Returns a QDP engine with the requested precision, or `None` when unavailable.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(dead_code)]
 pub fn qdp_engine_with_precision(precision: Precision) -> Option<QdpEngine> {
     QdpEngine::new_with_precision(0, precision).ok()
 }
 
 /// Copies f64 host data to the default CUDA device, or returns `None` when unavailable.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(dead_code)]
 pub fn copy_f64_to_device(data: &[f64]) -> Option<(Arc<CudaDevice>, CudaSlice<f64>)> {
     let device = cuda_device()?;
@@ -112,7 +112,7 @@ pub fn copy_f64_to_device(data: &[f64]) -> Option<(Arc<CudaDevice>, CudaSlice<f6
 }
 
 /// Copies f32 host data to the default CUDA device, or returns `None` when unavailable.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(dead_code)]
 pub fn copy_f32_to_device(data: &[f32]) -> Option<(Arc<CudaDevice>, CudaSlice<f32>)> {
     let device = cuda_device()?;
@@ -121,7 +121,7 @@ pub fn copy_f32_to_device(data: &[f32]) -> Option<(Arc<CudaDevice>, CudaSlice<f3
 }
 
 /// Copies usize host data to the default CUDA device, or returns `None` when unavailable.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(dead_code)]
 pub fn copy_usize_to_device(data: &[usize]) -> Option<(Arc<CudaDevice>, CudaSlice<usize>)> {
     let device = cuda_device()?;
@@ -130,7 +130,7 @@ pub fn copy_usize_to_device(data: &[usize]) -> Option<(Arc<CudaDevice>, CudaSlic
 }
 
 /// Asserts a DLPack tensor is 2D with the expected shape.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(dead_code)]
 pub unsafe fn assert_dlpack_shape_2d(dlpack_ptr: *mut DLManagedTensor, dim0: i64, dim1: i64) {
     assert!(!dlpack_ptr.is_null(), "DLPack pointer should not be null");
@@ -144,7 +144,7 @@ pub unsafe fn assert_dlpack_shape_2d(dlpack_ptr: *mut DLManagedTensor, dim0: i64
 }
 
 /// Asserts a DLPack tensor is 2D with the expected shape and then frees it via its deleter.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(dead_code)]
 pub unsafe fn assert_dlpack_shape_2d_and_delete(
     dlpack_ptr: *mut DLManagedTensor,
@@ -157,7 +157,7 @@ pub unsafe fn assert_dlpack_shape_2d_and_delete(
 }
 
 /// Takes the DLPack deleter from the managed tensor and invokes it exactly once.
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[allow(dead_code)]
 pub unsafe fn take_deleter_and_delete(dlpack_ptr: *mut DLManagedTensor) {
     assert!(!dlpack_ptr.is_null(), "DLPack pointer should not be null");
diff --git a/qdp/qdp-core/tests/gpu_angle_encoding.rs b/qdp/qdp-core/tests/gpu_angle_encoding.rs
index 874e948c9e..4c3d48cc44 100644
--- a/qdp/qdp-core/tests/gpu_angle_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_angle_encoding.rs
@@ -20,7 +20,7 @@
 // tests inside encoding/angle.rs because they cannot be triggered via a Parquet
 // file.
 
-#![cfg(target_os = "linux")]
+#![cfg(qdp_gpu_platform)]
 
 use qdp_core::MahoutError;
 
diff --git a/qdp/qdp-core/tests/gpu_api_workflow.rs b/qdp/qdp-core/tests/gpu_api_workflow.rs
index 6c8e651669..47d18b79b7 100644
--- a/qdp/qdp-core/tests/gpu_api_workflow.rs
+++ b/qdp/qdp-core/tests/gpu_api_workflow.rs
@@ -16,16 +16,16 @@
 
 // API workflow tests: Engine initialization and encoding
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use qdp_core::MahoutError;
 use qdp_core::QdpEngine;
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use qdp_core::gpu::pipeline::run_dual_stream_pipeline_aligned;
 
 mod common;
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_engine_initialization() {
     println!("Testing QdpEngine initialization...");
 
@@ -46,7 +46,7 @@ fn test_engine_initialization() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encoding_workflow() {
     println!("Testing amplitude encoding workflow...");
 
@@ -72,7 +72,7 @@ fn test_amplitude_encoding_workflow() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encoding_async_pipeline() {
     println!("Testing amplitude encoding async pipeline path...");
 
@@ -98,7 +98,7 @@ fn test_amplitude_encoding_async_pipeline() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_angle_encoding_async_pipeline() {
     println!("Testing angle encoding async pipeline path...");
 
@@ -125,7 +125,7 @@ fn test_angle_encoding_async_pipeline() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_angle_async_alignment_error() {
     println!("Testing angle async pipeline alignment error...");
 
@@ -153,7 +153,7 @@ fn test_angle_async_alignment_error() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_batch_dlpack_2d_shape() {
     println!("Testing batch DLPack 2D shape...");
 
@@ -211,7 +211,7 @@ fn test_batch_dlpack_2d_shape() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_single_encode_dlpack_2d_shape() {
     println!("Testing single encode returns 2D shape...");
 
@@ -249,7 +249,7 @@ fn test_single_encode_dlpack_2d_shape() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_dlpack_device_id() {
     println!("Testing DLPack device_id propagation...");
 
@@ -275,12 +275,18 @@ fn test_dlpack_device_id() {
             "device_id should be 0 for device 0"
         );
 
-        // Verify device_type is CUDA (kDLCUDA = 2)
+        // Verify device_type matches the build's backend: kDLCUDA (2) on the
+        // NVIDIA build, kDLROCM (10) on the HIP build (a ROCm PyTorch's
+        // from_dlpack requires the ROCm tag).
         use qdp_core::dlpack::DLDeviceType;
-        match tensor.device.device_type {
-            DLDeviceType::kDLCUDA => println!("PASS: Device type is CUDA"),
-            _ => panic!("Expected CUDA device type"),
-        }
+        #[cfg(not(feature = "hip"))]
+        let expected = DLDeviceType::kDLCUDA;
+        #[cfg(feature = "hip")]
+        let expected = DLDeviceType::kDLROCM;
+        assert_eq!(
+            tensor.device.device_type, expected,
+            "DLPack device_type should match the build backend"
+        );
 
         println!(
             "PASS: DLPack device_id correctly set to {}",
diff --git a/qdp/qdp-core/tests/gpu_basis_encoding.rs b/qdp/qdp-core/tests/gpu_basis_encoding.rs
index e8e9244ce9..1dfb545477 100644
--- a/qdp/qdp-core/tests/gpu_basis_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_basis_encoding.rs
@@ -19,7 +19,7 @@
 // pipeline path.  The needs_staging_copy() unit test lives inside encoding/basis.rs
 // because BasisEncoder is pub(crate) and not accessible from here.
 
-#![cfg(target_os = "linux")]
+#![cfg(qdp_gpu_platform)]
 
 use qdp_core::MahoutError;
 
diff --git a/qdp/qdp-core/tests/gpu_dlpack.rs b/qdp/qdp-core/tests/gpu_dlpack.rs
index f84bcd38ef..439e3e5cfc 100644
--- a/qdp/qdp-core/tests/gpu_dlpack.rs
+++ b/qdp/qdp-core/tests/gpu_dlpack.rs
@@ -71,7 +71,7 @@ mod dlpack_tests {
     }
 
     #[test]
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn test_dlpack_single_shape_f32() {
         let Some(device) = common::cuda_device() else {
             return;
@@ -97,7 +97,7 @@ mod dlpack_tests {
     }
 
     #[test]
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn test_dlpack_batch_shape_f32() {
         let Some(device) = common::cuda_device() else {
             return;
@@ -142,7 +142,7 @@ mod dlpack_tests {
 
     /// synchronize_stream(CUDA_STREAM_LEGACY) syncs the legacy default stream (Linux + CUDA).
     #[test]
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     fn test_synchronize_stream_legacy() {
         if common::cuda_device().is_none() {
             return;
diff --git a/qdp/qdp-core/tests/gpu_fidelity.rs b/qdp/qdp-core/tests/gpu_fidelity.rs
index cd38a5c97c..8ef39482b1 100644
--- a/qdp/qdp-core/tests/gpu_fidelity.rs
+++ b/qdp/qdp-core/tests/gpu_fidelity.rs
@@ -17,7 +17,7 @@
 //! Tests for fidelity / trace-distance metrics and F32 vs F64 precision
 //! comparison across different qubit counts.
 
-#![cfg(target_os = "linux")]
+#![cfg(qdp_gpu_platform)]
 
 use approx::assert_relative_eq;
 use qdp_core::gpu::metrics::{
diff --git a/qdp/qdp-core/tests/gpu_iqp_encoding.rs b/qdp/qdp-core/tests/gpu_iqp_encoding.rs
index 4954ab5b38..c82e292dc1 100644
--- a/qdp/qdp-core/tests/gpu_iqp_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_iqp_encoding.rs
@@ -36,7 +36,7 @@ fn iqp_z_data_len(num_qubits: usize) -> usize {
 // =============================================================================
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_zero_qubits_rejected() {
     println!("Testing IQP zero qubits rejection...");
 
@@ -61,7 +61,7 @@ fn test_iqp_zero_qubits_rejected() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_max_qubits_exceeded() {
     println!("Testing IQP max qubits (>{MAX_QUBITS}) rejection...");
 
@@ -87,7 +87,7 @@ fn test_iqp_max_qubits_exceeded() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_wrong_data_length() {
     println!("Testing IQP wrong data length rejection...");
 
@@ -124,7 +124,7 @@ fn test_iqp_wrong_data_length() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_z_wrong_data_length() {
     println!("Testing IQP-Z wrong data length rejection...");
 
@@ -153,7 +153,7 @@ fn test_iqp_z_wrong_data_length() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_nan_value_rejected() {
     println!("Testing IQP NaN value rejection...");
 
@@ -181,7 +181,7 @@ fn test_iqp_nan_value_rejected() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_infinity_value_rejected() {
     println!("Testing IQP infinity value rejection...");
 
@@ -213,7 +213,7 @@ fn test_iqp_infinity_value_rejected() {
 // =============================================================================
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_full_encoding_workflow() {
     println!("Testing IQP full encoding workflow...");
 
@@ -259,7 +259,7 @@ fn test_iqp_full_encoding_workflow() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_z_encoding_workflow() {
     println!("Testing IQP-Z encoding workflow...");
 
@@ -304,7 +304,7 @@ fn test_iqp_z_encoding_workflow() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_single_qubit() {
     println!("Testing IQP single qubit encoding...");
 
@@ -344,7 +344,7 @@ fn test_iqp_single_qubit() {
 // =============================================================================
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_batch_encoding() {
     println!("Testing IQP batch encoding...");
 
@@ -394,7 +394,7 @@ fn test_iqp_batch_encoding() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_z_batch_encoding() {
     println!("Testing IQP-Z batch encoding...");
 
@@ -444,7 +444,7 @@ fn test_iqp_z_batch_encoding() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_batch_wrong_sample_size() {
     println!("Testing IQP batch wrong sample_size rejection...");
 
@@ -480,7 +480,7 @@ fn test_iqp_batch_wrong_sample_size() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_batch_data_length_mismatch() {
     println!("Testing IQP batch data length mismatch rejection...");
 
@@ -511,7 +511,7 @@ fn test_iqp_batch_data_length_mismatch() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_batch_nan_in_sample() {
     println!("Testing IQP batch NaN value rejection...");
 
@@ -571,7 +571,7 @@ fn test_iqp_data_length_calculations() {
 // =============================================================================
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_fwt_threshold_boundary() {
     println!("Testing IQP FWT threshold boundary (n=4, where FWT kicks in)...");
 
@@ -614,7 +614,7 @@ fn test_iqp_fwt_threshold_boundary() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_fwt_larger_qubit_counts() {
     println!("Testing IQP FWT with larger qubit counts (n=5,6,7,8)...");
 
@@ -660,7 +660,7 @@ fn test_iqp_fwt_larger_qubit_counts() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_z_fwt_correctness() {
     println!("Testing IQP-Z FWT correctness for various qubit counts...");
 
@@ -702,7 +702,7 @@ fn test_iqp_z_fwt_correctness() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_fwt_batch_various_sizes() {
     println!("Testing IQP FWT batch encoding with various qubit counts...");
 
@@ -752,7 +752,7 @@ fn test_iqp_fwt_batch_various_sizes() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_fwt_zero_parameters_identity() {
     println!("Testing IQP FWT with zero parameters produces |0⟩ state...");
 
@@ -795,7 +795,7 @@ fn test_iqp_fwt_zero_parameters_identity() {
 // =============================================================================
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_encoder_via_factory() {
     println!("Testing IQP encoder creation via Encoding::from_str_ci / encode...");
 
@@ -834,7 +834,7 @@ fn test_iqp_encoder_via_factory() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_iqp_z_encoder_via_factory() {
     println!("Testing IQP-Z encoder creation via encode...");
 
diff --git a/qdp/qdp-core/tests/gpu_memory_safety.rs b/qdp/qdp-core/tests/gpu_memory_safety.rs
index 33e937a37d..fa81972589 100644
--- a/qdp/qdp-core/tests/gpu_memory_safety.rs
+++ b/qdp/qdp-core/tests/gpu_memory_safety.rs
@@ -21,7 +21,7 @@ use qdp_core::Precision;
 mod common;
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_memory_pressure() {
     println!("Testing memory pressure (leak detection)");
     println!("Running 100 iterations of encode + free");
@@ -51,7 +51,7 @@ fn test_memory_pressure() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_multiple_concurrent_states() {
     println!("Testing multiple concurrent state vectors...");
 
@@ -81,7 +81,7 @@ fn test_multiple_concurrent_states() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_dlpack_tensor_metadata_default() {
     println!("Testing DLPack tensor metadata...");
 
@@ -131,7 +131,7 @@ fn test_dlpack_tensor_metadata_default() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_dlpack_tensor_metadata_f64() {
     println!("Testing DLPack tensor metadata...");
 
diff --git a/qdp/qdp-core/tests/gpu_norm_f32.rs b/qdp/qdp-core/tests/gpu_norm_f32.rs
index af1150881b..294d093bdd 100644
--- a/qdp/qdp-core/tests/gpu_norm_f32.rs
+++ b/qdp/qdp-core/tests/gpu_norm_f32.rs
@@ -18,11 +18,11 @@
 // Tests for GPU-side f32 L2 norm helper in AmplitudeEncoder.
 //
 
-#![cfg(target_os = "linux")]
+#![cfg(qdp_gpu_platform)]
 
 use approx::assert_relative_eq;
-use cudarc::driver::DevicePtr;
 use qdp_core::gpu::encodings::amplitude::AmplitudeEncoder;
+use qdp_core::gpu_rt::DevicePtr;
 
 mod common;
 
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index 48ba65f847..b5fed7c7fa 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -16,9 +16,13 @@
 
 // Unit and integration tests for encode_from_gpu_ptr and encode_batch_from_gpu_ptr.
 
-#![cfg(target_os = "linux")]
+#![cfg(qdp_gpu_platform)]
+// The `stream.stream as *mut c_void` cast is a real conversion on the CUDA
+// backend (cudarc's CUstream is a distinct pointer type) but a no-op on HIP,
+// where the stream field is already *mut c_void; silence the HIP-only redundancy.
+#![cfg_attr(feature = "hip", allow(clippy::unnecessary_cast))]
 
-use cudarc::driver::{DevicePtr, DeviceSlice};
+use qdp_core::gpu_rt::{DevicePtr, DeviceSlice};
 use qdp_core::{MahoutError, Precision, QdpEngine};
 use std::ffi::c_void;
 
diff --git a/qdp/qdp-core/tests/gpu_validation.rs b/qdp/qdp-core/tests/gpu_validation.rs
index 291f92dce8..4f0d1a879e 100644
--- a/qdp/qdp-core/tests/gpu_validation.rs
+++ b/qdp/qdp-core/tests/gpu_validation.rs
@@ -22,7 +22,7 @@ use qdp_core::gpu::encodings::MAX_QUBITS;
 mod common;
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_input_validation_invalid_strategy() {
     println!("Testing invalid strategy name rejection...");
 
@@ -48,7 +48,7 @@ fn test_input_validation_invalid_strategy() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_input_validation_qubit_mismatch() {
     println!("Testing qubit size validation...");
 
@@ -78,7 +78,7 @@ fn test_input_validation_qubit_mismatch() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_input_validation_zero_qubits() {
     println!("Testing zero qubits rejection...");
 
@@ -104,7 +104,7 @@ fn test_input_validation_zero_qubits() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_input_validation_max_qubits() {
     println!("Testing maximum qubit limit ({MAX_QUBITS})...");
 
@@ -130,7 +130,7 @@ fn test_input_validation_max_qubits() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_input_validation_batch_zero_samples() {
     println!("Testing zero num_samples rejection...");
 
@@ -155,7 +155,7 @@ fn test_input_validation_batch_zero_samples() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_empty_data() {
     println!("Testing empty data rejection...");
 
@@ -178,7 +178,7 @@ fn test_empty_data() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_zero_norm_data() {
     println!("Testing zero-norm data rejection...");
 
@@ -210,7 +210,7 @@ fn test_error_types() {
 }
 
 #[test]
-#[cfg(not(target_os = "linux"))]
+#[cfg(not(qdp_gpu_platform))]
 fn test_non_linux_graceful_failure() {
     let result = QdpEngine::new(0);
     assert!(result.is_err());
diff --git a/qdp/qdp-kernels/Cargo.toml b/qdp/qdp-kernels/Cargo.toml
index dcc7c0ec05..6da30e0737 100644
--- a/qdp/qdp-kernels/Cargo.toml
+++ b/qdp/qdp-kernels/Cargo.toml
@@ -4,11 +4,21 @@ version.workspace = true
 edition.workspace = true
 
 [dependencies]
-cudarc = { workspace = true }
+cudarc = { workspace = true, optional = true }
 
 [build-dependencies]
 cc = { workspace = true }
 
+[features]
+# `cuda` and `hip` are mutually exclusive; pick exactly one vendor backend.
+# If both end up enabled (e.g. via workspace feature unification), `hip` takes
+# precedence: kernels build for HIP and cudarc is compiled but unused.
+# Default build: NVIDIA CUDA via cudarc + nvcc-compiled kernels.
+default = ["cuda"]
+cuda = ["dep:cudarc"]
+# AMD build: hipcc-compiled kernels + the in-crate HIP device traits; no cudarc.
+hip = []
+
 [lib]
 name = "qdp_kernels"
 crate-type = ["rlib", "staticlib"]
diff --git a/qdp/qdp-kernels/build.rs b/qdp/qdp-kernels/build.rs
index def59d6935..ae867e6b06 100644
--- a/qdp/qdp-kernels/build.rs
+++ b/qdp/qdp-kernels/build.rs
@@ -26,6 +26,20 @@
 use std::env;
 use std::process::Command;
 
+const KERNEL_SOURCES: &[&str] = &[
+    "src/amplitude.cu",
+    "src/basis.cu",
+    "src/angle.cu",
+    "src/validation.cu",
+    "src/iqp.cu",
+    "src/phase.cu",
+];
+
+/// Default AMD GPU target used only when QDP_HIP_ARCH_LIST is unset. Never hardcode
+/// this as the sole arch in a way that overrides the env list: other AMD targets
+/// (gfx1100, gfx1151) must build the same source by setting QDP_HIP_ARCH_LIST alone.
+const DEFAULT_HIP_ARCH: &str = "gfx90a";
+
 const DEFAULT_CUBIN_ARCHES: &[&str] = &["75", "80", "86", "89", "90", "100", "120"];
 const DEFAULT_PTX_CANDIDATES: &[&str] = &["120", "100", "90", "89", "86", "80", "75"];
 const LEGACY_FALLBACK_ARCHES: &[&str] = &["75", "80", "86"];
@@ -154,9 +168,122 @@ fn apply_default_arch_targets(build: &mut cc::Build) {
     }
 }
 
+fn qdp_use_hip_env() -> bool {
+    env::var("QDP_USE_HIP")
+        .map(|v| v == "1" || v.eq_ignore_ascii_case("true") || v.eq_ignore_ascii_case("yes"))
+        .unwrap_or(false)
+}
+
+fn hip_requested() -> bool {
+    cfg!(feature = "hip") || qdp_use_hip_env()
+}
+
+/// Reject a kernel/host backend mismatch before it produces a broken binary.
+///
+/// QDP_USE_HIP=1 flips the KERNEL build to hipcc, while the HOST runtime is
+/// chosen by the `hip` Cargo feature (cudarc when off, the HIP shim when on).
+/// Setting QDP_USE_HIP=1 with the `hip` feature OFF would build AMD device code
+/// (hipcc) against the cudarc host backend, which fails or misbehaves at runtime;
+/// reject that loudly. The reverse (the `hip` feature on, QDP_USE_HIP unset) is
+/// NOT a mismatch: `hip_requested()` already builds the kernels for HIP whenever
+/// the feature is on, so the host and kernels agree -- no panic there.
+fn check_hip_consistency() {
+    let env_hip = qdp_use_hip_env();
+    let feature_hip = env::var("CARGO_FEATURE_HIP").is_ok();
+    if env_hip && !feature_hip {
+        panic!(
+            "QDP_USE_HIP is set but the `hip` Cargo feature is off: this would \
+             build AMD kernels (hipcc) against the cudarc host backend. \
+             Add `--features hip` (and `--no-default-features` to drop cudarc)."
+        );
+    }
+}
+
+/// Compile the kernels with hipcc for AMD GPUs.
+///
+/// Mirrors the CUDA branch in spirit (same six .cu sources, same `src/` include
+/// for kernel_config.h) but: targets are AMD `--offload-arch` values from
+/// QDP_HIP_ARCH_LIST (default gfx90a only when unset), the hip_compat/ shim dir
+/// is added to the include path so the sources' `<cuda_runtime.h>` /
+/// `<cuComplex.h>` / `<vector_types.h>` resolve to HIP equivalents, and the
+/// link library is amdhip64 instead of cudart. The CUDA path is untouched.
+fn build_hip() {
+    let hipcc = env::var("QDP_HIPCC").unwrap_or_else(|_| "hipcc".to_string());
+
+    // Degrade gracefully when the ROCm toolchain is absent, mirroring the
+    // CUDA branch's nvcc-not-found path. This lets `cargo check`/`clippy`
+    // (including `--all-features`, which turns the `hip` feature on) succeed in
+    // a ROCm-less CI runner: emit the qdp_no_cuda stub cfg and skip compilation
+    // instead of letting hipcc fail the build.
+    let has_hipcc = Command::new(&hipcc).arg("--version").output().is_ok();
+    if !has_hipcc {
+        println!("cargo:rustc-cfg=qdp_no_cuda");
+        println!(
+            "cargo:warning=ROCm/hipcc not found ('{hipcc}' not runnable). Skipping kernel compilation."
+        );
+        println!(
+            "cargo:warning=This is expected in environments without the ROCm toolkit installed."
+        );
+        println!(
+            "cargo:warning=The project will build against host stubs, but GPU functionality will not be available."
+        );
+        return;
+    }
+
+    let mut build = cc::Build::new();
+    build.compiler(&hipcc);
+    build.cpp(true);
+    // hip_compat/ first so its cuda_runtime.h / cuComplex.h / vector_types.h win;
+    // src/ for kernel_config.h and kernel_compat.h.
+    build.include("hip_compat");
+    build.include("src");
+    build.flag("-std=c++17");
+    build.flag("-x").flag("hip");
+
+    let arch_list = env::var("QDP_HIP_ARCH_LIST").unwrap_or_else(|_| DEFAULT_HIP_ARCH.to_string());
+    let mut saw_arch = false;
+    for entry in arch_list.split(',') {
+        let arch = entry.trim();
+        if arch.is_empty() {
+            continue;
+        }
+        build.flag(format!("--offload-arch={arch}"));
+        saw_arch = true;
+    }
+    if !saw_arch {
+        build.flag(format!("--offload-arch={DEFAULT_HIP_ARCH}"));
+    }
+
+    for src in KERNEL_SOURCES {
+        build.file(src);
+    }
+    build.compile("kernels");
+
+    // Link the HIP runtime. Honor an explicit ROCM_PATH; otherwise rely on the
+    // default loader search path (hipcc-built objects pull libamdhip64 there).
+    if let Ok(rocm) = env::var("ROCM_PATH") {
+        println!("cargo:rustc-link-search=native={rocm}/lib");
+    }
+    println!("cargo:rustc-link-lib=amdhip64");
+}
+
 fn main() {
     // Let rustc know about our build-script-defined cfg flags (avoids `unexpected_cfgs` warnings).
     println!("cargo::rustc-check-cfg=cfg(qdp_no_cuda)");
+    // Emit qdp_gpu_platform when building for a GPU-capable OS (Linux always;
+    // Windows when the hip feature is on via QDP_USE_HIP=1 / TheRock ROCm).
+    println!("cargo::rustc-check-cfg=cfg(qdp_gpu_platform)");
+
+    // Reject a kernel/host backend mismatch (QDP_USE_HIP vs the `hip` feature)
+    // before compiling anything.
+    check_hip_consistency();
+
+    let is_linux = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux");
+    let hip_feature = std::env::var("CARGO_FEATURE_HIP").is_ok();
+    let is_windows = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows");
+    if is_linux || (is_windows && hip_feature) {
+        println!("cargo::rustc-cfg=qdp_gpu_platform");
+    }
 
     // Tell Cargo to rerun this script if the kernel sources change
     println!("cargo:rerun-if-changed=src/amplitude.cu");
@@ -167,7 +294,21 @@ fn main() {
     println!("cargo:rerun-if-changed=src/phase.cu");
     println!("cargo:rerun-if-env-changed=QDP_NO_CUDA");
     println!("cargo:rerun-if-env-changed=QDP_CUDA_ARCH_LIST");
+    println!("cargo:rerun-if-env-changed=QDP_USE_HIP");
+    println!("cargo:rerun-if-env-changed=QDP_HIP_ARCH_LIST");
+    println!("cargo:rerun-if-env-changed=QDP_HIPCC");
     println!("cargo:rerun-if-changed=src/kernel_config.h");
+    println!("cargo:rerun-if-changed=src/kernel_compat.h");
+    println!("cargo:rerun-if-changed=hip_compat/cuda_runtime.h");
+    println!("cargo:rerun-if-changed=hip_compat/cuComplex.h");
+    println!("cargo:rerun-if-changed=hip_compat/vector_types.h");
+
+    // AMD/HIP build path: compile the same .cu sources with hipcc. Gated by the
+    // `hip` Cargo feature or QDP_USE_HIP=1; the CUDA path below is unchanged when off.
+    if hip_requested() {
+        build_hip();
+        return;
+    }
 
     // Check if CUDA is available by looking for nvcc
     let force_no_cuda = env::var("QDP_NO_CUDA")
diff --git a/qdp/qdp-kernels/hip_compat/cuComplex.h b/qdp/qdp-kernels/hip_compat/cuComplex.h
new file mode 100644
index 0000000000..0859771e4a
--- /dev/null
+++ b/qdp/qdp-kernels/hip_compat/cuComplex.h
@@ -0,0 +1,46 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// HIP forwarding shim for <cuComplex.h> (HIP build path only; see
+// cuda_runtime.h in this directory for how it is selected). hipcc does not
+// ship a <cuComplex.h>; HIP's <hip/hip_complex.h> provides the same complex
+// layout and helpers under hip* names. The aliases below let the .cu sources
+// keep their cuComplex / cuDoubleComplex / make_cu* / cuC* spellings unchanged.
+
+#pragma once
+#include <hip/hip_complex.h>
+
+typedef hipDoubleComplex cuDoubleComplex;
+typedef hipFloatComplex  cuComplex;
+
+#define make_cuDoubleComplex make_hipDoubleComplex
+#define make_cuComplex       make_hipFloatComplex
+
+// The kernels call cuCreal/cuCimag/cuCadd/cuCsub only on cuDoubleComplex, so
+// alias to HIP's double-precision helpers (hipC*), not the float (hipC*f) set.
+#define cuCreal cuCreal_double
+#define cuCimag cuCimag_double
+#define cuCadd  cuCadd_double
+#define cuCsub  cuCsub_double
+#define cuCmul  cuCmul_double
+#define cuConj  cuConj_double
+
+static __host__ __device__ inline double cuCreal_double(hipDoubleComplex z) { return hipCreal(z); }
+static __host__ __device__ inline double cuCimag_double(hipDoubleComplex z) { return hipCimag(z); }
+static __host__ __device__ inline hipDoubleComplex cuCadd_double(hipDoubleComplex a, hipDoubleComplex b) { return hipCadd(a, b); }
+static __host__ __device__ inline hipDoubleComplex cuCsub_double(hipDoubleComplex a, hipDoubleComplex b) { return hipCsub(a, b); }
+static __host__ __device__ inline hipDoubleComplex cuCmul_double(hipDoubleComplex a, hipDoubleComplex b) { return hipCmul(a, b); }
+static __host__ __device__ inline hipDoubleComplex cuConj_double(hipDoubleComplex z) { return hipConj(z); }
diff --git a/qdp/qdp-kernels/hip_compat/cuda_runtime.h b/qdp/qdp-kernels/hip_compat/cuda_runtime.h
new file mode 100644
index 0000000000..144f1deb24
--- /dev/null
+++ b/qdp/qdp-kernels/hip_compat/cuda_runtime.h
@@ -0,0 +1,45 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// HIP forwarding shim for <cuda_runtime.h>.
+//
+// This file exists ONLY on the HIP build path: qdp-kernels/build.rs adds the
+// hip_compat/ directory to the include search path exclusively when compiling
+// with hipcc, so a CUDA build never sees it and pulls the real toolkit header
+// instead. The .cu sources keep their original `#include <cuda_runtime.h>`
+// spelling; this header maps the small set of cuda* runtime symbols the
+// kernels reference to their hip* equivalents (HIP error codes match CUDA's
+// numerically for these codes).
+
+#pragma once
+#include <hip/hip_runtime.h>
+
+// MSVC <math.h> does not define POSIX math constants unless _USE_MATH_DEFINES
+// is set before the first system include. Provide the one the kernels use.
+#ifndef M_SQRT1_2
+#define M_SQRT1_2 0.7071067811865475244008443621
+#endif
+
+#define cudaError_t              hipError_t
+#define cudaSuccess              hipSuccess
+#define cudaErrorInvalidValue    hipErrorInvalidValue
+#define cudaStream_t             hipStream_t
+#define cudaGetLastError         hipGetLastError
+#define cudaGetDevice            hipGetDevice
+#define cudaDeviceGetAttribute   hipDeviceGetAttribute
+#define cudaDevAttrMaxGridDimX   hipDeviceAttributeMaxGridDimX
+#define cudaMemsetAsync          hipMemsetAsync
+#define cudaMalloc               hipMalloc
diff --git a/qdp/qdp-kernels/hip_compat/vector_types.h b/qdp/qdp-kernels/hip_compat/vector_types.h
new file mode 100644
index 0000000000..a48bbe8cd3
--- /dev/null
+++ b/qdp/qdp-kernels/hip_compat/vector_types.h
@@ -0,0 +1,23 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// HIP forwarding shim for <vector_types.h> (HIP build path only). HIP defines
+// double2 / float2 etc. via <hip/hip_runtime.h>, which is already pulled in by
+// the cuda_runtime.h shim in this directory, so this header only needs to
+// exist for the `#include <vector_types.h>` line to resolve.
+
+#pragma once
+#include <hip/hip_runtime.h>
diff --git a/qdp/qdp-kernels/src/amplitude.cu b/qdp/qdp-kernels/src/amplitude.cu
index 57fa4320cf..a944f2c72e 100644
--- a/qdp/qdp-kernels/src/amplitude.cu
+++ b/qdp/qdp-kernels/src/amplitude.cu
@@ -22,6 +22,7 @@
 #include <math.h>
 #include <stdint.h>
 #include "kernel_config.h"
+#include "kernel_compat.h"
 
 __global__ void amplitude_encode_kernel(
     const double* __restrict__ input,
@@ -99,7 +100,7 @@ __global__ void amplitude_encode_kernel_f32(
 // Warp-level reduction for sum using shuffle instructions
 __device__ __forceinline__ double warp_reduce_sum(double val) {
     for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
-        val += __shfl_down_sync(0xffffffff, val, offset);
+        val += __shfl_down_sync(QDP_FULL_WARP_MASK, val, offset);
     }
     return val;
 }
@@ -107,7 +108,7 @@ __device__ __forceinline__ double warp_reduce_sum(double val) {
 // Warp-level reduction for sum using shuffle instructions (float32)
 __device__ __forceinline__ float warp_reduce_sum_f32(float val) {
     for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
-        val += __shfl_down_sync(0xffffffff, val, offset);
+        val += __shfl_down_sync(QDP_FULL_WARP_MASK, val, offset);
     }
     return val;
 }
@@ -116,7 +117,10 @@ __device__ __forceinline__ float warp_reduce_sum_f32(float val) {
 __device__ __forceinline__ double block_reduce_sum(double val) {
     __shared__ double shared[32]; // supports up to 1024 threads (32 warps)
     int lane = threadIdx.x & (warpSize - 1);
-    int warp_id = threadIdx.x >> 5;
+    // warpSize is 32 on NVIDIA/RDNA and 64 on CDNA (gfx90a); derive the warp id
+    // from it rather than a hardcoded >> 5 so the per-warp partial lands in the
+    // slot the final reduction reads on every wave width.
+    int warp_id = threadIdx.x / warpSize;
 
     val = warp_reduce_sum(val);
     if (lane == 0) {
@@ -137,7 +141,10 @@ __device__ __forceinline__ double block_reduce_sum(double val) {
 __device__ __forceinline__ float block_reduce_sum_f32(float val) {
     __shared__ float shared[32]; // supports up to 1024 threads (32 warps)
     int lane = threadIdx.x & (warpSize - 1);
-    int warp_id = threadIdx.x >> 5;
+    // warpSize is 32 on NVIDIA/RDNA and 64 on CDNA (gfx90a); derive the warp id
+    // from it rather than a hardcoded >> 5 so the per-warp partial lands in the
+    // slot the final reduction reads on every wave width.
+    int warp_id = threadIdx.x / warpSize;
 
     val = warp_reduce_sum_f32(val);
     if (lane == 0) {
diff --git a/qdp/qdp-kernels/src/device.rs b/qdp/qdp-kernels/src/device.rs
new file mode 100644
index 0000000000..563e965deb
--- /dev/null
+++ b/qdp/qdp-kernels/src/device.rs
@@ -0,0 +1,443 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Device runtime surface, vendor-selected at compile time.
+//!
+//! `cudarc` is CUDA-only with no ROCm backend, so the AMD build cannot depend
+//! on it. This module is the seam: on the default (`cuda`) feature it simply
+//! re-exports the slice of `cudarc::driver` the crates use; on the `hip`
+//! feature it provides a thin HIP-runtime shim with the SAME type names and
+//! method signatures, so every call site (`device.alloc`, `htod_sync_copy`,
+//! `slice.device_ptr()`, ...) compiles unchanged on both vendors.
+//!
+//! The marker traits `DeviceRepr` / `ValidAsZeroBits` live here (not in
+//! qdp-core) because qdp-kernels implements them on its complex structs and is
+//! the lowest crate in the workspace.
+
+#[cfg(not(any(feature = "cuda", feature = "hip")))]
+compile_error!("qdp-kernels requires exactly one of the `cuda` or `hip` features");
+
+#[cfg(all(feature = "cuda", not(feature = "hip")))]
+pub use cudarc::driver::{
+    CudaDevice, CudaSlice, DevicePtr, DevicePtrMut, DeviceRepr, DeviceSlice, ValidAsZeroBits,
+    safe::CudaStream,
+};
+
+#[cfg(feature = "hip")]
+pub use hip::{
+    CudaDevice, CudaSlice, CudaStream, DevicePtr, DevicePtrMut, DeviceRepr, DeviceSlice,
+    DriverError, ValidAsZeroBits,
+};
+
+#[cfg(feature = "hip")]
+mod hip {
+    use std::ffi::c_void;
+    use std::marker::PhantomData;
+    use std::sync::Arc;
+
+    // ---- HIP runtime FFI (subset used by the device abstraction) ----
+    // hip* names map 1:1 to the cuda* runtime entry points cudarc wraps; HIP
+    // error codes match CUDA's numerically for the codes we surface.
+    #[allow(non_camel_case_types)]
+    type hipError_t = i32;
+
+    const HIP_SUCCESS: hipError_t = 0;
+    const HIP_MEMCPY_HOST_TO_DEVICE: u32 = 1;
+    const HIP_MEMCPY_DEVICE_TO_HOST: u32 = 2;
+    // hipStreamNonBlocking: the new stream does not implicitly synchronize with
+    // the NULL/default stream, matching cudarc's fork_default_stream.
+    const HIP_STREAM_NON_BLOCKING: u32 = 1;
+
+    unsafe extern "C" {
+        fn hipSetDevice(device: i32) -> hipError_t;
+        fn hipGetDeviceCount(count: *mut i32) -> hipError_t;
+        fn hipMalloc(ptr: *mut *mut c_void, size: usize) -> hipError_t;
+        fn hipFree(ptr: *mut c_void) -> hipError_t;
+        fn hipMemset(ptr: *mut c_void, value: i32, size: usize) -> hipError_t;
+        fn hipMemcpy(dst: *mut c_void, src: *const c_void, size: usize, kind: u32) -> hipError_t;
+        fn hipDeviceSynchronize() -> hipError_t;
+        fn hipStreamCreateWithFlags(stream: *mut *mut c_void, flags: u32) -> hipError_t;
+        fn hipStreamDestroy(stream: *mut c_void) -> hipError_t;
+        fn hipStreamSynchronize(stream: *mut c_void) -> hipError_t;
+    }
+
+    /// Mirrors the role of `cudarc::driver::DriverError`: an opaque, `Debug`able
+    /// wrapper over a runtime status code. Call sites only ever `{:?}`-format it.
+    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
+    pub struct DriverError(pub hipError_t);
+
+    fn check(code: hipError_t) -> Result<(), DriverError> {
+        if code == HIP_SUCCESS {
+            Ok(())
+        } else {
+            Err(DriverError(code))
+        }
+    }
+
+    /// Synchronize the NULL/default stream so its prior work is ordered before any
+    /// other stream observes the affected buffers.
+    ///
+    /// The blocking shim copies (htod/alloc_zeros) issue hipMemcpy/hipMemset on the
+    /// default (NULL) stream. CUDA's legacy default stream is synchronizing, so on
+    /// NVIDIA that work is implicitly ordered before a kernel launched on a forked
+    /// non-blocking stream that reads the same buffer. HIP's default stream is NOT
+    /// synchronizing relative to a hipStreamNonBlocking stream, so without this an
+    /// encoder that sets up input/output on the default stream and then launches
+    /// the norm/encode kernels on the caller's forked stream would race the setup
+    /// (the kernel reads stale/zero data). A default-stream synchronize after the
+    /// blocking copy restores the CUDA-equivalent ordering while preserving the
+    /// dual-stream copy/compute overlap (which uses async copies on explicit
+    /// streams, not these blocking paths).
+    fn sync_default_stream() -> Result<(), DriverError> {
+        unsafe { check(hipStreamSynchronize(std::ptr::null_mut())) }
+    }
+
+    /// Marker: type is safe to byte-copy to/from the device. Mirrors
+    /// `cudarc::driver::DeviceRepr`.
+    ///
+    /// # Safety
+    /// Implementor must be `#[repr(C)]`/`#[repr(transparent)]` plain-old-data
+    /// with no padding that would expose uninitialized bytes.
+    pub unsafe trait DeviceRepr: Copy {}
+    unsafe impl DeviceRepr for f32 {}
+    unsafe impl DeviceRepr for f64 {}
+    unsafe impl DeviceRepr for i32 {}
+    unsafe impl DeviceRepr for u32 {}
+    unsafe impl DeviceRepr for usize {}
+
+    /// Marker: an all-zero bit pattern is a valid value (enables alloc_zeros).
+    /// Mirrors `cudarc::driver::ValidAsZeroBits`.
+    ///
+    /// # Safety
+    /// All-zero bytes must be a valid inhabitant of the type.
+    pub unsafe trait ValidAsZeroBits {}
+    unsafe impl ValidAsZeroBits for f32 {}
+    unsafe impl ValidAsZeroBits for f64 {}
+    unsafe impl ValidAsZeroBits for i32 {}
+    unsafe impl ValidAsZeroBits for u32 {}
+    unsafe impl ValidAsZeroBits for usize {}
+
+    /// Raw device-pointer accessors, matching cudarc's traits. The returned
+    /// reference is to the device address stored as `u64`, so the existing
+    /// `*slice.device_ptr() as *mut T` call sites work verbatim.
+    pub trait DevicePtr<T> {
+        fn device_ptr(&self) -> &u64;
+    }
+    pub trait DevicePtrMut<T> {
+        fn device_ptr_mut(&mut self) -> &mut u64;
+    }
+    /// Length accessor, matching cudarc's `DeviceSlice`.
+    pub trait DeviceSlice<T> {
+        fn len(&self) -> usize;
+        fn is_empty(&self) -> bool {
+            self.len() == 0
+        }
+    }
+
+    /// Owned device allocation; frees on drop. Stand-in for `cudarc::CudaSlice`.
+    pub struct CudaSlice<T> {
+        ptr: u64,
+        len: usize,
+        _device: Arc<CudaDevice>,
+        _marker: PhantomData<T>,
+    }
+
+    // The device address is just an integer; ownership/lifetime is enforced by
+    // the held Arc<CudaDevice>. Safe to move across threads like cudarc's slice.
+    unsafe impl<T: Send> Send for CudaSlice<T> {}
+    unsafe impl<T: Sync> Sync for CudaSlice<T> {}
+
+    impl<T> CudaSlice<T> {
+        fn raw_ptr(&self) -> *mut c_void {
+            self.ptr as *mut c_void
+        }
+
+        /// Mutable sub-view `[range.start, range.end)`. Mirrors
+        /// `cudarc::CudaSlice::slice_mut`; the returned view borrows this slice
+        /// and is itself a `DevicePtrMut`/`DeviceSlice` copy target.
+        pub fn slice_mut(&mut self, range: std::ops::Range<usize>) -> CudaViewMut<'_, T> {
+            assert!(
+                range.start <= range.end && range.end <= self.len,
+                "slice_mut out of bounds"
+            );
+            let offset_ptr = self.ptr + (range.start * std::mem::size_of::<T>()) as u64;
+            CudaViewMut {
+                ptr: offset_ptr,
+                len: range.end - range.start,
+                _parent: PhantomData,
+            }
+        }
+    }
+
+    /// Borrowed mutable view into a `CudaSlice`, returned by `slice_mut`.
+    pub struct CudaViewMut<'a, T> {
+        ptr: u64,
+        len: usize,
+        _parent: PhantomData<&'a mut T>,
+    }
+
+    impl<T> DevicePtr<T> for CudaViewMut<'_, T> {
+        fn device_ptr(&self) -> &u64 {
+            &self.ptr
+        }
+    }
+    impl<T> DevicePtrMut<T> for CudaViewMut<'_, T> {
+        fn device_ptr_mut(&mut self) -> &mut u64 {
+            &mut self.ptr
+        }
+    }
+    impl<T> DeviceSlice<T> for CudaViewMut<'_, T> {
+        fn len(&self) -> usize {
+            self.len
+        }
+    }
+
+    impl<T> DevicePtr<T> for CudaSlice<T> {
+        fn device_ptr(&self) -> &u64 {
+            &self.ptr
+        }
+    }
+    impl<T> DevicePtrMut<T> for CudaSlice<T> {
+        fn device_ptr_mut(&mut self) -> &mut u64 {
+            &mut self.ptr
+        }
+    }
+    impl<T> DeviceSlice<T> for CudaSlice<T> {
+        fn len(&self) -> usize {
+            self.len
+        }
+    }
+
+    impl<T> Drop for CudaSlice<T> {
+        fn drop(&mut self) {
+            if self.ptr != 0 {
+                // hipFree releases on the calling thread's current device, so
+                // re-bind the owning device first (cudarc does the same in Drop):
+                // on multi-GPU a different device may be current, which would
+                // otherwise free against the wrong device. Best-effort -- Drop
+                // cannot report an error, so a failed bind is swallowed.
+                let _ = self._device.bind();
+                unsafe {
+                    let _ = hipFree(self.raw_ptr());
+                }
+            }
+        }
+    }
+
+    /// A HIP stream. The public `stream` field mirrors cudarc's
+    /// `CudaStream { stream: sys::CUstream, .. }` so existing call sites that do
+    /// `ctx.stream_compute.stream as *mut c_void` keep working.
+    pub struct CudaStream {
+        pub stream: *mut c_void,
+        _device: Arc<CudaDevice>,
+    }
+
+    unsafe impl Send for CudaStream {}
+    unsafe impl Sync for CudaStream {}
+
+    impl Drop for CudaStream {
+        fn drop(&mut self) {
+            if !self.stream.is_null() {
+                unsafe {
+                    let _ = hipStreamDestroy(self.stream);
+                }
+            }
+        }
+    }
+
+    /// HIP device handle. Stand-in for `cudarc::CudaDevice`; created via
+    /// `CudaDevice::new(ordinal)` and shared as `Arc<CudaDevice>` exactly like
+    /// cudarc (whose `new` already returns the `Arc`).
+    pub struct CudaDevice {
+        ordinal: usize,
+    }
+
+    impl CudaDevice {
+        /// Select device `ordinal` and return a shared handle, or an error if no
+        /// such device exists. Matches `cudarc::CudaDevice::new`.
+        pub fn new(ordinal: usize) -> Result<Arc<Self>, DriverError> {
+            unsafe {
+                let mut count: i32 = 0;
+                check(hipGetDeviceCount(&mut count))?;
+                if ordinal as i32 >= count {
+                    return Err(DriverError(101)); // hipErrorInvalidDevice
+                }
+                check(hipSetDevice(ordinal as i32))?;
+            }
+            Ok(Arc::new(Self { ordinal }))
+        }
+
+        pub fn ordinal(&self) -> usize {
+            self.ordinal
+        }
+
+        fn bind(&self) -> Result<(), DriverError> {
+            unsafe { check(hipSetDevice(self.ordinal as i32)) }
+        }
+
+        /// Allocate `len` uninitialized elements of `T` on the device.
+        ///
+        /// # Safety
+        /// Contents are uninitialized until written, mirroring `cudarc`'s
+        /// `unsafe fn alloc`.
+        pub unsafe fn alloc<T>(self: &Arc<Self>, len: usize) -> Result<CudaSlice<T>, DriverError> {
+            self.bind()?;
+            let bytes = len.saturating_mul(std::mem::size_of::<T>());
+            let mut ptr: *mut c_void = std::ptr::null_mut();
+            unsafe {
+                check(hipMalloc(&mut ptr, bytes.max(1)))?;
+            }
+            Ok(CudaSlice {
+                ptr: ptr as u64,
+                len,
+                _device: Arc::clone(self),
+                _marker: PhantomData,
+            })
+        }
+
+        /// Allocate `len` zero-initialized elements of `T` on the device.
+        pub fn alloc_zeros<T: ValidAsZeroBits>(
+            self: &Arc<Self>,
+            len: usize,
+        ) -> Result<CudaSlice<T>, DriverError> {
+            let slice = unsafe { self.alloc::<T>(len)? };
+            let bytes = len.saturating_mul(std::mem::size_of::<T>());
+            if bytes > 0 {
+                unsafe {
+                    check(hipMemset(slice.raw_ptr(), 0, bytes))?;
+                    sync_default_stream()?;
+                }
+            }
+            Ok(slice)
+        }
+
+        /// Copy a host slice to a freshly allocated device buffer (blocking).
+        pub fn htod_sync_copy<T: DeviceRepr>(
+            self: &Arc<Self>,
+            src: &[T],
+        ) -> Result<CudaSlice<T>, DriverError> {
+            let mut slice = unsafe { self.alloc::<T>(src.len())? };
+            self.htod_sync_copy_into(src, &mut slice)?;
+            Ok(slice)
+        }
+
+        /// Copy an owned host Vec to a freshly allocated device buffer. cudarc's
+        /// `htod_copy` keeps the Vec alive until an async copy completes; our
+        /// copy is synchronous (blocking hipMemcpy), so the Vec can be dropped
+        /// on return with identical observable behavior.
+        pub fn htod_copy<T: DeviceRepr>(
+            self: &Arc<Self>,
+            src: Vec<T>,
+        ) -> Result<CudaSlice<T>, DriverError> {
+            self.htod_sync_copy(&src)
+        }
+
+        /// Copy a host slice into an existing device buffer or sub-view
+        /// (blocking). Accepts any `DevicePtrMut` target so both `CudaSlice` and
+        /// the `slice_mut` view work, matching cudarc's generic destination.
+        pub fn htod_sync_copy_into<T: DeviceRepr, D: DevicePtrMut<T> + DeviceSlice<T>>(
+            self: &Arc<Self>,
+            src: &[T],
+            dst: &mut D,
+        ) -> Result<(), DriverError> {
+            assert_eq!(
+                dst.len(),
+                src.len(),
+                "htod_sync_copy_into: dst.len() != src.len()"
+            );
+            self.bind()?;
+            let bytes = std::mem::size_of_val(src);
+            if bytes > 0 {
+                unsafe {
+                    check(hipMemcpy(
+                        (*dst.device_ptr_mut()) as *mut c_void,
+                        src.as_ptr() as *const c_void,
+                        bytes,
+                        HIP_MEMCPY_HOST_TO_DEVICE,
+                    ))?;
+                    sync_default_stream()?;
+                }
+            }
+            Ok(())
+        }
+
+        /// Copy a device buffer back to a freshly allocated host Vec (blocking).
+        ///
+        /// Matches cudarc's bound of just `DeviceRepr` (no `Default`): the Vec is
+        /// allocated uninitialized and every byte is written by the copy before
+        /// its length is set, which is sound because `DeviceRepr` is plain data.
+        pub fn dtoh_sync_copy<T: DeviceRepr>(
+            self: &Arc<Self>,
+            src: &CudaSlice<T>,
+        ) -> Result<Vec<T>, DriverError> {
+            self.bind()?;
+            let len = src.len;
+            let mut out: Vec<T> = Vec::with_capacity(len);
+            let bytes = len.saturating_mul(std::mem::size_of::<T>());
+            if bytes > 0 {
+                unsafe {
+                    check(hipMemcpy(
+                        out.as_mut_ptr() as *mut c_void,
+                        src.raw_ptr() as *const c_void,
+                        bytes,
+                        HIP_MEMCPY_DEVICE_TO_HOST,
+                    ))?;
+                }
+            }
+            unsafe {
+                out.set_len(len);
+            }
+            Ok(out)
+        }
+
+        /// Block until all work on the device's default stream completes.
+        pub fn synchronize(&self) -> Result<(), DriverError> {
+            self.bind()?;
+            unsafe { check(hipDeviceSynchronize()) }
+        }
+
+        /// Create a new stream tied to this device. Mirrors
+        /// `cudarc::CudaDevice::fork_default_stream`.
+        pub fn fork_default_stream(self: &Arc<Self>) -> Result<CudaStream, DriverError> {
+            self.bind()?;
+            let mut stream: *mut c_void = std::ptr::null_mut();
+            unsafe {
+                // Non-blocking stream (matches cudarc on both Linux and Windows): a
+                // default/blocking stream would serialize H2D copies against the NULL
+                // stream and defeat the dual-stream copy/compute overlap the pipeline
+                // relies on. Because a non-blocking stream does not implicitly order
+                // against the default stream, any host-visible readback (a default/NULL
+                // stream dtoh copy) of results produced on this stream MUST first
+                // synchronize this stream (wait_for / sync_cuda_stream); the encoders
+                // already do so before every readback.
+                check(hipStreamCreateWithFlags(
+                    &mut stream,
+                    HIP_STREAM_NON_BLOCKING,
+                ))?;
+            }
+            Ok(CudaStream {
+                stream,
+                _device: Arc::clone(self),
+            })
+        }
+
+        /// Block until all work on `stream` completes. Mirrors
+        /// `cudarc::CudaDevice::wait_for`.
+        pub fn wait_for(&self, stream: &CudaStream) -> Result<(), DriverError> {
+            unsafe { check(hipStreamSynchronize(stream.stream)) }
+        }
+    }
+}
diff --git a/qdp/qdp-kernels/src/kernel_compat.h b/qdp/qdp-kernels/src/kernel_compat.h
new file mode 100644
index 0000000000..6720705d06
--- /dev/null
+++ b/qdp/qdp-kernels/src/kernel_compat.h
@@ -0,0 +1,35 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Cross-vendor kernel compatibility shims.
+//
+// Included by the kernel TUs that use warp intrinsics (amplitude.cu). On CUDA
+// it is inert. On HIP it supplies the one warp-intrinsic difference that does
+// not translate 1:1: the full-warp lane mask for __shfl_*_sync.
+
+#ifndef KERNEL_COMPAT_H
+#define KERNEL_COMPAT_H
+
+#if defined(__HIP_PLATFORM_AMD__)
+// ROCm's __shfl_*_sync static_asserts a 64-bit mask (sizeof(MaskT) == 8): the
+// 32-bit literal 0xffffffff every CUDA warp-sync uses fails to COMPILE,
+// independent of the active wave width. Use an all-lanes 64-bit mask.
+#define QDP_FULL_WARP_MASK 0xffffffffffffffffULL
+#else
+#define QDP_FULL_WARP_MASK 0xffffffffu
+#endif
+
+#endif // KERNEL_COMPAT_H
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index 9a1f832d85..f44a15ce4e 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -20,6 +20,9 @@
 
 use std::ffi::c_void;
 
+pub mod device;
+use device::{DeviceRepr, ValidAsZeroBits};
+
 // Complex number (matches CUDA's cuDoubleComplex)
 #[repr(C)]
 #[derive(Debug, Clone, Copy)]
@@ -28,13 +31,12 @@ pub struct CuDoubleComplex {
     pub y: f64, // Imaginary part
 }
 
-// Implement DeviceRepr for cudarc compatibility
-#[cfg(target_os = "linux")]
-unsafe impl cudarc::driver::DeviceRepr for CuDoubleComplex {}
-
-// Also implement ValidAsZeroBits for alloc_zeros support
-#[cfg(target_os = "linux")]
-unsafe impl cudarc::driver::ValidAsZeroBits for CuDoubleComplex {}
+// Device-transferable + zero-initializable markers. On CUDA these resolve to
+// cudarc's traits; on HIP they resolve to the in-crate `device` shim's traits.
+#[cfg(qdp_gpu_platform)]
+unsafe impl DeviceRepr for CuDoubleComplex {}
+#[cfg(qdp_gpu_platform)]
+unsafe impl ValidAsZeroBits for CuDoubleComplex {}
 
 // Complex number (matches CUDA's cuComplex / cuFloatComplex)
 #[repr(C)]
@@ -44,16 +46,13 @@ pub struct CuComplex {
     pub y: f32, // Imaginary part
 }
 
-// Implement DeviceRepr for cudarc compatibility
-#[cfg(target_os = "linux")]
-unsafe impl cudarc::driver::DeviceRepr for CuComplex {}
-
-// Also implement ValidAsZeroBits for alloc_zeros support
-#[cfg(target_os = "linux")]
-unsafe impl cudarc::driver::ValidAsZeroBits for CuComplex {}
+#[cfg(qdp_gpu_platform)]
+unsafe impl DeviceRepr for CuComplex {}
+#[cfg(qdp_gpu_platform)]
+unsafe impl ValidAsZeroBits for CuComplex {}
 
 // CUDA kernel FFI (Linux only; stubbed when built without nvcc/CUDA)
-#[cfg(all(target_os = "linux", not(qdp_no_cuda)))]
+#[cfg(all(qdp_gpu_platform, not(qdp_no_cuda)))]
 unsafe extern "C" {
     /// Launch amplitude encoding kernel
     /// Returns CUDA error code (0 = success)
@@ -405,7 +404,7 @@ unsafe extern "C" {
 }
 
 // Dummy implementation for non-Linux and Linux builds without CUDA (allows linking)
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_amplitude_encode(
     _input_d: *const f64,
@@ -418,7 +417,7 @@ pub extern "C" fn launch_amplitude_encode(
     999 // Error: CUDA unavailable
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_amplitude_encode_f32(
     _input_d: *const f32,
@@ -431,7 +430,7 @@ pub extern "C" fn launch_amplitude_encode_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_amplitude_encode_batch(
     _input_batch_d: *const f64,
@@ -445,7 +444,7 @@ pub extern "C" fn launch_amplitude_encode_batch(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_amplitude_encode_batch_f32(
     _input_batch_d: *const f32,
@@ -459,7 +458,7 @@ pub extern "C" fn launch_amplitude_encode_batch_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_l2_norm(
     _input_d: *const f64,
@@ -470,7 +469,7 @@ pub extern "C" fn launch_l2_norm(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_l2_norm_batch(
     _input_batch_d: *const f64,
@@ -482,7 +481,7 @@ pub extern "C" fn launch_l2_norm_batch(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_l2_norm_f32(
     _input_d: *const f32,
@@ -493,7 +492,7 @@ pub extern "C" fn launch_l2_norm_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_l2_norm_batch_f32(
     _input_batch_d: *const f32,
@@ -505,7 +504,7 @@ pub extern "C" fn launch_l2_norm_batch_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn convert_state_to_float(
     _input_state_d: *const CuDoubleComplex,
@@ -516,7 +515,7 @@ pub extern "C" fn convert_state_to_float(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn convert_state_to_double(
     _input_state_d: *const CuComplex,
@@ -527,7 +526,7 @@ pub extern "C" fn convert_state_to_double(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_basis_encode(
     _basis_index: usize,
@@ -538,7 +537,7 @@ pub extern "C" fn launch_basis_encode(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_basis_encode_batch(
     _basis_indices_d: *const usize,
@@ -551,7 +550,7 @@ pub extern "C" fn launch_basis_encode_batch(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_basis_encode_f32(
     _basis_index: usize,
@@ -562,7 +561,7 @@ pub extern "C" fn launch_basis_encode_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_basis_encode_batch_f32(
     _basis_indices_d: *const usize,
@@ -575,7 +574,7 @@ pub extern "C" fn launch_basis_encode_batch_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_angle_encode(
     _angles_d: *const f64,
@@ -587,7 +586,7 @@ pub extern "C" fn launch_angle_encode(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_angle_encode_batch(
     _angles_batch_d: *const f64,
@@ -600,7 +599,7 @@ pub extern "C" fn launch_angle_encode_batch(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_angle_encode_f32(
     _angles_d: *const f32,
@@ -612,7 +611,7 @@ pub extern "C" fn launch_angle_encode_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_angle_encode_batch_f32(
     _angles_batch_d: *const f32,
@@ -625,7 +624,7 @@ pub extern "C" fn launch_angle_encode_batch_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_check_finite_batch_f32(
     _input_batch_d: *const f32,
@@ -636,7 +635,7 @@ pub extern "C" fn launch_check_finite_batch_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_check_finite_batch_f64(
     _input_batch_d: *const f64,
@@ -647,7 +646,7 @@ pub extern "C" fn launch_check_finite_batch_f64(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_validate_and_cast_basis_indices_f32(
     _input_batch_d: *const f32,
@@ -660,7 +659,7 @@ pub extern "C" fn launch_validate_and_cast_basis_indices_f32(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_check_basis_indices_usize(
     _indices_d: *const usize,
@@ -672,7 +671,7 @@ pub extern "C" fn launch_check_basis_indices_usize(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_iqp_encode(
     _data_d: *const f64,
@@ -685,7 +684,7 @@ pub extern "C" fn launch_iqp_encode(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_iqp_encode_batch(
     _data_batch_d: *const f64,
@@ -700,7 +699,7 @@ pub extern "C" fn launch_iqp_encode_batch(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_phase_encode(
     _phases_d: *const f64,
@@ -712,7 +711,7 @@ pub extern "C" fn launch_phase_encode(
     999
 }
 
-#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[cfg(any(not(qdp_gpu_platform), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_phase_encode_batch(
     _phases_batch_d: *const f64,
diff --git a/qdp/qdp-kernels/tests/amplitude_encode.rs b/qdp/qdp-kernels/tests/amplitude_encode.rs
index 53f91505f5..8a413fa9e8 100644
--- a/qdp/qdp-kernels/tests/amplitude_encode.rs
+++ b/qdp/qdp-kernels/tests/amplitude_encode.rs
@@ -20,10 +20,14 @@
 // but in CUDA builds, the extern "C" functions require unsafe blocks.
 // The compiler can't statically determine which path is taken.
 #![allow(unused_unsafe)]
-
-#[cfg(target_os = "linux")]
-use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
-#[cfg(target_os = "linux")]
+// The `stream.stream as *mut c_void` cast is a real conversion on the CUDA
+// backend (cudarc's CUstream is a distinct pointer type) but a no-op on HIP,
+// where the stream field is already *mut c_void; silence the HIP-only redundancy.
+#![cfg_attr(feature = "hip", allow(clippy::unnecessary_cast))]
+
+#[cfg(qdp_gpu_platform)]
+use qdp_kernels::device::{CudaDevice, DevicePtr, DevicePtrMut};
+#[cfg(qdp_gpu_platform)]
 use qdp_kernels::{
     CuComplex, CuDoubleComplex, launch_amplitude_encode, launch_amplitude_encode_batch,
     launch_amplitude_encode_f32, launch_l2_norm, launch_l2_norm_batch, launch_l2_norm_batch_f32,
@@ -33,7 +37,7 @@ use qdp_kernels::{
 const EPSILON: f64 = 1e-10;
 const EPSILON_F32: f32 = 1e-5;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn assert_batch_state_matches_f64(
     state_h: &[CuDoubleComplex],
     input: &[f64],
@@ -70,7 +74,7 @@ fn assert_batch_state_matches_f64(
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_basic() {
     println!("Testing basic amplitude encoding...");
 
@@ -141,7 +145,7 @@ fn test_amplitude_encode_basic() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_basic_f32() {
     println!("Testing basic amplitude encoding (float32)...");
 
@@ -211,7 +215,7 @@ fn test_amplitude_encode_basic_f32() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_power_of_two() {
     println!("Testing amplitude encoding with power-of-two input...");
 
@@ -271,7 +275,7 @@ fn test_amplitude_encode_power_of_two() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_odd_input_length() {
     println!("Testing amplitude encoding with odd input length...");
 
@@ -320,7 +324,7 @@ fn test_amplitude_encode_odd_input_length() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_odd_input_length_f32() {
     println!("Testing amplitude encoding with odd input length (float32)...");
 
@@ -369,7 +373,7 @@ fn test_amplitude_encode_odd_input_length_f32() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_large_state() {
     println!("Testing amplitude encoding with large state vector...");
 
@@ -427,7 +431,7 @@ fn test_amplitude_encode_large_state() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_zero_norm_error() {
     println!("Testing amplitude encoding with zero norm (error case)...");
 
@@ -467,7 +471,7 @@ fn test_amplitude_encode_zero_norm_error() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_negative_norm_error() {
     println!("Testing amplitude encoding with negative norm (error case)...");
 
@@ -507,7 +511,7 @@ fn test_amplitude_encode_negative_norm_error() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_vectorized_load() {
     println!("Testing vectorized double2 memory access optimization...");
 
@@ -559,7 +563,7 @@ fn test_amplitude_encode_vectorized_load() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_small_input_large_state() {
     println!("Testing small input with large state vector...");
 
@@ -612,7 +616,7 @@ fn test_amplitude_encode_small_input_large_state() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_amplitude_encode_batch_odd_sample_length_handles_misaligned_samples() {
     println!("Testing batch amplitude encoding with odd sample length (float64)...");
 
@@ -658,7 +662,7 @@ fn test_amplitude_encode_batch_odd_sample_length_handles_misaligned_samples() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_single_kernel() {
     println!("Testing single-vector GPU norm reduction...");
 
@@ -698,7 +702,7 @@ fn test_l2_norm_single_kernel() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_batch_kernel_stream() {
     println!("Testing batched norm reduction on async stream...");
 
@@ -755,7 +759,7 @@ fn test_l2_norm_batch_kernel_stream() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_batch_kernel_odd_sample_len() {
     println!("Testing batched L2 norm reduction with odd sample length (float64)...");
 
@@ -804,7 +808,7 @@ fn test_l2_norm_batch_kernel_odd_sample_len() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_batch_kernel_zero_num_samples() {
     println!("Testing batched L2 norm rejection when num_samples==0 (float64)...");
 
@@ -838,7 +842,7 @@ fn test_l2_norm_batch_kernel_zero_num_samples() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_batch_kernel_zero_sample_len() {
     println!("Testing batched L2 norm rejection when sample_len==0 (float64)...");
 
@@ -872,7 +876,7 @@ fn test_l2_norm_batch_kernel_zero_sample_len() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_single_kernel_f32() {
     println!("Testing L2 norm reduction kernel (float32)...");
 
@@ -918,7 +922,7 @@ fn test_l2_norm_single_kernel_f32() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_batch_kernel_f32() {
     println!("Testing batched L2 norm reduction kernel (float32)...");
 
@@ -973,7 +977,7 @@ fn test_l2_norm_batch_kernel_f32() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_batch_kernel_f32_odd_sample_len() {
     println!("Testing batched L2 norm reduction with odd sample length (float32)...");
 
@@ -1022,7 +1026,7 @@ fn test_l2_norm_batch_kernel_f32_odd_sample_len() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_batch_kernel_zero_num_samples_f32() {
     println!("Testing batched L2 norm rejection when num_samples==0 (float32)...");
 
@@ -1056,7 +1060,7 @@ fn test_l2_norm_batch_kernel_zero_num_samples_f32() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_l2_norm_batch_kernel_zero_sample_len_f32() {
     println!("Testing batched L2 norm rejection when sample_len==0 (float32)...");
 
@@ -1090,7 +1094,7 @@ fn test_l2_norm_batch_kernel_zero_sample_len_f32() {
 }
 
 #[test]
-#[cfg(not(target_os = "linux"))]
+#[cfg(not(qdp_gpu_platform))]
 fn test_amplitude_encode_dummy_non_linux() {
     println!("Testing dummy implementation on non-Linux platform...");
 
diff --git a/qdp/qdp-kernels/tests/angle_encode.rs b/qdp/qdp-kernels/tests/angle_encode.rs
index a4af609af1..72a659f581 100644
--- a/qdp/qdp-kernels/tests/angle_encode.rs
+++ b/qdp/qdp-kernels/tests/angle_encode.rs
@@ -18,9 +18,9 @@
 
 #![allow(unused_unsafe)]
 
-#[cfg(target_os = "linux")]
-use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
+use qdp_kernels::device::{CudaDevice, DevicePtr, DevicePtrMut};
+#[cfg(qdp_gpu_platform)]
 use qdp_kernels::{
     CuComplex, launch_angle_encode_batch_f32, launch_angle_encode_f32,
     launch_check_finite_batch_f32,
@@ -28,7 +28,7 @@ use qdp_kernels::{
 
 const EPSILON_F32: f32 = 1e-5;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn expected_amplitude_f32(angles: &[f32], basis_idx: usize) -> f32 {
     angles.iter().enumerate().fold(1.0f32, |acc, (bit, angle)| {
         let factor = if ((basis_idx >> bit) & 1) == 1 {
@@ -41,7 +41,7 @@ fn expected_amplitude_f32(angles: &[f32], basis_idx: usize) -> f32 {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_angle_encode_basic_f32() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
@@ -86,7 +86,7 @@ fn test_angle_encode_basic_f32() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_angle_encode_matches_expected_product_state_f32() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
@@ -136,7 +136,7 @@ fn test_angle_encode_matches_expected_product_state_f32() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_angle_encode_f32_rejects_zero_qubits() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
@@ -164,7 +164,7 @@ fn test_angle_encode_f32_rejects_zero_qubits() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_angle_encode_batch_f32_matches_expected_product_states() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
@@ -224,7 +224,7 @@ fn test_angle_encode_batch_f32_matches_expected_product_states() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_angle_encode_batch_f32_rejects_zero_samples() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
@@ -252,7 +252,7 @@ fn test_angle_encode_batch_f32_rejects_zero_samples() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_check_finite_batch_f32_reports_non_finite() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
@@ -282,7 +282,7 @@ fn test_check_finite_batch_f32_reports_non_finite() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_check_finite_batch_f32_reports_nan() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
@@ -312,7 +312,7 @@ fn test_check_finite_batch_f32_reports_nan() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_check_finite_batch_f32_all_finite_stays_clear() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
@@ -342,7 +342,7 @@ fn test_check_finite_batch_f32_all_finite_stays_clear() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_angle_encode_batch_f32_rejects_zero_state_len() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
@@ -370,7 +370,7 @@ fn test_angle_encode_batch_f32_rejects_zero_state_len() {
 }
 
 #[test]
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 fn test_angle_encode_batch_f32_rejects_zero_qubits() {
     let device = match CudaDevice::new(0) {
         Ok(d) => d,
diff --git a/qdp/qdp-python/Cargo.toml b/qdp/qdp-python/Cargo.toml
index fbe46a0198..2dbf659ec2 100644
--- a/qdp/qdp-python/Cargo.toml
+++ b/qdp/qdp-python/Cargo.toml
@@ -10,11 +10,15 @@ crate-type = ["cdylib"]
 [dependencies]
 pyo3 = { version = "0.27" }
 numpy = "0.27"
-qdp-core = { path = "../qdp-core" }
+qdp-core = { path = "../qdp-core", default-features = false }
 env_logger = "0.11"
 
 [features]
-default = []
+# NVIDIA CUDA backend (default); pass through to qdp-core.
+default = ["cuda"]
+cuda = ["qdp-core/cuda"]
+# AMD HIP backend; pass through to qdp-core.
+hip = ["qdp-core/hip"]
 observability = ["qdp-core/observability"]
 pytorch = ["qdp-core/pytorch"]
 remote-io = ["qdp-core/remote-io"]
diff --git a/qdp/qdp-python/README.md b/qdp/qdp-python/README.md
index a81f95324d..d770cc9774 100644
--- a/qdp/qdp-python/README.md
+++ b/qdp/qdp-python/README.md
@@ -64,6 +64,13 @@ The public `QdpEngine` is a unified Python facade with explicit backend selectio
 - `backend="cuda"` routes to the Rust `_qdp.QdpEngine`
 - `backend="amd"` routes to the Triton AMD engine directly
 
+The native Rust `_qdp.QdpEngine` (the `backend="cuda"` route) also runs on AMD
+GPUs when the extension is built from source with the `hip` Cargo feature, which
+compiles the same kernels with `hipcc` and binds the AMD HIP runtime. This gives
+the native pipeline (pinned-buffer pool, dual-stream overlap, in-Rust DLPack
+ownership) on AMD without the Triton path. See `qdp/DEVELOPMENT.md` for the
+ROCm/HIP build steps.
+
 See `qdp/qdp-python/TRITON_AMD_BACKEND.md` for Triton AMD setup and validation details.
 
 ## Encoding Methods
diff --git a/qdp/qdp-python/build.rs b/qdp/qdp-python/build.rs
new file mode 100644
index 0000000000..8a133e62ab
--- /dev/null
+++ b/qdp/qdp-python/build.rs
@@ -0,0 +1,27 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+fn main() {
+    // Emit qdp_gpu_platform when building for a GPU-capable OS (Linux always;
+    // Windows when the hip feature is on via QDP_USE_HIP=1 / TheRock ROCm).
+    println!("cargo::rustc-check-cfg=cfg(qdp_gpu_platform)");
+    let is_linux = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("linux");
+    let hip_feature = std::env::var("CARGO_FEATURE_HIP").is_ok();
+    let is_windows = std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows");
+    if is_linux || (is_windows && hip_feature) {
+        println!("cargo::rustc-cfg=qdp_gpu_platform");
+    }
+}
diff --git a/qdp/qdp-python/src/engine.rs b/qdp/qdp-python/src/engine.rs
index 8297bf5e78..2c44608f99 100644
--- a/qdp/qdp-python/src/engine.rs
+++ b/qdp/qdp-python/src/engine.rs
@@ -24,7 +24,7 @@ use pyo3::exceptions::PyRuntimeError;
 use pyo3::prelude::*;
 use qdp_core::{Dtype, Encoding, QdpEngine as CoreEngine};
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use crate::loader::{PyQuantumLoader, config_from_args, parse_null_handling, path_from_py};
 
 /// PyO3 wrapper for QdpEngine
@@ -522,7 +522,7 @@ impl QdpEngine {
     }
 
     // --- Loader factory methods (Linux only) ---
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     /// Create a synthetic-data pipeline iterator (for QuantumDataLoader.source_synthetic()).
     #[pyo3(signature = (total_batches, batch_size, num_qubits, encoding_method, seed=None, null_handling=None))]
     fn create_synthetic_loader(
@@ -551,7 +551,7 @@ impl QdpEngine {
         Ok(PyQuantumLoader::new(Some(iter)))
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     /// Create a file-backed pipeline iterator (full read then batch; for QuantumDataLoader.source_file(path)).
     #[allow(clippy::too_many_arguments)]
     #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, batch_limit=None, null_handling=None))]
@@ -600,7 +600,7 @@ impl QdpEngine {
         Ok(PyQuantumLoader::new(Some(iter)))
     }
 
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     /// Create a streaming Parquet pipeline iterator (for QuantumDataLoader.source_file(path, streaming=True)).
     #[allow(clippy::too_many_arguments)]
     #[pyo3(signature = (path, batch_size, num_qubits, encoding_method, batch_limit=None, null_handling=None))]
diff --git a/qdp/qdp-python/src/lib.rs b/qdp/qdp-python/src/lib.rs
index 04d772a906..cd7f19200f 100644
--- a/qdp/qdp-python/src/lib.rs
+++ b/qdp/qdp-python/src/lib.rs
@@ -26,10 +26,10 @@ use pyo3::exceptions::PyRuntimeError;
 use pyo3::prelude::*;
 use tensor::QuantumTensor;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 use loader::PyQuantumLoader;
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 #[pyfunction]
 #[pyo3(signature = (device_id, num_qubits, batch_size, total_batches, encoding_method, warmup_batches=0, seed=None, dtype="f64"))]
 #[allow(clippy::too_many_arguments)]
@@ -78,9 +78,9 @@ fn _qdp(m: &Bound<'_, PyModule>) -> PyResult<()> {
 
     m.add_class::<QdpEngine>()?;
     m.add_class::<QuantumTensor>()?;
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     m.add_class::<PyQuantumLoader>()?;
-    #[cfg(target_os = "linux")]
+    #[cfg(qdp_gpu_platform)]
     m.add_function(wrap_pyfunction!(run_throughput_pipeline_py, m)?)?;
     Ok(())
 }
diff --git a/qdp/qdp-python/src/loader.rs b/qdp/qdp-python/src/loader.rs
index a43f947944..46e5b14642 100644
--- a/qdp/qdp-python/src/loader.rs
+++ b/qdp/qdp-python/src/loader.rs
@@ -15,7 +15,7 @@
 // limitations under the License.
 
 // Loader bindings (Linux only; qdp-core pipeline types only built on Linux)
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 mod loader_impl {
     use crate::tensor::QuantumTensor;
     use pyo3::exceptions::PyRuntimeError;
@@ -120,5 +120,5 @@ mod loader_impl {
     }
 }
 
-#[cfg(target_os = "linux")]
+#[cfg(qdp_gpu_platform)]
 pub use loader_impl::{PyQuantumLoader, config_from_args, parse_null_handling, path_from_py};
diff --git a/testing/qdp/test_bindings.py b/testing/qdp/test_bindings.py
index eb5d92ce9b..0c8a985b88 100644
--- a/testing/qdp/test_bindings.py
+++ b/testing/qdp/test_bindings.py
@@ -122,7 +122,14 @@ def test_dlpack_device():
     qtensor = engine.encode(data, 2, "amplitude")
 
     device_info = qtensor.__dlpack_device__()
-    assert device_info == (2, 0), "Expected (2, 0) for CUDA device 0"
+    # DLPack device_type: kDLCUDA=2 (NVIDIA), kDLROCM=10 (AMD HIP). The native
+    # engine tags exported tensors to match the backend it was built for, so a
+    # ROCm build must report kDLROCM (which a ROCm PyTorch's from_dlpack
+    # requires); a CUDA build reports kDLCUDA.
+    expected_device_type = 10 if getattr(torch.version, "hip", None) else 2
+    assert device_info == (expected_device_type, 0), (
+        f"Expected ({expected_device_type}, 0) for device 0"
+    )
 
 
 @requires_qdp
@@ -142,10 +149,12 @@ def test_dlpack_device_id_non_zero():
     qtensor = engine.encode(data, 2, "amplitude")
 
     device_info = qtensor.__dlpack_device__()
+    # DLPack device_type: kDLCUDA=2 (NVIDIA), kDLROCM=10 (AMD HIP).
+    expected_device_type = 10 if getattr(torch.version, "hip", None) else 2
     assert device_info == (
-        2,
+        expected_device_type,
         device_id,
-    ), f"Expected (2, {device_id}) for CUDA device {device_id}"
+    ), f"Expected ({expected_device_type}, {device_id}) for device {device_id}"
 
     # Verify PyTorch integration works with non-zero device_id
     torch_tensor = torch.from_dlpack(qtensor)