@@ -84,6 +84,34 @@ __global__ void ScatterToStripedKernel(int *d_data, int *d_rank) {
8484 cub::StoreDirectStriped<128 >(threadIdx .x , d_data, thread_data);
8585}
8686
// Migration test for cub::BlockExchange::BlockedToWarpStriped.
// Launch shape expected: <<<1, 128>>>, 4 items per thread. Each thread loads
// 4 consecutive ints (blocked), the exchange rearranges the tile into a
// warp-striped layout via shared memory, and the result is stored blocked.
// The CHECK lines pin the expected dpct/SYCL migration output.
__global__ void BlockedToWarpStripedKernel(int *d_data) {
  // CHECK: typedef dpct::group::exchange<int, 4> BlockExchange;
  // CHECK: int thread_data[4];
  // CHECK: dpct::group::load_direct_blocked(item_ct1, d_data, thread_data);
  // CHECK: BlockExchange(temp_storage).blocked_to_sub_group_striped(item_ct1, thread_data, thread_data);
  // CHECK: dpct::group::store_direct_blocked(item_ct1, d_data, thread_data);
  typedef cub::BlockExchange<int, 128, 4> BlockExchange;
  __shared__ typename BlockExchange::TempStorage temp_storage;
  int thread_data[4];
  cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);
  BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
  cub::StoreDirectBlocked(threadIdx.x, d_data, thread_data);
}
100+
// Migration test for cub::BlockExchange::WarpStripedToBlocked — the inverse
// of BlockedToWarpStriped. Launch shape expected: <<<1, 128>>>, 4 items per
// thread: load blocked, convert the warp-striped tile back to blocked order,
// store blocked. The CHECK lines pin the expected dpct/SYCL migration output.
__global__ void WarpStripedToBlockedKernel(int *d_data) {
  // CHECK: typedef dpct::group::exchange<int, 4> BlockExchange;
  // CHECK: int thread_data[4];
  // CHECK: dpct::group::load_direct_blocked(item_ct1, d_data, thread_data);
  // CHECK: BlockExchange(temp_storage).sub_group_striped_to_blocked(item_ct1, thread_data, thread_data);
  // CHECK: dpct::group::store_direct_blocked(item_ct1, d_data, thread_data);
  typedef cub::BlockExchange<int, 128, 4> BlockExchange;
  __shared__ typename BlockExchange::TempStorage temp_storage;
  int thread_data[4];
  cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);
  BlockExchange(temp_storage).WarpStripedToBlocked(thread_data, thread_data);
  cub::StoreDirectBlocked(threadIdx.x, d_data, thread_data);
}
114+
87115bool test_striped_to_blocked () {
88116 int *d_data;
89117 cudaMallocManaged (&d_data, sizeof (int ) * 512 );
@@ -257,7 +285,96 @@ bool test_scatter_to_striped() {
257285 return true ;
258286}
259287
// Host-side check for BlockedToWarpStripedKernel.
// Seeds d_data with 0..511 (blocked per thread), runs the kernel, then
// verifies the blocked store of the warp-striped arrangement: element j of
// thread (warp w, lane l) must be w*128 + j*32 + l.
// Fix vs. original: d_data (cudaMallocManaged) was leaked on both the pass
// and the fail path; it is now released with cudaFree before every return.
bool test_blocked_to_warp_striped() {
  int *d_data, expected[512];
  cudaMallocManaged(&d_data, sizeof(int) * 512);
  for (int i = 0; i < 512; ++i)
    d_data[i] = i;

  // CHECK: q_ct1.submit(
  // CHECK-NEXT: [&](sycl::handler &cgh) {
  // CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::exchange<int, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
  // CHECK-EMPTY:
  // CHECK-NEXT: cgh.parallel_for(
  // CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
  // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {{\[\[}}sycl::reqd_sub_group_size(32){{\]\]}} {
  // CHECK-NEXT: BlockedToWarpStripedKernel(d_data, item_ct1, &temp_storage_acc[0]);
  // CHECK-NEXT: });
  // CHECK-NEXT: });
  BlockedToWarpStripedKernel<<<1, 128>>>(d_data);
  cudaDeviceSynchronize();
  size_t warp_id = 0, warp_offset = 0, lane_id = 0;
  for (int i = 0; i < 128; i++) {
    warp_id = i / 32;
    lane_id = i % 32;
    // Each 32-thread warp owns 32 * 4 = 128 consecutive input elements.
    warp_offset = warp_id * 32 * 4;
    expected[4 * i + 0] = warp_offset + lane_id + 0 * 32;
    expected[4 * i + 1] = warp_offset + lane_id + 1 * 32;
    expected[4 * i + 2] = warp_offset + lane_id + 2 * 32;
    expected[4 * i + 3] = warp_offset + lane_id + 3 * 32;
  }

  for (int i = 0; i < 512; ++i) {
    if (expected[i] != d_data[i]) {
      std::cout << "test_blocked_to_warp_striped failed\n";
      std::ostream_iterator<int> Iter(std::cout, ", ");
      std::copy(d_data, d_data + 512, Iter);
      std::cout << std::endl;
      std::copy(expected, expected + 512, Iter);
      std::cout << std::endl;
      cudaFree(d_data);
      return false;
    }
  }
  cudaFree(d_data);
  std::cout << "test_blocked_to_warp_striped pass\n";
  return true;
}
332+
// Host-side check for WarpStripedToBlockedKernel.
// Seeds d_data with the warp-striped arrangement that BlockedToWarpStriped
// would produce from 0..511 (element j of warp w, lane l = w*128 + j*32 + l),
// runs the inverse kernel, and verifies the output is back to 0..511.
// Fixes vs. original: (1) d_data (cudaMallocManaged) was leaked on both
// return paths — now released with cudaFree; (2) the failure diagnostic
// printed only d_data — it now also prints expected, matching the sibling
// test_blocked_to_warp_striped.
bool test_warp_striped_to_blocked() {
  int *d_data, expected[512];
  cudaMallocManaged(&d_data, sizeof(int) * 512);
  size_t warp_id = 0, warp_offset = 0, lane_id = 0;
  for (int i = 0; i < 128; i++) {
    warp_id = i / 32;
    lane_id = i % 32;
    // Each 32-thread warp owns 32 * 4 = 128 consecutive elements.
    warp_offset = warp_id * 32 * 4;
    d_data[4 * i + 0] = warp_offset + lane_id + 0 * 32;
    d_data[4 * i + 1] = warp_offset + lane_id + 1 * 32;
    d_data[4 * i + 2] = warp_offset + lane_id + 2 * 32;
    d_data[4 * i + 3] = warp_offset + lane_id + 3 * 32;
  }
  // CHECK: q_ct1.submit(
  // CHECK-NEXT: [&](sycl::handler &cgh) {
  // CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::exchange<int, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
  // CHECK-EMPTY:
  // CHECK-NEXT: cgh.parallel_for(
  // CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
  // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {{\[\[}}sycl::reqd_sub_group_size(32){{\]\]}} {
  // CHECK-NEXT: WarpStripedToBlockedKernel(d_data, item_ct1, &temp_storage_acc[0]);
  // CHECK-NEXT: });
  // CHECK-NEXT: });
  WarpStripedToBlockedKernel<<<1, 128>>>(d_data);
  cudaDeviceSynchronize();

  for (int i = 0; i < 512; i++) {
    expected[i] = i;
  }

  for (int i = 0; i < 512; ++i) {
    if (expected[i] != d_data[i]) {
      std::cout << "test_warp_striped_to_blocked failed\n";
      std::ostream_iterator<int> Iter(std::cout, ", ");
      std::copy(d_data, d_data + 512, Iter);
      std::cout << std::endl;
      std::copy(expected, expected + 512, Iter);
      std::cout << std::endl;
      cudaFree(d_data);
      return false;
    }
  }
  cudaFree(d_data);
  std::cout << "test_warp_striped_to_blocked pass\n";
  return true;
}
375+
// Runs every exchange test; exits 0 only when all of them pass.
int main() {
  bool all_passed = test_blocked_to_striped() && test_striped_to_blocked() &&
                    test_scatter_to_blocked() && test_scatter_to_striped() &&
                    test_blocked_to_warp_striped() &&
                    test_warp_striped_to_blocked();
  return all_passed ? 0 : 1;
}
0 commit comments