@@ -84,6 +84,34 @@ __global__ void ScatterToStripedKernel(int *d_data, int *d_rank) {
8484 cub::StoreDirectStriped<128 >(threadIdx .x , d_data, thread_data);
8585}
8686
// Migration test for cub::BlockExchange::BlockedToWarpStriped.
// Launch shape expected: <<<1, 128>>>, 4 items per thread. Each thread loads
// 4 consecutive ints (blocked), the exchange rearranges the tile into a
// warp-striped layout via shared memory, and the result is stored blocked.
// The CHECK lines pin the expected dpct/SYCL migration output.
__global__ void BlockedToWarpStripedKernel(int *d_data) {
  // CHECK: typedef dpct::group::exchange<int, 4> BlockExchange;
  // CHECK: int thread_data[4];
  // CHECK: dpct::group::load_direct_blocked(item_ct1, d_data, thread_data);
  // CHECK: BlockExchange(temp_storage).blocked_to_sub_group_striped(item_ct1, thread_data, thread_data);
  // CHECK: dpct::group::store_direct_blocked(item_ct1, d_data, thread_data);
  typedef cub::BlockExchange<int, 128, 4> BlockExchange;
  __shared__ typename BlockExchange::TempStorage temp_storage;
  int thread_data[4];
  cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);
  BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
  cub::StoreDirectBlocked(threadIdx.x, d_data, thread_data);
}
100+
// Migration test for cub::BlockExchange::WarpStripedToBlocked — the inverse
// of BlockedToWarpStriped. Launch shape expected: <<<1, 128>>>, 4 items per
// thread: load blocked, convert the warp-striped tile back to blocked order,
// store blocked. The CHECK lines pin the expected dpct/SYCL migration output.
__global__ void WarpStripedToBlockedKernel(int *d_data) {
  // CHECK: typedef dpct::group::exchange<int, 4> BlockExchange;
  // CHECK: int thread_data[4];
  // CHECK: dpct::group::load_direct_blocked(item_ct1, d_data, thread_data);
  // CHECK: BlockExchange(temp_storage).sub_group_striped_to_blocked(item_ct1, thread_data, thread_data);
  // CHECK: dpct::group::store_direct_blocked(item_ct1, d_data, thread_data);
  typedef cub::BlockExchange<int, 128, 4> BlockExchange;
  __shared__ typename BlockExchange::TempStorage temp_storage;
  int thread_data[4];
  cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);
  BlockExchange(temp_storage).WarpStripedToBlocked(thread_data, thread_data);
  cub::StoreDirectBlocked(threadIdx.x, d_data, thread_data);
}
114+
87115bool test_striped_to_blocked () {
88116 int *d_data;
89117 cudaMallocManaged (&d_data, sizeof (int ) * 512 );
@@ -257,7 +285,96 @@ bool test_scatter_to_striped() {
257285 return true ;
258286}
259287
// Host-side check for BlockedToWarpStripedKernel.
// Seeds d_data with 0..511 (blocked per thread), runs the kernel, then
// verifies the blocked store of the warp-striped arrangement: element j of
// thread (warp w, lane l) must be w*128 + j*32 + l.
// Fix vs. original: d_data (cudaMallocManaged) was leaked on both the pass
// and the fail path; it is now released with cudaFree before every return.
bool test_blocked_to_warp_striped() {
  int *d_data, expected[512];
  cudaMallocManaged(&d_data, sizeof(int) * 512);
  for (int i = 0; i < 512; ++i)
    d_data[i] = i;

  // CHECK: q_ct1.submit(
  // CHECK-NEXT: [&](sycl::handler &cgh) {
  // CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::exchange<int, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
  // CHECK-EMPTY:
  // CHECK-NEXT: cgh.parallel_for(
  // CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
  // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {{\[\[}}sycl::reqd_sub_group_size(32){{\]\]}} {
  // CHECK-NEXT: BlockedToWarpStripedKernel(d_data, item_ct1, &temp_storage_acc[0]);
  // CHECK-NEXT: });
  // CHECK-NEXT: });
  BlockedToWarpStripedKernel<<<1, 128>>>(d_data);
  cudaDeviceSynchronize();
  size_t warp_id = 0, warp_offset = 0, lane_id = 0;
  for (int i = 0; i < 128; i++) {
    warp_id = i / 32;
    lane_id = i % 32;
    // Each 32-thread warp owns 32 * 4 = 128 consecutive input elements.
    warp_offset = warp_id * 32 * 4;
    expected[4 * i + 0] = warp_offset + lane_id + 0 * 32;
    expected[4 * i + 1] = warp_offset + lane_id + 1 * 32;
    expected[4 * i + 2] = warp_offset + lane_id + 2 * 32;
    expected[4 * i + 3] = warp_offset + lane_id + 3 * 32;
  }

  for (int i = 0; i < 512; ++i) {
    if (expected[i] != d_data[i]) {
      std::cout << "test_blocked_to_warp_striped failed\n";
      std::ostream_iterator<int> Iter(std::cout, ", ");
      std::copy(d_data, d_data + 512, Iter);
      std::cout << std::endl;
      std::copy(expected, expected + 512, Iter);
      std::cout << std::endl;
      cudaFree(d_data);
      return false;
    }
  }
  cudaFree(d_data);
  std::cout << "test_blocked_to_warp_striped pass\n";
  return true;
}
332+
// Host-side check for WarpStripedToBlockedKernel.
// Seeds d_data with the warp-striped arrangement that BlockedToWarpStriped
// would produce from 0..511 (element j of warp w, lane l = w*128 + j*32 + l),
// runs the inverse kernel, and verifies the output is back to 0..511.
// Fixes vs. original: (1) d_data (cudaMallocManaged) was leaked on both
// return paths — now released with cudaFree; (2) the failure diagnostic
// printed only d_data — it now also prints expected, matching the sibling
// test_blocked_to_warp_striped.
bool test_warp_striped_to_blocked() {
  int *d_data, expected[512];
  cudaMallocManaged(&d_data, sizeof(int) * 512);
  size_t warp_id = 0, warp_offset = 0, lane_id = 0;
  for (int i = 0; i < 128; i++) {
    warp_id = i / 32;
    lane_id = i % 32;
    // Each 32-thread warp owns 32 * 4 = 128 consecutive elements.
    warp_offset = warp_id * 32 * 4;
    d_data[4 * i + 0] = warp_offset + lane_id + 0 * 32;
    d_data[4 * i + 1] = warp_offset + lane_id + 1 * 32;
    d_data[4 * i + 2] = warp_offset + lane_id + 2 * 32;
    d_data[4 * i + 3] = warp_offset + lane_id + 3 * 32;
  }
  // CHECK: q_ct1.submit(
  // CHECK-NEXT: [&](sycl::handler &cgh) {
  // CHECK-NEXT: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::exchange<int, 4>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
  // CHECK-EMPTY:
  // CHECK-NEXT: cgh.parallel_for(
  // CHECK-NEXT: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
  // CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {{\[\[}}sycl::reqd_sub_group_size(32){{\]\]}} {
  // CHECK-NEXT: WarpStripedToBlockedKernel(d_data, item_ct1, &temp_storage_acc[0]);
  // CHECK-NEXT: });
  // CHECK-NEXT: });
  WarpStripedToBlockedKernel<<<1, 128>>>(d_data);
  cudaDeviceSynchronize();

  for (int i = 0; i < 512; i++) {
    expected[i] = i;
  }

  for (int i = 0; i < 512; ++i) {
    if (expected[i] != d_data[i]) {
      std::cout << "test_warp_striped_to_blocked failed\n";
      std::ostream_iterator<int> Iter(std::cout, ", ");
      std::copy(d_data, d_data + 512, Iter);
      std::cout << std::endl;
      std::copy(expected, expected + 512, Iter);
      std::cout << std::endl;
      cudaFree(d_data);
      return false;
    }
  }
  cudaFree(d_data);
  std::cout << "test_warp_striped_to_blocked pass\n";
  return true;
}
375+
// Runs every exchange test; exits 0 only when all of them pass.
int main() {
  bool all_passed = test_blocked_to_striped() && test_striped_to_blocked() &&
                    test_scatter_to_blocked() && test_scatter_to_striped() &&
                    test_blocked_to_warp_striped() &&
                    test_warp_striped_to_blocked();
  return all_passed ? 0 : 1;
}
0 commit comments