@@ -127,12 +127,10 @@ int main(int argc, char *argv[])
         t_start = MPI_Wtime();
     }
 
-    for (int i = 0; i < Niter; ++i)
-    {
-        MPI_Win cwin = win[(i + 1) % 2];
-        double *a = A_device[i % 2];
-        double *a_out = A_device[(i + 1) % 2];
+    int iterations_batch = (NormIteration <= 0) ? Niter : NormIteration;
+    for (passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
 
+        /* Submit compute kernel to calculate next "iterations_batch" steps */
         q.submit([&](auto &h) {
             h.parallel_for(sycl::nd_range<1>(work_group_size, work_group_size),
                            [=](sycl::nd_item<1> item) {
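Note on the hunk above: the rewritten host loop advances passed_iters by a full iterations_batch each pass, and the kernel (next hunk) always executes exactly iterations_batch sweeps per submission, so the code implicitly assumes Niter is a multiple of iterations_batch (that is, of NormIteration). A minimal host-side sketch of a clamped variant, with hypothetical sample values, in case that assumption does not hold:

    #include <algorithm>

    int main() {
        const int Niter = 10, NormIteration = 4;  /* assumed sample values */
        int iterations_batch = (NormIteration <= 0) ? Niter : NormIteration;
        for (int passed_iters = 0; passed_iters < Niter; passed_iters += iterations_batch) {
            /* Clamp so the last pass stops exactly at Niter instead of overshooting. */
            int this_batch = std::min(iterations_batch, Niter - passed_iters);
            for (int k = 0; k < this_batch; ++k) {
                int i = passed_iters + k;  /* global iteration index, as in the kernel */
                (void)i;                   /* one Jacobi sweep would run here */
            }
        }
        return 0;
    }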
@@ -143,52 +141,66 @@ int main(int argc, char *argv[])
                 int my_x_lb = col_per_wg * local_id;
                 int my_x_ub = my_x_lb + col_per_wg;
 
-                /* Calculate values on borders to initiate communications early */
-                for (int column = my_x_lb; column < my_x_ub; column++) {
-                    int idx = XY_2_IDX(column, 0, my_subarray);
-                    a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
-                                         + a[idx - ROW_SIZE(my_subarray)]
-                                         + a[idx + ROW_SIZE(my_subarray)]);
-                    idx = XY_2_IDX(column, my_subarray.y_size - 1, my_subarray);
-                    a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
-                                         + a[idx - ROW_SIZE(my_subarray)]
-                                         + a[idx + ROW_SIZE(my_subarray)]);
+                for (int k = 0; k < iterations_batch; ++k)
+                {
+                    int i = passed_iters + k;
+                    MPI_Win cwin = win[(i + 1) % 2];
+                    double *a = A_device[i % 2];
+                    double *a_out = A_device[(i + 1) % 2];
+                    /* Calculate values on borders to initiate communications early */
+                    for (int column = my_x_lb; column < my_x_ub; column++) {
+                        int idx = XY_2_IDX(column, 0, my_subarray);
+                        a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
+                                             + a[idx - ROW_SIZE(my_subarray)]
+                                             + a[idx + ROW_SIZE(my_subarray)]);
+                        idx = XY_2_IDX(column, my_subarray.y_size - 1, my_subarray);
+                        a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
+                                             + a[idx - ROW_SIZE(my_subarray)]
+                                             + a[idx + ROW_SIZE(my_subarray)]);
+                    }
+
+                    item.barrier(sycl::access::fence_space::global_space);
+                    if (local_id == 0) {
+                        /* Perform 1D halo-exchange with neighbours */
+                        if (my_subarray.rank != 0) {
+                            int idx = XY_2_IDX(0, 0, my_subarray);
+                            MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
+                                    my_subarray.rank - 1, my_subarray.l_nbh_offt,
+                                    my_subarray.x_size, MPI_DOUBLE, cwin);
+                        }
+
+                        if (my_subarray.rank != (my_subarray.comm_size - 1)) {
+                            int idx = XY_2_IDX(0, my_subarray.y_size - 1, my_subarray);
+                            MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
+                                    my_subarray.rank + 1, 1,
+                                    my_subarray.x_size, MPI_DOUBLE, cwin);
+                        }
+                    }
+
+                    /* Recalculate internal points in parallel with communications */
+                    for (int row = 1; row < my_subarray.y_size - 1; ++row) {
+                        for (int column = my_x_lb; column < my_x_ub; column++) {
+                            int idx = XY_2_IDX(column, row, my_subarray);
+                            a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
+                                                 + a[idx - ROW_SIZE(my_subarray)]
+                                                 + a[idx + ROW_SIZE(my_subarray)]);
+                        }
+                    }
+                    item.barrier(sycl::access::fence_space::global_space);
+                    /* Ensure all communications complete before next iteration */
+                    if (local_id == 0) {
+                        MPI_Win_fence(0, cwin);
+                    }
+                    item.barrier(sycl::access::fence_space::global_space);
                 }
-
-                item.barrier(sycl::access::fence_space::global_space);
-                if (local_id == 0) {
-                    /* Perform 1D halo-exchange with neighbours */
-                    if (my_subarray.rank != 0) {
-                        int idx = XY_2_IDX(0, 0, my_subarray);
-                        MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
-                                my_subarray.rank - 1, my_subarray.l_nbh_offt,
-                                my_subarray.x_size, MPI_DOUBLE, cwin);
-                    }
-
-                    if (my_subarray.rank != (my_subarray.comm_size - 1)) {
-                        int idx = XY_2_IDX(0, my_subarray.y_size - 1, my_subarray);
-                        MPI_Put(&a_out[idx], my_subarray.x_size, MPI_DOUBLE,
-                                my_subarray.rank + 1, 1,
-                                my_subarray.x_size, MPI_DOUBLE, cwin);
-                    }
-                }
-
-                /* Recalculate internal points in parallel with communications */
-                for (int row = 1; row < my_subarray.y_size - 1; ++row) {
-                    for (int column = my_x_lb; column < my_x_ub; column++) {
-                        int idx = XY_2_IDX(column, row, my_subarray);
-                        a_out[idx] = 0.25 * (a[idx - 1] + a[idx + 1]
-                                             + a[idx - ROW_SIZE(my_subarray)]
-                                             + a[idx + ROW_SIZE(my_subarray)]);
-                    }
-                }
-                item.barrier(sycl::access::fence_space::global_space);
             });
         }).wait();
 
+
         /* Calculate and report norm value after given number of iterations */
         if ((NormIteration > 0) && ((NormIteration - 1) == i % NormIteration)) {
             double rank_norm = 0.0;
+
             {
                 sycl::buffer<double> norm_buf(&rank_norm, 1);
                 q.submit([&](auto &h) {
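The kernel in the hunk above follows a leader-work-item pattern: every work-item computes its border columns, a group-wide barrier makes those writes visible, work-item 0 alone issues the device-initiated MPI calls (MPI_Put, then MPI_Win_fence), and further barriers order the next batched sweep. The following self-contained SYCL-only sketch shows just that barrier/leader skeleton, with a plain flag write standing in for the MPI calls; it is illustrative only, not part of the patch, and assumes a device with shared USM support:

    #include <sycl/sycl.hpp>
    #include <cstdio>

    int main() {
        constexpr int wg = 8;
        sycl::queue q;
        int *flag = sycl::malloc_shared<int>(1, q);
        *flag = 0;
        q.parallel_for(sycl::nd_range<1>(wg, wg), [=](sycl::nd_item<1> item) {
            /* ...each work-item would compute its border cells here... */
            item.barrier(sycl::access::fence_space::global_space);
            if (item.get_local_id(0) == 0) {
                *flag = 1;  /* leader-only side effect (MPI_Put in the patch) */
            }
            /* ...interior computation overlaps the leader's communication... */
            item.barrier(sycl::access::fence_space::global_space);
        }).wait();
        printf("flag = %d\n", *flag);
        sycl::free(flag, q);
        return 0;
    }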
@@ -204,12 +216,9 @@ int main(int argc, char *argv[])
             /* Get global norm value */
             MPI_Reduce(&rank_norm, &norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
             if (my_subarray.rank == 0) {
-                printf("NORM value on iteration %d: %f\n", passed_iters + batch_iters + 1, sqrt(norm));
+                printf("NORM value on iteration %d: %f\n", i + 1, sqrt(norm));
             }
-            rank_norm = 0.0;
         }
-        /* Ensure all communications complete before next iteration */
-        MPI_Win_fence(0, cwin);
     }
 
     if (PrintTime) {
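For context, the norm reporting touched by the last hunk reduces each rank's local sum of squared residuals into rank 0, which prints the square root as the global norm. A minimal MPI-only sketch of that reduction step, with a stand-in local value instead of the device-computed residual sum:

    #include <mpi.h>
    #include <math.h>
    #include <stdio.h>

    int main(int argc, char *argv[]) {
        MPI_Init(&argc, &argv);
        int rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        double rank_norm = 1.0;  /* stand-in for the per-rank residual sum */
        double norm = 0.0;
        MPI_Reduce(&rank_norm, &norm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (rank == 0)
            printf("NORM value: %f\n", sqrt(norm));
        MPI_Finalize();
        return 0;
    }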