xref: /petsc/src/ksp/ksp/tests/raja/ex1.raja.cxx (revision df341386820a1dd57736366587aa63ba75afe475)
1 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
2 // Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
3 // and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
4 //
5 // SPDX-License-Identifier: (BSD-3-Clause)
6 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
7 
8 #include <cstdlib>
9 #include <cstdio>
10 #include <cstring>
11 
12 #include <iostream>
13 #include <cmath>
14 
15 #include "RAJA/RAJA.hpp"
16 
17 #include "memoryManager.hpp"
18 
19 /*
20  * Jacobi Example
21  *
22  * ----[Details]--------------------
23  * This code uses a five point finite difference stencil
24  * to discretize the following boundary value problem
25  *
26  * U_xx + U_yy = f on [0,1] x [0,1].
27  *
28  * The right-hand side is chosen to be
29  * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
30  *
31  * A structured grid is used to discretize the domain
32  * [0,1] x [0,1]. Values inside the domain are computed
33  * using the Jacobi method to solve the associated
34  * linear system. The scheme is invoked until the l_2
35  * difference of subsequent iterations is below a
36  * tolerance.
37  *
38  * The scheme is implemented by allocating two arrays
39  * (I, Iold) and initialized to zero. The first set of
40  * nested for loops apply an iteration of the Jacobi
41  * scheme. The scheme is only applied to the interior
42  * nodes.
43  *
44  * The second set of nested for loops is used to
45  * update Iold and compute the l_2 norm of the
46  * difference of the iterates.
47  *
48  * Computing the l_2 norm requires a reduction operation.
49  * To simplify the reduction procedure, the RAJA API
50  * introduces thread safe variables.
51  *
52  * ----[RAJA Concepts]---------------
53  * - Forall::nested loop
54  * - RAJA Reduction
55  *
56  */
57 
58 /*
59  *  ----[Constant Values]-----
60  * CUDA_BLOCK_SIZE_X - Number of threads in the
61  *                     x-dimension of a cuda thread block
62  *
63  * CUDA_BLOCK_SIZE_Y - Number of threads in the
64  *                     y-dimension of a cuda thread block
65  *
66  * CUDA_BLOCK_SIZE   - Number of threads per threads block
67 */
68 #if defined(RAJA_ENABLE_CUDA)
69 const int CUDA_BLOCK_SIZE = 256;
70 #endif
71 
72 #if defined(RAJA_ENABLE_HIP)
73 const int HIP_BLOCK_SIZE = 256;
74 #endif
75 
76 //
77 //  Struct to hold grid info
78 //  o - Origin in a cartesian dimension
79 //  h - Spacing between grid points
80 //  n - Number of grid points
81 //
82 struct grid_s {
83   double o, h;
84   int    n;
85 };
86 
87 //
88 // ----[Functions]---------
89 // solution   - Function for the analytic solution
90 // computeErr - Displays the maximum error in the solution
91 //
92 double solution(double x, double y);
93 void   computeErr(double *I, grid_s grid);
94 
95 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) {
96   std::cout << "Jacobi Example" << std::endl;
97 
98   /*
99    * ----[Solver Parameters]------------
100    * tol       - Method terminates once the norm is less than tol
101    * N         - Number of unknown gridpoints per cartesian dimension
102    * NN        - Total number of gridpoints on the grid
103    * maxIter   - Maximum number of iterations to be taken
104    *
105    * resI2     - Residual
106    * iteration - Iteration number
107    * grid_s    - Struct with grid information for a cartesian dimension
108   */
109   double tol = 1e-10;
110 
111   int N       = 50;
112   int NN      = (N + 2) * (N + 2);
113   int maxIter = 100000;
114 
115   double resI2;
116   int    iteration;
117 
118   grid_s gridx;
119   gridx.o = 0.0;
120   gridx.h = 1.0 / (N + 1.0);
121   gridx.n = N + 2;
122 
123   //
124   //I, Iold - Holds iterates of Jacobi method
125   //
126   double *I    = memoryManager::allocate<double>(NN);
127   double *Iold = memoryManager::allocate<double>(NN);
128 
129   memset(I, 0, NN * sizeof(double));
130   memset(Iold, 0, NN * sizeof(double));
131 
132   printf("Standard  C++ Loop \n");
133   resI2     = 1;
134   iteration = 0;
135 
136   while (resI2 > tol * tol) {
137     //
138     // Jacobi Iteration
139     //
140     for (int n = 1; n <= N; ++n) {
141       for (int m = 1; m <= N; ++m) {
142         double x = gridx.o + m * gridx.h;
143         double y = gridx.o + n * gridx.h;
144 
145         double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
146 
147         int id = n * (N + 2) + m;
148         I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
149       }
150     }
151 
152     //
153     // Compute residual and update Iold
154     //
155     resI2 = 0.0;
156     for (int k = 0; k < NN; k++) {
157       resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
158       Iold[k] = I[k];
159     }
160 
161     if (iteration > maxIter) {
162       printf("Standard C++ Loop - Maxed out on iterations \n");
163       exit(-1);
164     }
165 
166     iteration++;
167   }
168   computeErr(I, gridx);
169   printf("No of iterations: %d \n \n", iteration);
170 
171   //
172   // RAJA loop calls may be shortened by predefining policies
173   //
174   RAJA::RangeSegment gridRange(0, NN);
175   RAJA::RangeSegment jacobiRange(1, (N + 1));
176 
177   using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
178 
179   printf("RAJA: Sequential Policy - Nested ForallN \n");
180   resI2     = 1;
181   iteration = 0;
182   memset(I, 0, NN * sizeof(double));
183   memset(Iold, 0, NN * sizeof(double));
184 
185   /*
186    *  Sequential Jacobi Iteration.
187    *
188    *  Note that a RAJA ReduceSum object is used to accumulate the sum
189    *  for the residual. Since the loop is run sequentially, this is
190    *  not strictly necessary. It is done here for consistency and
191    *  comparison with other RAJA variants in this example.
192    */
193   while (resI2 > tol * tol) {
194     RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
195       double x = gridx.o + m * gridx.h;
196       double y = gridx.o + n * gridx.h;
197 
198       double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
199 
200       int id = n * (N + 2) + m;
201       I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
202     });
203 
204     RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
205     RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) {
206       RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
207       Iold[k] = I[k];
208     });
209 
210     resI2 = RAJA_resI2;
211     if (iteration > maxIter) {
212       printf("Jacobi: Sequential - Maxed out on iterations! \n");
213       exit(-1);
214     }
215     iteration++;
216   }
217   computeErr(I, gridx);
218   printf("No of iterations: %d \n \n", iteration);
219 
220 #if defined(RAJA_ENABLE_OPENMP)
221   printf("RAJA: OpenMP Policy - Nested ForallN \n");
222   resI2     = 1;
223   iteration = 0;
224   memset(I, 0, NN * sizeof(double));
225   memset(Iold, 0, NN * sizeof(double));
226 
227   /*
228    *  OpenMP parallel Jacobi Iteration.
229    *
230    *  ----[RAJA Policies]-----------
231    *  RAJA::omp_collapse_for_exec -
232    *  introduced a nested region
233    *
234    *  Note that OpenMP RAJA ReduceSum object performs the reduction
235    *  operation for the residual in a thread-safe manner.
236    */
237 
238   using jacobiOmpNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
239 
240   while (resI2 > tol * tol) {
241     RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
242       double x = gridx.o + m * gridx.h;
243       double y = gridx.o + n * gridx.h;
244 
245       double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
246 
247       int id = n * (N + 2) + m;
248       I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
249     });
250 
251     RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);
252 
253     RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) {
254       RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
255       Iold[k] = I[k];
256     });
257 
258     resI2 = RAJA_resI2;
259     if (iteration > maxIter) {
260       printf("Jacobi: OpenMP - Maxed out on iterations! \n");
261       exit(-1);
262     }
263     iteration++;
264   }
265   computeErr(I, gridx);
266   printf("No of iterations: %d \n \n", iteration);
267 #endif
268 
269 #if defined(RAJA_ENABLE_CUDA)
270   /*
271    *  CUDA Jacobi Iteration.
272    *
273    *  ----[RAJA Policies]-----------
274    *  RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
275    *  define the mapping of loop iterations to GPU thread blocks
276    *
277    *  Note that CUDA RAJA ReduceSum object performs the reduction
278    *  operation for the residual in a thread-safe manner on the GPU.
279    */
280 
281   printf("RAJA: CUDA Policy - Nested ForallN \n");
282 
283   using jacobiCUDANestedPolicy = RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
284 
285   resI2     = 1;
286   iteration = 0;
287   memset(I, 0, NN * sizeof(double));
288   memset(Iold, 0, NN * sizeof(double));
289 
290   while (resI2 > tol * tol) {
291     //
292     // Jacobi Iteration
293     //
294     RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
295       double x = gridx.o + m * gridx.h;
296       double y = gridx.o + n * gridx.h;
297 
298       double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
299 
300       int id = n * (N + 2) + m;
301       I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
302     });
303 
304     //
305     // Compute residual and update Iold
306     //
307     RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
308     RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
309       RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
310       Iold[k] = I[k];
311     });
312 
313     resI2 = RAJA_resI2;
314 
315     if (iteration > maxIter) {
316       printf("RAJA: CUDA - Maxed out on iterations! \n");
317       exit(-1);
318     }
319     iteration++;
320   }
321   cudaDeviceSynchronize();
322   computeErr(I, gridx);
323   printf("No of iterations: %d \n \n", iteration);
324 #endif
325 
326 #if defined(RAJA_ENABLE_HIP)
327   /*
328    *  HIP Jacobi Iteration.
329    *
330    *  ----[RAJA Policies]-----------
331    *  RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
332    *  define the mapping of loop iterations to GPU thread blocks
333    *
334    *  Note that HIP RAJA ReduceSum object performs the reduction
335    *  operation for the residual in a thread-safe manner on the GPU.
336    */
337 
338   printf("RAJA: HIP Policy - Nested ForallN \n");
339 
340   using jacobiHIPNestedPolicy = RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
341 
342   resI2     = 1;
343   iteration = 0;
344   memset(I, 0, NN * sizeof(double));
345   memset(Iold, 0, NN * sizeof(double));
346 
347   double *d_I    = memoryManager::allocate_gpu<double>(NN);
348   double *d_Iold = memoryManager::allocate_gpu<double>(NN);
349   hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
350   hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));
351 
352   while (resI2 > tol * tol) {
353     //
354     // Jacobi Iteration
355     //
356     RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
357       double x = gridx.o + m * gridx.h;
358       double y = gridx.o + n * gridx.h;
359 
360       double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
361 
362       int id  = n * (N + 2) + m;
363       d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]);
364     });
365 
366     //
367     // Compute residual and update Iold
368     //
369     RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
370     RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
371       RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
372       d_Iold[k] = d_I[k];
373     });
374 
375     resI2 = RAJA_resI2;
376 
377     if (iteration > maxIter) {
378       printf("RAJA: HIP - Maxed out on iterations! \n");
379       exit(-1);
380     }
381     iteration++;
382   }
383   hipDeviceSynchronize();
384   hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
385   computeErr(I, gridx);
386   printf("No of iterations: %d \n \n", iteration);
387 
388   memoryManager::deallocate_gpu(d_I);
389   memoryManager::deallocate_gpu(d_Iold);
390 #endif
391 
392   memoryManager::deallocate(I);
393   memoryManager::deallocate(Iold);
394 
395   return 0;
396 }
397 
398 //
399 // Function for the anlytic solution
400 //
401 double solution(double x, double y) {
402   return x * y * exp(x - y) * (1 - x) * (1 - y);
403 }
404 
405 //
406 // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
407 //
408 void computeErr(double *I, grid_s grid) {
409   RAJA::RangeSegment                        gridRange(0, grid.n);
410   RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);
411 
412   using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
413 
414   RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange), [=](RAJA::Index_type ty, RAJA::Index_type tx) {
415     int    id    = tx + grid.n * ty;
416     double x     = grid.o + tx * grid.h;
417     double y     = grid.o + ty * grid.h;
418     double myErr = std::abs(I[id] - solution(x, y));
419     tMax.max(myErr);
420   });
421 
422   double l2err = tMax;
423   printf("Max error = %lg, h = %f \n", l2err, grid.h);
424 }
425 
426 /*TEST
427 
428     test:
429       requires: raja !cuda
430 
431     test:
432       suffix: 2
433       requires: raja cuda
434 
435 TEST*/
436