xref: /petsc/src/ksp/ksp/tests/raja/ex1.raja.cxx (revision 4e6a9dc06e4300e5638c1516c9d48be5c9d23df2)
1 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
2 // Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
3 // and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
4 //
5 // SPDX-License-Identifier: (BSD-3-Clause)
6 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
7 
8 #include <cstdlib>
9 #include <cstdio>
10 #include <cstring>
11 
12 #include <iostream>
13 #include <cmath>
14 
15 #include "RAJA/RAJA.hpp"
16 
17 #include "memoryManager.hpp"
18 
19 /*
20  * Jacobi Example
21  *
22  * ----[Details]--------------------
23  * This code uses a five point finite difference stencil
24  * to discretize the following boundary value problem
25  *
26  * U_xx + U_yy = f on [0,1] x [0,1].
27  *
28  * The right-hand side is chosen to be
29  * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
30  *
31  * A structured grid is used to discretize the domain
32  * [0,1] x [0,1]. Values inside the domain are computed
33  * using the Jacobi method to solve the associated
34  * linear system. The scheme is invoked until the l_2
35  * difference of subsequent iterations is below a
36  * tolerance.
37  *
38  * The scheme is implemented by allocating two arrays
39  * (I, Iold) and initialized to zero. The first set of
40  * nested for loops apply an iteration of the Jacobi
41  * scheme. The scheme is only applied to the interior
42  * nodes.
43  *
44  * The second set of nested for loops is used to
45  * update Iold and compute the l_2 norm of the
46  * difference of the iterates.
47  *
48  * Computing the l_2 norm requires a reduction operation.
49  * To simplify the reduction procedure, the RAJA API
50  * introduces thread safe variables.
51  *
52  * ----[RAJA Concepts]---------------
53  * - Forall::nested loop
54  * - RAJA Reduction
55  *
56  */
57 
58 /*
59  *  ----[Constant Values]-----
60  * CUDA_BLOCK_SIZE_X - Number of threads in the
61  *                     x-dimension of a cuda thread block
62  *
63  * CUDA_BLOCK_SIZE_Y - Number of threads in the
64  *                     y-dimension of a cuda thread block
65  *
66  * CUDA_BLOCK_SIZE   - Number of threads per threads block
67 */
68 #if defined(RAJA_ENABLE_CUDA)
69 const int CUDA_BLOCK_SIZE = 256;
70 #endif
71 
72 #if defined(RAJA_ENABLE_HIP)
73 const int HIP_BLOCK_SIZE = 256;
74 #endif
75 
76 //
77 //  Struct to hold grid info
78 //  o - Origin in a cartesian dimension
79 //  h - Spacing between grid points
80 //  n - Number of grid points
81 //
82 struct grid_s {
83   double o, h;
84   int n;
85 };
86 
87 //
88 // ----[Functions]---------
89 // solution   - Function for the analytic solution
90 // computeErr - Displays the maximum error in the solution
91 //
92 double solution(double x, double y);
93 void computeErr(double *I, grid_s grid);
94 
95 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
96 {
97 
98   std::cout<<"Jacobi Example"<<std::endl;
99 
100   /*
101    * ----[Solver Parameters]------------
102    * tol       - Method terminates once the norm is less than tol
103    * N         - Number of unknown gridpoints per cartesian dimension
104    * NN        - Total number of gridpoints on the grid
105    * maxIter   - Maximum number of iterations to be taken
106    *
107    * resI2     - Residual
108    * iteration - Iteration number
109    * grid_s    - Struct with grid information for a cartesian dimension
110   */
111   double tol = 1e-10;
112 
113   int N = 50;
114   int NN = (N + 2) * (N + 2);
115   int maxIter = 100000;
116 
117   double resI2;
118   int iteration;
119 
120   grid_s gridx;
121   gridx.o = 0.0;
122   gridx.h = 1.0 / (N + 1.0);
123   gridx.n = N + 2;
124 
125   //
126   //I, Iold - Holds iterates of Jacobi method
127   //
128   double *I = memoryManager::allocate<double>(NN);
129   double *Iold = memoryManager::allocate<double>(NN);
130 
131   memset(I, 0, NN * sizeof(double));
132   memset(Iold, 0, NN * sizeof(double));
133 
134   printf("Standard  C++ Loop \n");
135   resI2 = 1;
136   iteration = 0;
137 
138   while (resI2 > tol * tol) {
139 
140     //
141     // Jacobi Iteration
142     //
143     for (int n = 1; n <= N; ++n) {
144       for (int m = 1; m <= N; ++m) {
145 
146         double x = gridx.o + m * gridx.h;
147         double y = gridx.o + n * gridx.h;
148 
149         double f = gridx.h * gridx.h
150                    * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
151 
152         int id = n * (N + 2) + m;
153         I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1]
154                            + Iold[id + 1]);
155       }
156     }
157 
158     //
159     // Compute residual and update Iold
160     //
161     resI2 = 0.0;
162     for (int k = 0; k < NN; k++) {
163       resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
164       Iold[k] = I[k];
165     }
166 
167     if (iteration > maxIter) {
168       printf("Standard C++ Loop - Maxed out on iterations \n");
169       exit(-1);
170     }
171 
172     iteration++;
173   }
174   computeErr(I, gridx);
175   printf("No of iterations: %d \n \n", iteration);
176 
177   //
178   // RAJA loop calls may be shortened by predefining policies
179   //
180   RAJA::RangeSegment gridRange(0, NN);
181   RAJA::RangeSegment jacobiRange(1, (N + 1));
182 
183   using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
184   RAJA::statement::For<1, RAJA::seq_exec,
185     RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >;
186 
187   printf("RAJA: Sequential Policy - Nested ForallN \n");
188   resI2 = 1;
189   iteration = 0;
190   memset(I, 0, NN * sizeof(double));
191   memset(Iold, 0, NN * sizeof(double));
192 
193   /*
194    *  Sequential Jacobi Iteration.
195    *
196    *  Note that a RAJA ReduceSum object is used to accumulate the sum
197    *  for the residual. Since the loop is run sequentially, this is
198    *  not strictly necessary. It is done here for consistency and
199    *  comparison with other RAJA variants in this example.
200    */
201   while (resI2 > tol * tol) {
202 
203     RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange,jacobiRange),
204                          [=] (RAJA::Index_type m, RAJA::Index_type n) {
205 
206           double x = gridx.o + m * gridx.h;
207           double y = gridx.o + n * gridx.h;
208 
209           double f = gridx.h * gridx.h
210                      * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
211 
212           int id = n * (N + 2) + m;
213           I[id] =
214                0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1]
215                           + Iold[id + 1]);
216         });
217 
218     RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
219     RAJA::forall<RAJA::seq_exec>(
220       gridRange, [=](RAJA::Index_type k) {
221 
222         RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
223         Iold[k] = I[k];
224 
225       });
226 
227     resI2 = RAJA_resI2;
228     if (iteration > maxIter) {
229       printf("Jacobi: Sequential - Maxed out on iterations! \n");
230       exit(-1);
231     }
232     iteration++;
233   }
234   computeErr(I, gridx);
235   printf("No of iterations: %d \n \n", iteration);
236 
237 #if defined(RAJA_ENABLE_OPENMP)
238   printf("RAJA: OpenMP Policy - Nested ForallN \n");
239   resI2 = 1;
240   iteration = 0;
241   memset(I, 0, NN * sizeof(double));
242   memset(Iold, 0, NN * sizeof(double));
243 
244   /*
245    *  OpenMP parallel Jacobi Iteration.
246    *
247    *  ----[RAJA Policies]-----------
248    *  RAJA::omp_collapse_for_exec -
249    *  introduced a nested region
250    *
251    *  Note that OpenMP RAJA ReduceSum object performs the reduction
252    *  operation for the residual in a thread-safe manner.
253    */
254 
255   using jacobiOmpNestedPolicy = RAJA::KernelPolicy<
256       RAJA::statement::For<1, RAJA::omp_parallel_for_exec,
257         RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >;
258 
259   while (resI2 > tol * tol) {
260 
261     RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange,jacobiRange),
262                          [=] (RAJA::Index_type m, RAJA::Index_type n) {
263 
264       double x = gridx.o + m * gridx.h;
265       double y = gridx.o + n * gridx.h;
266 
267       double f = gridx.h * gridx.h *
268                  (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
269 
270       int id = n * (N + 2) + m;
271       I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] +
272                            Iold[id - 1] + Iold[id + 1]);
273     });
274 
275     RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);
276 
277     RAJA::forall<RAJA::omp_parallel_for_exec>( gridRange,
278       [=](RAJA::Index_type k) {
279 
280       RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
281       Iold[k] = I[k];
282 
283     });
284 
285     resI2 = RAJA_resI2;
286     if (iteration > maxIter) {
287       printf("Jacobi: OpenMP - Maxed out on iterations! \n");
288       exit(-1);
289     }
290     iteration++;
291   }
292   computeErr(I, gridx);
293   printf("No of iterations: %d \n \n", iteration);
294 #endif
295 
296 #if defined(RAJA_ENABLE_CUDA)
297   /*
298    *  CUDA Jacobi Iteration.
299    *
300    *  ----[RAJA Policies]-----------
301    *  RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
302    *  define the mapping of loop iterations to GPU thread blocks
303    *
304    *  Note that CUDA RAJA ReduceSum object performs the reduction
305    *  operation for the residual in a thread-safe manner on the GPU.
306    */
307 
308   printf("RAJA: CUDA Policy - Nested ForallN \n");
309 
310   using jacobiCUDANestedPolicy = RAJA::KernelPolicy<
311     RAJA::statement::CudaKernel<
312       RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop,
313         RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop,
314           RAJA::statement::For<1, RAJA::cuda_thread_y_direct,
315             RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
316               RAJA::statement::Lambda<0>
317             >
318           >
319         >
320       >
321     > >;
322 
323   resI2 = 1;
324   iteration = 0;
325   memset(I, 0, NN * sizeof(double));
326   memset(Iold, 0, NN * sizeof(double));
327 
328   while (resI2 > tol * tol) {
329 
330     //
331     // Jacobi Iteration
332     //
333     RAJA::kernel<jacobiCUDANestedPolicy>(
334                          RAJA::make_tuple(jacobiRange,jacobiRange),
335                          [=] RAJA_DEVICE  (RAJA::Index_type m, RAJA::Index_type n) {
336 
337           double x = gridx.o + m * gridx.h;
338           double y = gridx.o + n * gridx.h;
339 
340           double f = gridx.h * gridx.h
341                      * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
342 
343           int id = n * (N + 2) + m;
344           I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1]
345                              + Iold[id + 1]);
346         });
347 
348     //
349     // Compute residual and update Iold
350     //
351     RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
352     RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
353       gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) {
354 
355           RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
356           Iold[k] = I[k];
357 
358       });
359 
360     resI2 = RAJA_resI2;
361 
362     if (iteration > maxIter) {
363       printf("RAJA: CUDA - Maxed out on iterations! \n");
364       exit(-1);
365     }
366     iteration++;
367   }
368   cudaDeviceSynchronize();
369   computeErr(I, gridx);
370   printf("No of iterations: %d \n \n", iteration);
371 #endif
372 
373 #if defined(RAJA_ENABLE_HIP)
374   /*
375    *  HIP Jacobi Iteration.
376    *
377    *  ----[RAJA Policies]-----------
378    *  RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
379    *  define the mapping of loop iterations to GPU thread blocks
380    *
381    *  Note that HIP RAJA ReduceSum object performs the reduction
382    *  operation for the residual in a thread-safe manner on the GPU.
383    */
384 
385   printf("RAJA: HIP Policy - Nested ForallN \n");
386 
387   using jacobiHIPNestedPolicy = RAJA::KernelPolicy<
388     RAJA::statement::HipKernel<
389       RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop,
390         RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop,
391           RAJA::statement::For<1, RAJA::hip_thread_y_direct,
392             RAJA::statement::For<0, RAJA::hip_thread_x_direct,
393               RAJA::statement::Lambda<0>
394             >
395           >
396         >
397       >
398     > >;
399 
400   resI2 = 1;
401   iteration = 0;
402   memset(I, 0, NN * sizeof(double));
403   memset(Iold, 0, NN * sizeof(double));
404 
405   double *d_I    = memoryManager::allocate_gpu<double>(NN);
406   double *d_Iold = memoryManager::allocate_gpu<double>(NN);
407   hipErrchk(hipMemcpy( d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
408   hipErrchk(hipMemcpy( d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));
409 
410   while (resI2 > tol * tol) {
411 
412     //
413     // Jacobi Iteration
414     //
415     RAJA::kernel<jacobiHIPNestedPolicy>(
416                          RAJA::make_tuple(jacobiRange,jacobiRange),
417                          [=] RAJA_DEVICE  (RAJA::Index_type m, RAJA::Index_type n) {
418 
419           double x = gridx.o + m * gridx.h;
420           double y = gridx.o + n * gridx.h;
421 
422           double f = gridx.h * gridx.h
423                      * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
424 
425           int id = n * (N + 2) + m;
426           d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1]
427                              + d_Iold[id + 1]);
428         });
429 
430     //
431     // Compute residual and update Iold
432     //
433     RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
434     RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
435       gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) {
436 
437           RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
438           d_Iold[k] = d_I[k];
439 
440       });
441 
442     resI2 = RAJA_resI2;
443 
444     if (iteration > maxIter) {
445       printf("RAJA: HIP - Maxed out on iterations! \n");
446       exit(-1);
447     }
448     iteration++;
449   }
450   hipDeviceSynchronize();
451   hipErrchk(hipMemcpy( I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
452   computeErr(I, gridx);
453   printf("No of iterations: %d \n \n", iteration);
454 
455   memoryManager::deallocate_gpu(d_I);
456   memoryManager::deallocate_gpu(d_Iold);
457 #endif
458 
459   memoryManager::deallocate(I);
460   memoryManager::deallocate(Iold);
461 
462   return 0;
463 }
464 
465 //
466 // Function for the anlytic solution
467 //
468 double solution(double x, double y)
469 {
470   return x * y * exp(x - y) * (1 - x) * (1 - y);
471 }
472 
473 //
474 // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
475 //
476 void computeErr(double *I, grid_s grid)
477 {
478 
479   RAJA::RangeSegment gridRange(0, grid.n);
480   RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);
481 
482   using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
483     RAJA::statement::For<1, RAJA::seq_exec,
484       RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >;
485 
486   RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange,gridRange),
487                        [=] (RAJA::Index_type ty, RAJA::Index_type tx) {
488 
489       int id = tx + grid.n * ty;
490       double x = grid.o + tx * grid.h;
491       double y = grid.o + ty * grid.h;
492       double myErr = std::abs(I[id] - solution(x, y));
493       tMax.max(myErr);
494     });
495 
496   double l2err = tMax;
497   printf("Max error = %lg, h = %f \n", l2err, grid.h);
498 }
499 
500 /*TEST
501 
502     test:
503       requires: raja !cuda
504 
505     test:
506       suffix: 2
507       requires: raja cuda
508 
509 TEST*/
510