xref: /petsc/src/ksp/ksp/tests/raja/ex1.raja.cxx (revision bd9d6ae223073035433861335ae9dbc9b7be559f)
1d23021a0SBarry Smith //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
2d23021a0SBarry Smith // Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
3d23021a0SBarry Smith // and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
4d23021a0SBarry Smith //
5d23021a0SBarry Smith // SPDX-License-Identifier: (BSD-3-Clause)
6d23021a0SBarry Smith //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
7d23021a0SBarry Smith 
8d23021a0SBarry Smith #include <cstdlib>
9d23021a0SBarry Smith #include <cstdio>
10d23021a0SBarry Smith #include <cstring>
11d23021a0SBarry Smith 
12d23021a0SBarry Smith #include <iostream>
13d23021a0SBarry Smith #include <cmath>
14d23021a0SBarry Smith 
15d23021a0SBarry Smith #include "RAJA/RAJA.hpp"
16d23021a0SBarry Smith 
17d23021a0SBarry Smith #include "memoryManager.hpp"
18d23021a0SBarry Smith 
19d23021a0SBarry Smith /*
20d23021a0SBarry Smith  * Jacobi Example
21d23021a0SBarry Smith  *
22d23021a0SBarry Smith  * ----[Details]--------------------
23d23021a0SBarry Smith  * This code uses a five point finite difference stencil
24d23021a0SBarry Smith  * to discretize the following boundary value problem
25d23021a0SBarry Smith  *
26d23021a0SBarry Smith  * U_xx + U_yy = f on [0,1] x [0,1].
27d23021a0SBarry Smith  *
28d23021a0SBarry Smith  * The right-hand side is chosen to be
29d23021a0SBarry Smith  * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
30d23021a0SBarry Smith  *
31d23021a0SBarry Smith  * A structured grid is used to discretize the domain
32d23021a0SBarry Smith  * [0,1] x [0,1]. Values inside the domain are computed
33d23021a0SBarry Smith  * using the Jacobi method to solve the associated
34d23021a0SBarry Smith  * linear system. The scheme is invoked until the l_2
35d23021a0SBarry Smith  * difference of subsequent iterations is below a
36d23021a0SBarry Smith  * tolerance.
37d23021a0SBarry Smith  *
38d23021a0SBarry Smith  * The scheme is implemented by allocating two arrays
39d23021a0SBarry Smith  * (I, Iold) and initialized to zero. The first set of
40d23021a0SBarry Smith  * nested for loops apply an iteration of the Jacobi
41d23021a0SBarry Smith  * scheme. The scheme is only applied to the interior
42d23021a0SBarry Smith  * nodes.
43d23021a0SBarry Smith  *
44d23021a0SBarry Smith  * The second set of nested for loops is used to
45d23021a0SBarry Smith  * update Iold and compute the l_2 norm of the
46d23021a0SBarry Smith  * difference of the iterates.
47d23021a0SBarry Smith  *
48d23021a0SBarry Smith  * Computing the l_2 norm requires a reduction operation.
49d23021a0SBarry Smith  * To simplify the reduction procedure, the RAJA API
50d23021a0SBarry Smith  * introduces thread safe variables.
51d23021a0SBarry Smith  *
52d23021a0SBarry Smith  * ----[RAJA Concepts]---------------
53d23021a0SBarry Smith  * - Forall::nested loop
54d23021a0SBarry Smith  * - RAJA Reduction
55d23021a0SBarry Smith  *
56d23021a0SBarry Smith  */
57d23021a0SBarry Smith 
58d23021a0SBarry Smith /*
59d23021a0SBarry Smith  *  ----[Constant Values]-----
60d23021a0SBarry Smith  * CUDA_BLOCK_SIZE_X - Number of threads in the
61d23021a0SBarry Smith  *                     x-dimension of a cuda thread block
62d23021a0SBarry Smith  *
63d23021a0SBarry Smith  * CUDA_BLOCK_SIZE_Y - Number of threads in the
64d23021a0SBarry Smith  *                     y-dimension of a cuda thread block
65d23021a0SBarry Smith  *
66d23021a0SBarry Smith  * CUDA_BLOCK_SIZE   - Number of threads per threads block
67d23021a0SBarry Smith */
68d23021a0SBarry Smith #if defined(RAJA_ENABLE_CUDA)
69d23021a0SBarry Smith const int CUDA_BLOCK_SIZE = 256;
70d23021a0SBarry Smith #endif
71d23021a0SBarry Smith 
72d23021a0SBarry Smith #if defined(RAJA_ENABLE_HIP)
73d23021a0SBarry Smith const int HIP_BLOCK_SIZE = 256;
74d23021a0SBarry Smith #endif
75d23021a0SBarry Smith 
76d23021a0SBarry Smith //
77d23021a0SBarry Smith //  Struct to hold grid info
78d23021a0SBarry Smith //  o - Origin in a cartesian dimension
79d23021a0SBarry Smith //  h - Spacing between grid points
80d23021a0SBarry Smith //  n - Number of grid points
81d23021a0SBarry Smith //
82d23021a0SBarry Smith struct grid_s {
83d23021a0SBarry Smith   double o, h;
84d23021a0SBarry Smith   int    n;
85d23021a0SBarry Smith };
86d23021a0SBarry Smith 
87d23021a0SBarry Smith //
88d23021a0SBarry Smith // ----[Functions]---------
89d23021a0SBarry Smith // solution   - Function for the analytic solution
90d23021a0SBarry Smith // computeErr - Displays the maximum error in the solution
91d23021a0SBarry Smith //
92d23021a0SBarry Smith double solution(double x, double y);
93d23021a0SBarry Smith void   computeErr(double *I, grid_s grid);
94d23021a0SBarry Smith 
95d71ae5a4SJacob Faibussowitsch int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
96d71ae5a4SJacob Faibussowitsch {
97d23021a0SBarry Smith   std::cout << "Jacobi Example" << std::endl;
98d23021a0SBarry Smith 
99d23021a0SBarry Smith   /*
100d23021a0SBarry Smith    * ----[Solver Parameters]------------
101d23021a0SBarry Smith    * tol       - Method terminates once the norm is less than tol
102d23021a0SBarry Smith    * N         - Number of unknown gridpoints per cartesian dimension
103d23021a0SBarry Smith    * NN        - Total number of gridpoints on the grid
104d23021a0SBarry Smith    * maxIter   - Maximum number of iterations to be taken
105d23021a0SBarry Smith    *
106d23021a0SBarry Smith    * resI2     - Residual
107d23021a0SBarry Smith    * iteration - Iteration number
108d23021a0SBarry Smith    * grid_s    - Struct with grid information for a cartesian dimension
109d23021a0SBarry Smith   */
110d23021a0SBarry Smith   double tol = 1e-10;
111d23021a0SBarry Smith 
112d23021a0SBarry Smith   int N       = 50;
113d23021a0SBarry Smith   int NN      = (N + 2) * (N + 2);
114d23021a0SBarry Smith   int maxIter = 100000;
115d23021a0SBarry Smith 
116d23021a0SBarry Smith   double resI2;
117d23021a0SBarry Smith   int    iteration;
118d23021a0SBarry Smith 
119d23021a0SBarry Smith   grid_s gridx;
120d23021a0SBarry Smith   gridx.o = 0.0;
121d23021a0SBarry Smith   gridx.h = 1.0 / (N + 1.0);
122d23021a0SBarry Smith   gridx.n = N + 2;
123d23021a0SBarry Smith 
124d23021a0SBarry Smith   //
125d23021a0SBarry Smith   //I, Iold - Holds iterates of Jacobi method
126d23021a0SBarry Smith   //
127d23021a0SBarry Smith   double *I    = memoryManager::allocate<double>(NN);
128d23021a0SBarry Smith   double *Iold = memoryManager::allocate<double>(NN);
129d23021a0SBarry Smith 
130d23021a0SBarry Smith   memset(I, 0, NN * sizeof(double));
131d23021a0SBarry Smith   memset(Iold, 0, NN * sizeof(double));
132d23021a0SBarry Smith 
133d23021a0SBarry Smith   printf("Standard  C++ Loop \n");
134d23021a0SBarry Smith   resI2     = 1;
135d23021a0SBarry Smith   iteration = 0;
136d23021a0SBarry Smith 
137d23021a0SBarry Smith   while (resI2 > tol * tol) {
138d23021a0SBarry Smith     //
139d23021a0SBarry Smith     // Jacobi Iteration
140d23021a0SBarry Smith     //
141d23021a0SBarry Smith     for (int n = 1; n <= N; ++n) {
142d23021a0SBarry Smith       for (int m = 1; m <= N; ++m) {
143d23021a0SBarry Smith         double x = gridx.o + m * gridx.h;
144d23021a0SBarry Smith         double y = gridx.o + n * gridx.h;
145d23021a0SBarry Smith 
1469371c9d4SSatish Balay         double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
147d23021a0SBarry Smith 
148d23021a0SBarry Smith         int id = n * (N + 2) + m;
1499371c9d4SSatish Balay         I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
150d23021a0SBarry Smith       }
151d23021a0SBarry Smith     }
152d23021a0SBarry Smith 
153d23021a0SBarry Smith     //
154d23021a0SBarry Smith     // Compute residual and update Iold
155d23021a0SBarry Smith     //
156d23021a0SBarry Smith     resI2 = 0.0;
157d23021a0SBarry Smith     for (int k = 0; k < NN; k++) {
158d23021a0SBarry Smith       resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
159d23021a0SBarry Smith       Iold[k] = I[k];
160d23021a0SBarry Smith     }
161d23021a0SBarry Smith 
162d23021a0SBarry Smith     if (iteration > maxIter) {
163d23021a0SBarry Smith       printf("Standard C++ Loop - Maxed out on iterations \n");
164d23021a0SBarry Smith       exit(-1);
165d23021a0SBarry Smith     }
166d23021a0SBarry Smith 
167d23021a0SBarry Smith     iteration++;
168d23021a0SBarry Smith   }
169d23021a0SBarry Smith   computeErr(I, gridx);
170d23021a0SBarry Smith   printf("No of iterations: %d \n \n", iteration);
171d23021a0SBarry Smith 
172d23021a0SBarry Smith   //
173d23021a0SBarry Smith   // RAJA loop calls may be shortened by predefining policies
174d23021a0SBarry Smith   //
175d23021a0SBarry Smith   RAJA::RangeSegment gridRange(0, NN);
17657508eceSPierre Jolivet   RAJA::RangeSegment jacobiRange(1, N + 1);
177d23021a0SBarry Smith 
1789371c9d4SSatish Balay   using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
179d23021a0SBarry Smith 
180d23021a0SBarry Smith   printf("RAJA: Sequential Policy - Nested ForallN \n");
181d23021a0SBarry Smith   resI2     = 1;
182d23021a0SBarry Smith   iteration = 0;
183d23021a0SBarry Smith   memset(I, 0, NN * sizeof(double));
184d23021a0SBarry Smith   memset(Iold, 0, NN * sizeof(double));
185d23021a0SBarry Smith 
186d23021a0SBarry Smith   /*
187d23021a0SBarry Smith    *  Sequential Jacobi Iteration.
188d23021a0SBarry Smith    *
189d23021a0SBarry Smith    *  Note that a RAJA ReduceSum object is used to accumulate the sum
190d23021a0SBarry Smith    *  for the residual. Since the loop is run sequentially, this is
191d23021a0SBarry Smith    *  not strictly necessary. It is done here for consistency and
192d23021a0SBarry Smith    *  comparison with other RAJA variants in this example.
193d23021a0SBarry Smith    */
194d23021a0SBarry Smith   while (resI2 > tol * tol) {
1959371c9d4SSatish Balay     RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
196d23021a0SBarry Smith       double x = gridx.o + m * gridx.h;
197d23021a0SBarry Smith       double y = gridx.o + n * gridx.h;
198d23021a0SBarry Smith 
1999371c9d4SSatish Balay       double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
200d23021a0SBarry Smith 
201d23021a0SBarry Smith       int id = n * (N + 2) + m;
2029371c9d4SSatish Balay       I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
203d23021a0SBarry Smith     });
204d23021a0SBarry Smith 
205d23021a0SBarry Smith     RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
2069371c9d4SSatish Balay     RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) {
207d23021a0SBarry Smith       RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
208d23021a0SBarry Smith       Iold[k] = I[k];
209d23021a0SBarry Smith     });
210d23021a0SBarry Smith 
211d23021a0SBarry Smith     resI2 = RAJA_resI2;
212d23021a0SBarry Smith     if (iteration > maxIter) {
213d23021a0SBarry Smith       printf("Jacobi: Sequential - Maxed out on iterations! \n");
214d23021a0SBarry Smith       exit(-1);
215d23021a0SBarry Smith     }
216d23021a0SBarry Smith     iteration++;
217d23021a0SBarry Smith   }
218d23021a0SBarry Smith   computeErr(I, gridx);
219d23021a0SBarry Smith   printf("No of iterations: %d \n \n", iteration);
220d23021a0SBarry Smith 
221d23021a0SBarry Smith #if defined(RAJA_ENABLE_OPENMP)
222d23021a0SBarry Smith   printf("RAJA: OpenMP Policy - Nested ForallN \n");
223d23021a0SBarry Smith   resI2     = 1;
224d23021a0SBarry Smith   iteration = 0;
225d23021a0SBarry Smith   memset(I, 0, NN * sizeof(double));
226d23021a0SBarry Smith   memset(Iold, 0, NN * sizeof(double));
227d23021a0SBarry Smith 
228d23021a0SBarry Smith   /*
229d23021a0SBarry Smith    *  OpenMP parallel Jacobi Iteration.
230d23021a0SBarry Smith    *
231d23021a0SBarry Smith    *  ----[RAJA Policies]-----------
232d23021a0SBarry Smith    *  RAJA::omp_collapse_for_exec -
233d23021a0SBarry Smith    *  introduced a nested region
234d23021a0SBarry Smith    *
235d23021a0SBarry Smith    *  Note that OpenMP RAJA ReduceSum object performs the reduction
236d23021a0SBarry Smith    *  operation for the residual in a thread-safe manner.
237d23021a0SBarry Smith    */
238d23021a0SBarry Smith 
2399371c9d4SSatish Balay   using jacobiOmpNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
240d23021a0SBarry Smith 
241d23021a0SBarry Smith   while (resI2 > tol * tol) {
2429371c9d4SSatish Balay     RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
243d23021a0SBarry Smith       double x = gridx.o + m * gridx.h;
244d23021a0SBarry Smith       double y = gridx.o + n * gridx.h;
245d23021a0SBarry Smith 
2469371c9d4SSatish Balay       double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
247d23021a0SBarry Smith 
248d23021a0SBarry Smith       int id = n * (N + 2) + m;
2499371c9d4SSatish Balay       I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
250d23021a0SBarry Smith     });
251d23021a0SBarry Smith 
252d23021a0SBarry Smith     RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);
253d23021a0SBarry Smith 
2549371c9d4SSatish Balay     RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) {
255d23021a0SBarry Smith       RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
256d23021a0SBarry Smith       Iold[k] = I[k];
257d23021a0SBarry Smith     });
258d23021a0SBarry Smith 
259d23021a0SBarry Smith     resI2 = RAJA_resI2;
260d23021a0SBarry Smith     if (iteration > maxIter) {
261d23021a0SBarry Smith       printf("Jacobi: OpenMP - Maxed out on iterations! \n");
262d23021a0SBarry Smith       exit(-1);
263d23021a0SBarry Smith     }
264d23021a0SBarry Smith     iteration++;
265d23021a0SBarry Smith   }
266d23021a0SBarry Smith   computeErr(I, gridx);
267d23021a0SBarry Smith   printf("No of iterations: %d \n \n", iteration);
268d23021a0SBarry Smith #endif
269d23021a0SBarry Smith 
270d23021a0SBarry Smith #if defined(RAJA_ENABLE_CUDA)
271d23021a0SBarry Smith   /*
272d23021a0SBarry Smith    *  CUDA Jacobi Iteration.
273d23021a0SBarry Smith    *
274d23021a0SBarry Smith    *  ----[RAJA Policies]-----------
275d23021a0SBarry Smith    *  RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
276d23021a0SBarry Smith    *  define the mapping of loop iterations to GPU thread blocks
277d23021a0SBarry Smith    *
278d23021a0SBarry Smith    *  Note that CUDA RAJA ReduceSum object performs the reduction
279d23021a0SBarry Smith    *  operation for the residual in a thread-safe manner on the GPU.
280d23021a0SBarry Smith    */
281d23021a0SBarry Smith 
282d23021a0SBarry Smith   printf("RAJA: CUDA Policy - Nested ForallN \n");
283d23021a0SBarry Smith 
2849371c9d4SSatish Balay   using jacobiCUDANestedPolicy = RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
285d23021a0SBarry Smith 
286d23021a0SBarry Smith   resI2     = 1;
287d23021a0SBarry Smith   iteration = 0;
288d23021a0SBarry Smith   memset(I, 0, NN * sizeof(double));
289d23021a0SBarry Smith   memset(Iold, 0, NN * sizeof(double));
290d23021a0SBarry Smith 
291d23021a0SBarry Smith   while (resI2 > tol * tol) {
292d23021a0SBarry Smith     //
293d23021a0SBarry Smith     // Jacobi Iteration
294d23021a0SBarry Smith     //
2959371c9d4SSatish Balay     RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
296d23021a0SBarry Smith       double x = gridx.o + m * gridx.h;
297d23021a0SBarry Smith       double y = gridx.o + n * gridx.h;
298d23021a0SBarry Smith 
2999371c9d4SSatish Balay       double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
300d23021a0SBarry Smith 
301d23021a0SBarry Smith       int id = n * (N + 2) + m;
3029371c9d4SSatish Balay       I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
303d23021a0SBarry Smith     });
304d23021a0SBarry Smith 
305d23021a0SBarry Smith     //
306d23021a0SBarry Smith     // Compute residual and update Iold
307d23021a0SBarry Smith     //
308d23021a0SBarry Smith     RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
3099371c9d4SSatish Balay     RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
310d23021a0SBarry Smith       RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
311d23021a0SBarry Smith       Iold[k] = I[k];
312d23021a0SBarry Smith     });
313d23021a0SBarry Smith 
314d23021a0SBarry Smith     resI2 = RAJA_resI2;
315d23021a0SBarry Smith 
316d23021a0SBarry Smith     if (iteration > maxIter) {
317d23021a0SBarry Smith       printf("RAJA: CUDA - Maxed out on iterations! \n");
318d23021a0SBarry Smith       exit(-1);
319d23021a0SBarry Smith     }
320d23021a0SBarry Smith     iteration++;
321d23021a0SBarry Smith   }
322d23021a0SBarry Smith   cudaDeviceSynchronize();
323d23021a0SBarry Smith   computeErr(I, gridx);
324d23021a0SBarry Smith   printf("No of iterations: %d \n \n", iteration);
325d23021a0SBarry Smith #endif
326d23021a0SBarry Smith 
327d23021a0SBarry Smith #if defined(RAJA_ENABLE_HIP)
328d23021a0SBarry Smith   /*
329d23021a0SBarry Smith    *  HIP Jacobi Iteration.
330d23021a0SBarry Smith    *
331d23021a0SBarry Smith    *  ----[RAJA Policies]-----------
332d23021a0SBarry Smith    *  RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
333d23021a0SBarry Smith    *  define the mapping of loop iterations to GPU thread blocks
334d23021a0SBarry Smith    *
335d23021a0SBarry Smith    *  Note that HIP RAJA ReduceSum object performs the reduction
336d23021a0SBarry Smith    *  operation for the residual in a thread-safe manner on the GPU.
337d23021a0SBarry Smith    */
338d23021a0SBarry Smith 
339d23021a0SBarry Smith   printf("RAJA: HIP Policy - Nested ForallN \n");
340d23021a0SBarry Smith 
3419371c9d4SSatish Balay   using jacobiHIPNestedPolicy = RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
342d23021a0SBarry Smith 
343d23021a0SBarry Smith   resI2     = 1;
344d23021a0SBarry Smith   iteration = 0;
345d23021a0SBarry Smith   memset(I, 0, NN * sizeof(double));
346d23021a0SBarry Smith   memset(Iold, 0, NN * sizeof(double));
347d23021a0SBarry Smith 
348d23021a0SBarry Smith   double *d_I    = memoryManager::allocate_gpu<double>(NN);
349d23021a0SBarry Smith   double *d_Iold = memoryManager::allocate_gpu<double>(NN);
350d23021a0SBarry Smith   hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
351d23021a0SBarry Smith   hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));
352d23021a0SBarry Smith 
353d23021a0SBarry Smith   while (resI2 > tol * tol) {
354d23021a0SBarry Smith     //
355d23021a0SBarry Smith     // Jacobi Iteration
356d23021a0SBarry Smith     //
3579371c9d4SSatish Balay     RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
358d23021a0SBarry Smith       double x = gridx.o + m * gridx.h;
359d23021a0SBarry Smith       double y = gridx.o + n * gridx.h;
360d23021a0SBarry Smith 
3619371c9d4SSatish Balay       double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
362d23021a0SBarry Smith 
363d23021a0SBarry Smith       int id  = n * (N + 2) + m;
3649371c9d4SSatish Balay       d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]);
365d23021a0SBarry Smith     });
366d23021a0SBarry Smith 
367d23021a0SBarry Smith     //
368d23021a0SBarry Smith     // Compute residual and update Iold
369d23021a0SBarry Smith     //
370d23021a0SBarry Smith     RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
3719371c9d4SSatish Balay     RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
372d23021a0SBarry Smith       RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
373d23021a0SBarry Smith       d_Iold[k] = d_I[k];
374d23021a0SBarry Smith     });
375d23021a0SBarry Smith 
376d23021a0SBarry Smith     resI2 = RAJA_resI2;
377d23021a0SBarry Smith 
378d23021a0SBarry Smith     if (iteration > maxIter) {
379d23021a0SBarry Smith       printf("RAJA: HIP - Maxed out on iterations! \n");
380d23021a0SBarry Smith       exit(-1);
381d23021a0SBarry Smith     }
382d23021a0SBarry Smith     iteration++;
383d23021a0SBarry Smith   }
384d23021a0SBarry Smith   hipDeviceSynchronize();
385d23021a0SBarry Smith   hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
386d23021a0SBarry Smith   computeErr(I, gridx);
387d23021a0SBarry Smith   printf("No of iterations: %d \n \n", iteration);
388d23021a0SBarry Smith 
389d23021a0SBarry Smith   memoryManager::deallocate_gpu(d_I);
390d23021a0SBarry Smith   memoryManager::deallocate_gpu(d_Iold);
391d23021a0SBarry Smith #endif
392d23021a0SBarry Smith 
393d23021a0SBarry Smith   memoryManager::deallocate(I);
394d23021a0SBarry Smith   memoryManager::deallocate(Iold);
395d23021a0SBarry Smith 
396d23021a0SBarry Smith   return 0;
397d23021a0SBarry Smith }
398d23021a0SBarry Smith 
399d23021a0SBarry Smith //
400145b44c9SPierre Jolivet // Function for the analytic solution
401d23021a0SBarry Smith //
402d71ae5a4SJacob Faibussowitsch double solution(double x, double y)
403d71ae5a4SJacob Faibussowitsch {
404d23021a0SBarry Smith   return x * y * exp(x - y) * (1 - x) * (1 - y);
405d23021a0SBarry Smith }
406d23021a0SBarry Smith 
407d23021a0SBarry Smith //
408d23021a0SBarry Smith // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
409d23021a0SBarry Smith //
410d71ae5a4SJacob Faibussowitsch void computeErr(double *I, grid_s grid)
411d71ae5a4SJacob Faibussowitsch {
412d23021a0SBarry Smith   RAJA::RangeSegment                        gridRange(0, grid.n);
413d23021a0SBarry Smith   RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);
414d23021a0SBarry Smith 
4159371c9d4SSatish Balay   using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
416d23021a0SBarry Smith 
4179371c9d4SSatish Balay   RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange), [=](RAJA::Index_type ty, RAJA::Index_type tx) {
418d23021a0SBarry Smith     int    id    = tx + grid.n * ty;
419d23021a0SBarry Smith     double x     = grid.o + tx * grid.h;
420d23021a0SBarry Smith     double y     = grid.o + ty * grid.h;
421d23021a0SBarry Smith     double myErr = std::abs(I[id] - solution(x, y));
422d23021a0SBarry Smith     tMax.max(myErr);
423d23021a0SBarry Smith   });
424d23021a0SBarry Smith 
425d23021a0SBarry Smith   double l2err = tMax;
426d23021a0SBarry Smith   printf("Max error = %lg, h = %f \n", l2err, grid.h);
427d23021a0SBarry Smith }
428d23021a0SBarry Smith 
429d23021a0SBarry Smith /*TEST
430d23021a0SBarry Smith 
431d23021a0SBarry Smith     test:
432d23021a0SBarry Smith       requires: raja !cuda
433*d2b7c016SPierre Jolivet       filter: sed -e "/RAJA: OpenMP Policy/,+3d"
434d23021a0SBarry Smith 
435d23021a0SBarry Smith     test:
436d23021a0SBarry Smith       suffix: 2
437d23021a0SBarry Smith       requires: raja cuda
438*d2b7c016SPierre Jolivet       filter: sed -e "/RAJA: OpenMP Policy/,+3d"
439d23021a0SBarry Smith 
440d23021a0SBarry Smith TEST*/
441