1d23021a0SBarry Smith //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 2d23021a0SBarry Smith // Copyright (c) 2016-21, Lawrence Livermore National Security, LLC 3d23021a0SBarry Smith // and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 4d23021a0SBarry Smith // 5d23021a0SBarry Smith // SPDX-License-Identifier: (BSD-3-Clause) 6d23021a0SBarry Smith //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 7d23021a0SBarry Smith 8d23021a0SBarry Smith #include <cstdlib> 9d23021a0SBarry Smith #include <cstdio> 10d23021a0SBarry Smith #include <cstring> 11d23021a0SBarry Smith 12d23021a0SBarry Smith #include <iostream> 13d23021a0SBarry Smith #include <cmath> 14d23021a0SBarry Smith 15d23021a0SBarry Smith #include "RAJA/RAJA.hpp" 16d23021a0SBarry Smith 17d23021a0SBarry Smith #include "memoryManager.hpp" 18d23021a0SBarry Smith 19d23021a0SBarry Smith /* 20d23021a0SBarry Smith * Jacobi Example 21d23021a0SBarry Smith * 22d23021a0SBarry Smith * ----[Details]-------------------- 23d23021a0SBarry Smith * This code uses a five point finite difference stencil 24d23021a0SBarry Smith * to discretize the following boundary value problem 25d23021a0SBarry Smith * 26d23021a0SBarry Smith * U_xx + U_yy = f on [0,1] x [0,1]. 27d23021a0SBarry Smith * 28d23021a0SBarry Smith * The right-hand side is chosen to be 29d23021a0SBarry Smith * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y). 30d23021a0SBarry Smith * 31d23021a0SBarry Smith * A structured grid is used to discretize the domain 32d23021a0SBarry Smith * [0,1] x [0,1]. Values inside the domain are computed 33d23021a0SBarry Smith * using the Jacobi method to solve the associated 34d23021a0SBarry Smith * linear system. The scheme is invoked until the l_2 35d23021a0SBarry Smith * difference of subsequent iterations is below a 36d23021a0SBarry Smith * tolerance. 37d23021a0SBarry Smith * 38d23021a0SBarry Smith * The scheme is implemented by allocating two arrays 39d23021a0SBarry Smith * (I, Iold) and initialized to zero. The first set of 40d23021a0SBarry Smith * nested for loops apply an iteration of the Jacobi 41d23021a0SBarry Smith * scheme. The scheme is only applied to the interior 42d23021a0SBarry Smith * nodes. 43d23021a0SBarry Smith * 44d23021a0SBarry Smith * The second set of nested for loops is used to 45d23021a0SBarry Smith * update Iold and compute the l_2 norm of the 46d23021a0SBarry Smith * difference of the iterates. 47d23021a0SBarry Smith * 48d23021a0SBarry Smith * Computing the l_2 norm requires a reduction operation. 49d23021a0SBarry Smith * To simplify the reduction procedure, the RAJA API 50d23021a0SBarry Smith * introduces thread safe variables. 51d23021a0SBarry Smith * 52d23021a0SBarry Smith * ----[RAJA Concepts]--------------- 53d23021a0SBarry Smith * - Forall::nested loop 54d23021a0SBarry Smith * - RAJA Reduction 55d23021a0SBarry Smith * 56d23021a0SBarry Smith */ 57d23021a0SBarry Smith 58d23021a0SBarry Smith /* 59d23021a0SBarry Smith * ----[Constant Values]----- 60d23021a0SBarry Smith * CUDA_BLOCK_SIZE_X - Number of threads in the 61d23021a0SBarry Smith * x-dimension of a cuda thread block 62d23021a0SBarry Smith * 63d23021a0SBarry Smith * CUDA_BLOCK_SIZE_Y - Number of threads in the 64d23021a0SBarry Smith * y-dimension of a cuda thread block 65d23021a0SBarry Smith * 66d23021a0SBarry Smith * CUDA_BLOCK_SIZE - Number of threads per threads block 67d23021a0SBarry Smith */ 68d23021a0SBarry Smith #if defined(RAJA_ENABLE_CUDA) 69d23021a0SBarry Smith const int CUDA_BLOCK_SIZE = 256; 70d23021a0SBarry Smith #endif 71d23021a0SBarry Smith 72d23021a0SBarry Smith #if defined(RAJA_ENABLE_HIP) 73d23021a0SBarry Smith const int HIP_BLOCK_SIZE = 256; 74d23021a0SBarry Smith #endif 75d23021a0SBarry Smith 76d23021a0SBarry Smith // 77d23021a0SBarry Smith // Struct to hold grid info 78d23021a0SBarry Smith // o - Origin in a cartesian dimension 79d23021a0SBarry Smith // h - Spacing between grid points 80d23021a0SBarry Smith // n - Number of grid points 81d23021a0SBarry Smith // 82d23021a0SBarry Smith struct grid_s { 83d23021a0SBarry Smith double o, h; 84d23021a0SBarry Smith int n; 85d23021a0SBarry Smith }; 86d23021a0SBarry Smith 87d23021a0SBarry Smith // 88d23021a0SBarry Smith // ----[Functions]--------- 89d23021a0SBarry Smith // solution - Function for the analytic solution 90d23021a0SBarry Smith // computeErr - Displays the maximum error in the solution 91d23021a0SBarry Smith // 92d23021a0SBarry Smith double solution(double x, double y); 93d23021a0SBarry Smith void computeErr(double *I, grid_s grid); 94d23021a0SBarry Smith 95d71ae5a4SJacob Faibussowitsch int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) 96d71ae5a4SJacob Faibussowitsch { 97d23021a0SBarry Smith std::cout << "Jacobi Example" << std::endl; 98d23021a0SBarry Smith 99d23021a0SBarry Smith /* 100d23021a0SBarry Smith * ----[Solver Parameters]------------ 101d23021a0SBarry Smith * tol - Method terminates once the norm is less than tol 102d23021a0SBarry Smith * N - Number of unknown gridpoints per cartesian dimension 103d23021a0SBarry Smith * NN - Total number of gridpoints on the grid 104d23021a0SBarry Smith * maxIter - Maximum number of iterations to be taken 105d23021a0SBarry Smith * 106d23021a0SBarry Smith * resI2 - Residual 107d23021a0SBarry Smith * iteration - Iteration number 108d23021a0SBarry Smith * grid_s - Struct with grid information for a cartesian dimension 109d23021a0SBarry Smith */ 110d23021a0SBarry Smith double tol = 1e-10; 111d23021a0SBarry Smith 112d23021a0SBarry Smith int N = 50; 113d23021a0SBarry Smith int NN = (N + 2) * (N + 2); 114d23021a0SBarry Smith int maxIter = 100000; 115d23021a0SBarry Smith 116d23021a0SBarry Smith double resI2; 117d23021a0SBarry Smith int iteration; 118d23021a0SBarry Smith 119d23021a0SBarry Smith grid_s gridx; 120d23021a0SBarry Smith gridx.o = 0.0; 121d23021a0SBarry Smith gridx.h = 1.0 / (N + 1.0); 122d23021a0SBarry Smith gridx.n = N + 2; 123d23021a0SBarry Smith 124d23021a0SBarry Smith // 125d23021a0SBarry Smith //I, Iold - Holds iterates of Jacobi method 126d23021a0SBarry Smith // 127d23021a0SBarry Smith double *I = memoryManager::allocate<double>(NN); 128d23021a0SBarry Smith double *Iold = memoryManager::allocate<double>(NN); 129d23021a0SBarry Smith 130d23021a0SBarry Smith memset(I, 0, NN * sizeof(double)); 131d23021a0SBarry Smith memset(Iold, 0, NN * sizeof(double)); 132d23021a0SBarry Smith 133d23021a0SBarry Smith printf("Standard C++ Loop \n"); 134d23021a0SBarry Smith resI2 = 1; 135d23021a0SBarry Smith iteration = 0; 136d23021a0SBarry Smith 137d23021a0SBarry Smith while (resI2 > tol * tol) { 138d23021a0SBarry Smith // 139d23021a0SBarry Smith // Jacobi Iteration 140d23021a0SBarry Smith // 141d23021a0SBarry Smith for (int n = 1; n <= N; ++n) { 142d23021a0SBarry Smith for (int m = 1; m <= N; ++m) { 143d23021a0SBarry Smith double x = gridx.o + m * gridx.h; 144d23021a0SBarry Smith double y = gridx.o + n * gridx.h; 145d23021a0SBarry Smith 1469371c9d4SSatish Balay double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 147d23021a0SBarry Smith 148d23021a0SBarry Smith int id = n * (N + 2) + m; 1499371c9d4SSatish Balay I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 150d23021a0SBarry Smith } 151d23021a0SBarry Smith } 152d23021a0SBarry Smith 153d23021a0SBarry Smith // 154d23021a0SBarry Smith // Compute residual and update Iold 155d23021a0SBarry Smith // 156d23021a0SBarry Smith resI2 = 0.0; 157d23021a0SBarry Smith for (int k = 0; k < NN; k++) { 158d23021a0SBarry Smith resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 159d23021a0SBarry Smith Iold[k] = I[k]; 160d23021a0SBarry Smith } 161d23021a0SBarry Smith 162d23021a0SBarry Smith if (iteration > maxIter) { 163d23021a0SBarry Smith printf("Standard C++ Loop - Maxed out on iterations \n"); 164d23021a0SBarry Smith exit(-1); 165d23021a0SBarry Smith } 166d23021a0SBarry Smith 167d23021a0SBarry Smith iteration++; 168d23021a0SBarry Smith } 169d23021a0SBarry Smith computeErr(I, gridx); 170d23021a0SBarry Smith printf("No of iterations: %d \n \n", iteration); 171d23021a0SBarry Smith 172d23021a0SBarry Smith // 173d23021a0SBarry Smith // RAJA loop calls may be shortened by predefining policies 174d23021a0SBarry Smith // 175d23021a0SBarry Smith RAJA::RangeSegment gridRange(0, NN); 17657508eceSPierre Jolivet RAJA::RangeSegment jacobiRange(1, N + 1); 177d23021a0SBarry Smith 1789371c9d4SSatish Balay using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>; 179d23021a0SBarry Smith 180d23021a0SBarry Smith printf("RAJA: Sequential Policy - Nested ForallN \n"); 181d23021a0SBarry Smith resI2 = 1; 182d23021a0SBarry Smith iteration = 0; 183d23021a0SBarry Smith memset(I, 0, NN * sizeof(double)); 184d23021a0SBarry Smith memset(Iold, 0, NN * sizeof(double)); 185d23021a0SBarry Smith 186d23021a0SBarry Smith /* 187d23021a0SBarry Smith * Sequential Jacobi Iteration. 188d23021a0SBarry Smith * 189d23021a0SBarry Smith * Note that a RAJA ReduceSum object is used to accumulate the sum 190d23021a0SBarry Smith * for the residual. Since the loop is run sequentially, this is 191d23021a0SBarry Smith * not strictly necessary. It is done here for consistency and 192d23021a0SBarry Smith * comparison with other RAJA variants in this example. 193d23021a0SBarry Smith */ 194d23021a0SBarry Smith while (resI2 > tol * tol) { 1959371c9d4SSatish Balay RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) { 196d23021a0SBarry Smith double x = gridx.o + m * gridx.h; 197d23021a0SBarry Smith double y = gridx.o + n * gridx.h; 198d23021a0SBarry Smith 1999371c9d4SSatish Balay double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 200d23021a0SBarry Smith 201d23021a0SBarry Smith int id = n * (N + 2) + m; 2029371c9d4SSatish Balay I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 203d23021a0SBarry Smith }); 204d23021a0SBarry Smith 205d23021a0SBarry Smith RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0); 2069371c9d4SSatish Balay RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) { 207d23021a0SBarry Smith RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 208d23021a0SBarry Smith Iold[k] = I[k]; 209d23021a0SBarry Smith }); 210d23021a0SBarry Smith 211d23021a0SBarry Smith resI2 = RAJA_resI2; 212d23021a0SBarry Smith if (iteration > maxIter) { 213d23021a0SBarry Smith printf("Jacobi: Sequential - Maxed out on iterations! \n"); 214d23021a0SBarry Smith exit(-1); 215d23021a0SBarry Smith } 216d23021a0SBarry Smith iteration++; 217d23021a0SBarry Smith } 218d23021a0SBarry Smith computeErr(I, gridx); 219d23021a0SBarry Smith printf("No of iterations: %d \n \n", iteration); 220d23021a0SBarry Smith 221d23021a0SBarry Smith #if defined(RAJA_ENABLE_OPENMP) 222d23021a0SBarry Smith printf("RAJA: OpenMP Policy - Nested ForallN \n"); 223d23021a0SBarry Smith resI2 = 1; 224d23021a0SBarry Smith iteration = 0; 225d23021a0SBarry Smith memset(I, 0, NN * sizeof(double)); 226d23021a0SBarry Smith memset(Iold, 0, NN * sizeof(double)); 227d23021a0SBarry Smith 228d23021a0SBarry Smith /* 229d23021a0SBarry Smith * OpenMP parallel Jacobi Iteration. 230d23021a0SBarry Smith * 231d23021a0SBarry Smith * ----[RAJA Policies]----------- 232d23021a0SBarry Smith * RAJA::omp_collapse_for_exec - 233d23021a0SBarry Smith * introduced a nested region 234d23021a0SBarry Smith * 235d23021a0SBarry Smith * Note that OpenMP RAJA ReduceSum object performs the reduction 236d23021a0SBarry Smith * operation for the residual in a thread-safe manner. 237d23021a0SBarry Smith */ 238d23021a0SBarry Smith 2399371c9d4SSatish Balay using jacobiOmpNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>; 240d23021a0SBarry Smith 241d23021a0SBarry Smith while (resI2 > tol * tol) { 2429371c9d4SSatish Balay RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) { 243d23021a0SBarry Smith double x = gridx.o + m * gridx.h; 244d23021a0SBarry Smith double y = gridx.o + n * gridx.h; 245d23021a0SBarry Smith 2469371c9d4SSatish Balay double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 247d23021a0SBarry Smith 248d23021a0SBarry Smith int id = n * (N + 2) + m; 2499371c9d4SSatish Balay I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 250d23021a0SBarry Smith }); 251d23021a0SBarry Smith 252d23021a0SBarry Smith RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0); 253d23021a0SBarry Smith 2549371c9d4SSatish Balay RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) { 255d23021a0SBarry Smith RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 256d23021a0SBarry Smith Iold[k] = I[k]; 257d23021a0SBarry Smith }); 258d23021a0SBarry Smith 259d23021a0SBarry Smith resI2 = RAJA_resI2; 260d23021a0SBarry Smith if (iteration > maxIter) { 261d23021a0SBarry Smith printf("Jacobi: OpenMP - Maxed out on iterations! \n"); 262d23021a0SBarry Smith exit(-1); 263d23021a0SBarry Smith } 264d23021a0SBarry Smith iteration++; 265d23021a0SBarry Smith } 266d23021a0SBarry Smith computeErr(I, gridx); 267d23021a0SBarry Smith printf("No of iterations: %d \n \n", iteration); 268d23021a0SBarry Smith #endif 269d23021a0SBarry Smith 270d23021a0SBarry Smith #if defined(RAJA_ENABLE_CUDA) 271d23021a0SBarry Smith /* 272d23021a0SBarry Smith * CUDA Jacobi Iteration. 273d23021a0SBarry Smith * 274d23021a0SBarry Smith * ----[RAJA Policies]----------- 275d23021a0SBarry Smith * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - 276d23021a0SBarry Smith * define the mapping of loop iterations to GPU thread blocks 277d23021a0SBarry Smith * 278d23021a0SBarry Smith * Note that CUDA RAJA ReduceSum object performs the reduction 279d23021a0SBarry Smith * operation for the residual in a thread-safe manner on the GPU. 280d23021a0SBarry Smith */ 281d23021a0SBarry Smith 282d23021a0SBarry Smith printf("RAJA: CUDA Policy - Nested ForallN \n"); 283d23021a0SBarry Smith 2849371c9d4SSatish Balay using jacobiCUDANestedPolicy = RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>; 285d23021a0SBarry Smith 286d23021a0SBarry Smith resI2 = 1; 287d23021a0SBarry Smith iteration = 0; 288d23021a0SBarry Smith memset(I, 0, NN * sizeof(double)); 289d23021a0SBarry Smith memset(Iold, 0, NN * sizeof(double)); 290d23021a0SBarry Smith 291d23021a0SBarry Smith while (resI2 > tol * tol) { 292d23021a0SBarry Smith // 293d23021a0SBarry Smith // Jacobi Iteration 294d23021a0SBarry Smith // 2959371c9d4SSatish Balay RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { 296d23021a0SBarry Smith double x = gridx.o + m * gridx.h; 297d23021a0SBarry Smith double y = gridx.o + n * gridx.h; 298d23021a0SBarry Smith 2999371c9d4SSatish Balay double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 300d23021a0SBarry Smith 301d23021a0SBarry Smith int id = n * (N + 2) + m; 3029371c9d4SSatish Balay I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 303d23021a0SBarry Smith }); 304d23021a0SBarry Smith 305d23021a0SBarry Smith // 306d23021a0SBarry Smith // Compute residual and update Iold 307d23021a0SBarry Smith // 308d23021a0SBarry Smith RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0); 3099371c9d4SSatish Balay RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { 310d23021a0SBarry Smith RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 311d23021a0SBarry Smith Iold[k] = I[k]; 312d23021a0SBarry Smith }); 313d23021a0SBarry Smith 314d23021a0SBarry Smith resI2 = RAJA_resI2; 315d23021a0SBarry Smith 316d23021a0SBarry Smith if (iteration > maxIter) { 317d23021a0SBarry Smith printf("RAJA: CUDA - Maxed out on iterations! \n"); 318d23021a0SBarry Smith exit(-1); 319d23021a0SBarry Smith } 320d23021a0SBarry Smith iteration++; 321d23021a0SBarry Smith } 322d23021a0SBarry Smith cudaDeviceSynchronize(); 323d23021a0SBarry Smith computeErr(I, gridx); 324d23021a0SBarry Smith printf("No of iterations: %d \n \n", iteration); 325d23021a0SBarry Smith #endif 326d23021a0SBarry Smith 327d23021a0SBarry Smith #if defined(RAJA_ENABLE_HIP) 328d23021a0SBarry Smith /* 329d23021a0SBarry Smith * HIP Jacobi Iteration. 330d23021a0SBarry Smith * 331d23021a0SBarry Smith * ----[RAJA Policies]----------- 332d23021a0SBarry Smith * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - 333d23021a0SBarry Smith * define the mapping of loop iterations to GPU thread blocks 334d23021a0SBarry Smith * 335d23021a0SBarry Smith * Note that HIP RAJA ReduceSum object performs the reduction 336d23021a0SBarry Smith * operation for the residual in a thread-safe manner on the GPU. 337d23021a0SBarry Smith */ 338d23021a0SBarry Smith 339d23021a0SBarry Smith printf("RAJA: HIP Policy - Nested ForallN \n"); 340d23021a0SBarry Smith 3419371c9d4SSatish Balay using jacobiHIPNestedPolicy = RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>; 342d23021a0SBarry Smith 343d23021a0SBarry Smith resI2 = 1; 344d23021a0SBarry Smith iteration = 0; 345d23021a0SBarry Smith memset(I, 0, NN * sizeof(double)); 346d23021a0SBarry Smith memset(Iold, 0, NN * sizeof(double)); 347d23021a0SBarry Smith 348d23021a0SBarry Smith double *d_I = memoryManager::allocate_gpu<double>(NN); 349d23021a0SBarry Smith double *d_Iold = memoryManager::allocate_gpu<double>(NN); 350d23021a0SBarry Smith hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice)); 351d23021a0SBarry Smith hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice)); 352d23021a0SBarry Smith 353d23021a0SBarry Smith while (resI2 > tol * tol) { 354d23021a0SBarry Smith // 355d23021a0SBarry Smith // Jacobi Iteration 356d23021a0SBarry Smith // 3579371c9d4SSatish Balay RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { 358d23021a0SBarry Smith double x = gridx.o + m * gridx.h; 359d23021a0SBarry Smith double y = gridx.o + n * gridx.h; 360d23021a0SBarry Smith 3619371c9d4SSatish Balay double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 362d23021a0SBarry Smith 363d23021a0SBarry Smith int id = n * (N + 2) + m; 3649371c9d4SSatish Balay d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]); 365d23021a0SBarry Smith }); 366d23021a0SBarry Smith 367d23021a0SBarry Smith // 368d23021a0SBarry Smith // Compute residual and update Iold 369d23021a0SBarry Smith // 370d23021a0SBarry Smith RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0); 3719371c9d4SSatish Balay RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { 372d23021a0SBarry Smith RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]); 373d23021a0SBarry Smith d_Iold[k] = d_I[k]; 374d23021a0SBarry Smith }); 375d23021a0SBarry Smith 376d23021a0SBarry Smith resI2 = RAJA_resI2; 377d23021a0SBarry Smith 378d23021a0SBarry Smith if (iteration > maxIter) { 379d23021a0SBarry Smith printf("RAJA: HIP - Maxed out on iterations! \n"); 380d23021a0SBarry Smith exit(-1); 381d23021a0SBarry Smith } 382d23021a0SBarry Smith iteration++; 383d23021a0SBarry Smith } 384d23021a0SBarry Smith hipDeviceSynchronize(); 385d23021a0SBarry Smith hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost)); 386d23021a0SBarry Smith computeErr(I, gridx); 387d23021a0SBarry Smith printf("No of iterations: %d \n \n", iteration); 388d23021a0SBarry Smith 389d23021a0SBarry Smith memoryManager::deallocate_gpu(d_I); 390d23021a0SBarry Smith memoryManager::deallocate_gpu(d_Iold); 391d23021a0SBarry Smith #endif 392d23021a0SBarry Smith 393d23021a0SBarry Smith memoryManager::deallocate(I); 394d23021a0SBarry Smith memoryManager::deallocate(Iold); 395d23021a0SBarry Smith 396d23021a0SBarry Smith return 0; 397d23021a0SBarry Smith } 398d23021a0SBarry Smith 399d23021a0SBarry Smith // 400145b44c9SPierre Jolivet // Function for the analytic solution 401d23021a0SBarry Smith // 402d71ae5a4SJacob Faibussowitsch double solution(double x, double y) 403d71ae5a4SJacob Faibussowitsch { 404d23021a0SBarry Smith return x * y * exp(x - y) * (1 - x) * (1 - y); 405d23021a0SBarry Smith } 406d23021a0SBarry Smith 407d23021a0SBarry Smith // 408d23021a0SBarry Smith // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf} 409d23021a0SBarry Smith // 410d71ae5a4SJacob Faibussowitsch void computeErr(double *I, grid_s grid) 411d71ae5a4SJacob Faibussowitsch { 412d23021a0SBarry Smith RAJA::RangeSegment gridRange(0, grid.n); 413d23021a0SBarry Smith RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0); 414d23021a0SBarry Smith 4159371c9d4SSatish Balay using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>; 416d23021a0SBarry Smith 4179371c9d4SSatish Balay RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange), [=](RAJA::Index_type ty, RAJA::Index_type tx) { 418d23021a0SBarry Smith int id = tx + grid.n * ty; 419d23021a0SBarry Smith double x = grid.o + tx * grid.h; 420d23021a0SBarry Smith double y = grid.o + ty * grid.h; 421d23021a0SBarry Smith double myErr = std::abs(I[id] - solution(x, y)); 422d23021a0SBarry Smith tMax.max(myErr); 423d23021a0SBarry Smith }); 424d23021a0SBarry Smith 425d23021a0SBarry Smith double l2err = tMax; 426d23021a0SBarry Smith printf("Max error = %lg, h = %f \n", l2err, grid.h); 427d23021a0SBarry Smith } 428d23021a0SBarry Smith 429d23021a0SBarry Smith /*TEST 430d23021a0SBarry Smith 431d23021a0SBarry Smith test: 432d23021a0SBarry Smith requires: raja !cuda 433*d2b7c016SPierre Jolivet filter: sed -e "/RAJA: OpenMP Policy/,+3d" 434d23021a0SBarry Smith 435d23021a0SBarry Smith test: 436d23021a0SBarry Smith suffix: 2 437d23021a0SBarry Smith requires: raja cuda 438*d2b7c016SPierre Jolivet filter: sed -e "/RAJA: OpenMP Policy/,+3d" 439d23021a0SBarry Smith 440d23021a0SBarry Smith TEST*/ 441