1 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 2 // Copyright (c) 2016-21, Lawrence Livermore National Security, LLC 3 // and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 4 // 5 // SPDX-License-Identifier: (BSD-3-Clause) 6 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 7 8 #include <cstdlib> 9 #include <cstdio> 10 #include <cstring> 11 12 #include <iostream> 13 #include <cmath> 14 15 #include "RAJA/RAJA.hpp" 16 17 #include "memoryManager.hpp" 18 19 /* 20 * Jacobi Example 21 * 22 * ----[Details]-------------------- 23 * This code uses a five point finite difference stencil 24 * to discretize the following boundary value problem 25 * 26 * U_xx + U_yy = f on [0,1] x [0,1]. 27 * 28 * The right-hand side is chosen to be 29 * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y). 30 * 31 * A structured grid is used to discretize the domain 32 * [0,1] x [0,1]. Values inside the domain are computed 33 * using the Jacobi method to solve the associated 34 * linear system. The scheme is invoked until the l_2 35 * difference of subsequent iterations is below a 36 * tolerance. 37 * 38 * The scheme is implemented by allocating two arrays 39 * (I, Iold) and initialized to zero. The first set of 40 * nested for loops apply an iteration of the Jacobi 41 * scheme. The scheme is only applied to the interior 42 * nodes. 43 * 44 * The second set of nested for loops is used to 45 * update Iold and compute the l_2 norm of the 46 * difference of the iterates. 47 * 48 * Computing the l_2 norm requires a reduction operation. 49 * To simplify the reduction procedure, the RAJA API 50 * introduces thread safe variables. 51 * 52 * ----[RAJA Concepts]--------------- 53 * - Forall::nested loop 54 * - RAJA Reduction 55 * 56 */ 57 58 /* 59 * ----[Constant Values]----- 60 * CUDA_BLOCK_SIZE_X - Number of threads in the 61 * x-dimension of a cuda thread block 62 * 63 * CUDA_BLOCK_SIZE_Y - Number of threads in the 64 * y-dimension of a cuda thread block 65 * 66 * CUDA_BLOCK_SIZE - Number of threads per threads block 67 */ 68 #if defined(RAJA_ENABLE_CUDA) 69 const int CUDA_BLOCK_SIZE = 256; 70 #endif 71 72 #if defined(RAJA_ENABLE_HIP) 73 const int HIP_BLOCK_SIZE = 256; 74 #endif 75 76 // 77 // Struct to hold grid info 78 // o - Origin in a cartesian dimension 79 // h - Spacing between grid points 80 // n - Number of grid points 81 // 82 struct grid_s { 83 double o, h; 84 int n; 85 }; 86 87 // 88 // ----[Functions]--------- 89 // solution - Function for the analytic solution 90 // computeErr - Displays the maximum error in the solution 91 // 92 double solution(double x, double y); 93 void computeErr(double *I, grid_s grid); 94 95 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { 96 std::cout << "Jacobi Example" << std::endl; 97 98 /* 99 * ----[Solver Parameters]------------ 100 * tol - Method terminates once the norm is less than tol 101 * N - Number of unknown gridpoints per cartesian dimension 102 * NN - Total number of gridpoints on the grid 103 * maxIter - Maximum number of iterations to be taken 104 * 105 * resI2 - Residual 106 * iteration - Iteration number 107 * grid_s - Struct with grid information for a cartesian dimension 108 */ 109 double tol = 1e-10; 110 111 int N = 50; 112 int NN = (N + 2) * (N + 2); 113 int maxIter = 100000; 114 115 double resI2; 116 int iteration; 117 118 grid_s gridx; 119 gridx.o = 0.0; 120 gridx.h = 1.0 / (N + 1.0); 121 gridx.n = N + 2; 122 123 // 124 //I, Iold - Holds iterates of Jacobi method 125 // 126 double *I = memoryManager::allocate<double>(NN); 127 double *Iold = memoryManager::allocate<double>(NN); 128 129 memset(I, 0, NN * sizeof(double)); 130 memset(Iold, 0, NN * sizeof(double)); 131 132 printf("Standard C++ Loop \n"); 133 resI2 = 1; 134 iteration = 0; 135 136 while (resI2 > tol * tol) { 137 // 138 // Jacobi Iteration 139 // 140 for (int n = 1; n <= N; ++n) { 141 for (int m = 1; m <= N; ++m) { 142 double x = gridx.o + m * gridx.h; 143 double y = gridx.o + n * gridx.h; 144 145 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 146 147 int id = n * (N + 2) + m; 148 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 149 } 150 } 151 152 // 153 // Compute residual and update Iold 154 // 155 resI2 = 0.0; 156 for (int k = 0; k < NN; k++) { 157 resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 158 Iold[k] = I[k]; 159 } 160 161 if (iteration > maxIter) { 162 printf("Standard C++ Loop - Maxed out on iterations \n"); 163 exit(-1); 164 } 165 166 iteration++; 167 } 168 computeErr(I, gridx); 169 printf("No of iterations: %d \n \n", iteration); 170 171 // 172 // RAJA loop calls may be shortened by predefining policies 173 // 174 RAJA::RangeSegment gridRange(0, NN); 175 RAJA::RangeSegment jacobiRange(1, (N + 1)); 176 177 using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>; 178 179 printf("RAJA: Sequential Policy - Nested ForallN \n"); 180 resI2 = 1; 181 iteration = 0; 182 memset(I, 0, NN * sizeof(double)); 183 memset(Iold, 0, NN * sizeof(double)); 184 185 /* 186 * Sequential Jacobi Iteration. 187 * 188 * Note that a RAJA ReduceSum object is used to accumulate the sum 189 * for the residual. Since the loop is run sequentially, this is 190 * not strictly necessary. It is done here for consistency and 191 * comparison with other RAJA variants in this example. 192 */ 193 while (resI2 > tol * tol) { 194 RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) { 195 double x = gridx.o + m * gridx.h; 196 double y = gridx.o + n * gridx.h; 197 198 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 199 200 int id = n * (N + 2) + m; 201 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 202 }); 203 204 RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0); 205 RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) { 206 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 207 Iold[k] = I[k]; 208 }); 209 210 resI2 = RAJA_resI2; 211 if (iteration > maxIter) { 212 printf("Jacobi: Sequential - Maxed out on iterations! \n"); 213 exit(-1); 214 } 215 iteration++; 216 } 217 computeErr(I, gridx); 218 printf("No of iterations: %d \n \n", iteration); 219 220 #if defined(RAJA_ENABLE_OPENMP) 221 printf("RAJA: OpenMP Policy - Nested ForallN \n"); 222 resI2 = 1; 223 iteration = 0; 224 memset(I, 0, NN * sizeof(double)); 225 memset(Iold, 0, NN * sizeof(double)); 226 227 /* 228 * OpenMP parallel Jacobi Iteration. 229 * 230 * ----[RAJA Policies]----------- 231 * RAJA::omp_collapse_for_exec - 232 * introduced a nested region 233 * 234 * Note that OpenMP RAJA ReduceSum object performs the reduction 235 * operation for the residual in a thread-safe manner. 236 */ 237 238 using jacobiOmpNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>; 239 240 while (resI2 > tol * tol) { 241 RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) { 242 double x = gridx.o + m * gridx.h; 243 double y = gridx.o + n * gridx.h; 244 245 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 246 247 int id = n * (N + 2) + m; 248 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 249 }); 250 251 RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0); 252 253 RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) { 254 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 255 Iold[k] = I[k]; 256 }); 257 258 resI2 = RAJA_resI2; 259 if (iteration > maxIter) { 260 printf("Jacobi: OpenMP - Maxed out on iterations! \n"); 261 exit(-1); 262 } 263 iteration++; 264 } 265 computeErr(I, gridx); 266 printf("No of iterations: %d \n \n", iteration); 267 #endif 268 269 #if defined(RAJA_ENABLE_CUDA) 270 /* 271 * CUDA Jacobi Iteration. 272 * 273 * ----[RAJA Policies]----------- 274 * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - 275 * define the mapping of loop iterations to GPU thread blocks 276 * 277 * Note that CUDA RAJA ReduceSum object performs the reduction 278 * operation for the residual in a thread-safe manner on the GPU. 279 */ 280 281 printf("RAJA: CUDA Policy - Nested ForallN \n"); 282 283 using jacobiCUDANestedPolicy = RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>; 284 285 resI2 = 1; 286 iteration = 0; 287 memset(I, 0, NN * sizeof(double)); 288 memset(Iold, 0, NN * sizeof(double)); 289 290 while (resI2 > tol * tol) { 291 // 292 // Jacobi Iteration 293 // 294 RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { 295 double x = gridx.o + m * gridx.h; 296 double y = gridx.o + n * gridx.h; 297 298 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 299 300 int id = n * (N + 2) + m; 301 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 302 }); 303 304 // 305 // Compute residual and update Iold 306 // 307 RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0); 308 RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { 309 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 310 Iold[k] = I[k]; 311 }); 312 313 resI2 = RAJA_resI2; 314 315 if (iteration > maxIter) { 316 printf("RAJA: CUDA - Maxed out on iterations! \n"); 317 exit(-1); 318 } 319 iteration++; 320 } 321 cudaDeviceSynchronize(); 322 computeErr(I, gridx); 323 printf("No of iterations: %d \n \n", iteration); 324 #endif 325 326 #if defined(RAJA_ENABLE_HIP) 327 /* 328 * HIP Jacobi Iteration. 329 * 330 * ----[RAJA Policies]----------- 331 * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - 332 * define the mapping of loop iterations to GPU thread blocks 333 * 334 * Note that HIP RAJA ReduceSum object performs the reduction 335 * operation for the residual in a thread-safe manner on the GPU. 336 */ 337 338 printf("RAJA: HIP Policy - Nested ForallN \n"); 339 340 using jacobiHIPNestedPolicy = RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>; 341 342 resI2 = 1; 343 iteration = 0; 344 memset(I, 0, NN * sizeof(double)); 345 memset(Iold, 0, NN * sizeof(double)); 346 347 double *d_I = memoryManager::allocate_gpu<double>(NN); 348 double *d_Iold = memoryManager::allocate_gpu<double>(NN); 349 hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice)); 350 hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice)); 351 352 while (resI2 > tol * tol) { 353 // 354 // Jacobi Iteration 355 // 356 RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { 357 double x = gridx.o + m * gridx.h; 358 double y = gridx.o + n * gridx.h; 359 360 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 361 362 int id = n * (N + 2) + m; 363 d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]); 364 }); 365 366 // 367 // Compute residual and update Iold 368 // 369 RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0); 370 RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { 371 RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]); 372 d_Iold[k] = d_I[k]; 373 }); 374 375 resI2 = RAJA_resI2; 376 377 if (iteration > maxIter) { 378 printf("RAJA: HIP - Maxed out on iterations! \n"); 379 exit(-1); 380 } 381 iteration++; 382 } 383 hipDeviceSynchronize(); 384 hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost)); 385 computeErr(I, gridx); 386 printf("No of iterations: %d \n \n", iteration); 387 388 memoryManager::deallocate_gpu(d_I); 389 memoryManager::deallocate_gpu(d_Iold); 390 #endif 391 392 memoryManager::deallocate(I); 393 memoryManager::deallocate(Iold); 394 395 return 0; 396 } 397 398 // 399 // Function for the anlytic solution 400 // 401 double solution(double x, double y) { 402 return x * y * exp(x - y) * (1 - x) * (1 - y); 403 } 404 405 // 406 // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf} 407 // 408 void computeErr(double *I, grid_s grid) { 409 RAJA::RangeSegment gridRange(0, grid.n); 410 RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0); 411 412 using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>; 413 414 RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange), [=](RAJA::Index_type ty, RAJA::Index_type tx) { 415 int id = tx + grid.n * ty; 416 double x = grid.o + tx * grid.h; 417 double y = grid.o + ty * grid.h; 418 double myErr = std::abs(I[id] - solution(x, y)); 419 tMax.max(myErr); 420 }); 421 422 double l2err = tMax; 423 printf("Max error = %lg, h = %f \n", l2err, grid.h); 424 } 425 426 /*TEST 427 428 test: 429 requires: raja !cuda 430 431 test: 432 suffix: 2 433 requires: raja cuda 434 435 TEST*/ 436