1 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 2 // Copyright (c) 2016-21, Lawrence Livermore National Security, LLC 3 // and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 4 // 5 // SPDX-License-Identifier: (BSD-3-Clause) 6 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 7 8 #include <cstdlib> 9 #include <cstdio> 10 #include <cstring> 11 12 #include <iostream> 13 #include <cmath> 14 15 #include "RAJA/RAJA.hpp" 16 17 #include "memoryManager.hpp" 18 19 /* 20 * Jacobi Example 21 * 22 * ----[Details]-------------------- 23 * This code uses a five point finite difference stencil 24 * to discretize the following boundary value problem 25 * 26 * U_xx + U_yy = f on [0,1] x [0,1]. 27 * 28 * The right-hand side is chosen to be 29 * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y). 30 * 31 * A structured grid is used to discretize the domain 32 * [0,1] x [0,1]. Values inside the domain are computed 33 * using the Jacobi method to solve the associated 34 * linear system. The scheme is invoked until the l_2 35 * difference of subsequent iterations is below a 36 * tolerance. 37 * 38 * The scheme is implemented by allocating two arrays 39 * (I, Iold) and initialized to zero. The first set of 40 * nested for loops apply an iteration of the Jacobi 41 * scheme. The scheme is only applied to the interior 42 * nodes. 43 * 44 * The second set of nested for loops is used to 45 * update Iold and compute the l_2 norm of the 46 * difference of the iterates. 47 * 48 * Computing the l_2 norm requires a reduction operation. 49 * To simplify the reduction procedure, the RAJA API 50 * introduces thread safe variables. 51 * 52 * ----[RAJA Concepts]--------------- 53 * - Forall::nested loop 54 * - RAJA Reduction 55 * 56 */ 57 58 /* 59 * ----[Constant Values]----- 60 * CUDA_BLOCK_SIZE_X - Number of threads in the 61 * x-dimension of a cuda thread block 62 * 63 * CUDA_BLOCK_SIZE_Y - Number of threads in the 64 * y-dimension of a cuda thread block 65 * 66 * CUDA_BLOCK_SIZE - Number of threads per threads block 67 */ 68 #if defined(RAJA_ENABLE_CUDA) 69 const int CUDA_BLOCK_SIZE = 256; 70 #endif 71 72 #if defined(RAJA_ENABLE_HIP) 73 const int HIP_BLOCK_SIZE = 256; 74 #endif 75 76 // 77 // Struct to hold grid info 78 // o - Origin in a cartesian dimension 79 // h - Spacing between grid points 80 // n - Number of grid points 81 // 82 struct grid_s { 83 double o, h; 84 int n; 85 }; 86 87 // 88 // ----[Functions]--------- 89 // solution - Function for the analytic solution 90 // computeErr - Displays the maximum error in the solution 91 // 92 double solution(double x, double y); 93 void computeErr(double *I, grid_s grid); 94 95 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) 96 { 97 98 std::cout<<"Jacobi Example"<<std::endl; 99 100 /* 101 * ----[Solver Parameters]------------ 102 * tol - Method terminates once the norm is less than tol 103 * N - Number of unknown gridpoints per cartesian dimension 104 * NN - Total number of gridpoints on the grid 105 * maxIter - Maximum number of iterations to be taken 106 * 107 * resI2 - Residual 108 * iteration - Iteration number 109 * grid_s - Struct with grid information for a cartesian dimension 110 */ 111 double tol = 1e-10; 112 113 int N = 50; 114 int NN = (N + 2) * (N + 2); 115 int maxIter = 100000; 116 117 double resI2; 118 int iteration; 119 120 grid_s gridx; 121 gridx.o = 0.0; 122 gridx.h = 1.0 / (N + 1.0); 123 gridx.n = N + 2; 124 125 // 126 //I, Iold - Holds iterates of Jacobi method 127 // 128 double *I = memoryManager::allocate<double>(NN); 129 double *Iold = memoryManager::allocate<double>(NN); 130 131 memset(I, 0, NN * sizeof(double)); 132 memset(Iold, 0, NN * sizeof(double)); 133 134 printf("Standard C++ Loop \n"); 135 resI2 = 1; 136 iteration = 0; 137 138 while (resI2 > tol * tol) { 139 140 // 141 // Jacobi Iteration 142 // 143 for (int n = 1; n <= N; ++n) { 144 for (int m = 1; m <= N; ++m) { 145 146 double x = gridx.o + m * gridx.h; 147 double y = gridx.o + n * gridx.h; 148 149 double f = gridx.h * gridx.h 150 * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 151 152 int id = n * (N + 2) + m; 153 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] 154 + Iold[id + 1]); 155 } 156 } 157 158 // 159 // Compute residual and update Iold 160 // 161 resI2 = 0.0; 162 for (int k = 0; k < NN; k++) { 163 resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 164 Iold[k] = I[k]; 165 } 166 167 if (iteration > maxIter) { 168 printf("Standard C++ Loop - Maxed out on iterations \n"); 169 exit(-1); 170 } 171 172 iteration++; 173 } 174 computeErr(I, gridx); 175 printf("No of iterations: %d \n \n", iteration); 176 177 // 178 // RAJA loop calls may be shortened by predefining policies 179 // 180 RAJA::RangeSegment gridRange(0, NN); 181 RAJA::RangeSegment jacobiRange(1, (N + 1)); 182 183 using jacobiSeqNestedPolicy = RAJA::KernelPolicy< 184 RAJA::statement::For<1, RAJA::seq_exec, 185 RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>> > >; 186 187 printf("RAJA: Sequential Policy - Nested ForallN \n"); 188 resI2 = 1; 189 iteration = 0; 190 memset(I, 0, NN * sizeof(double)); 191 memset(Iold, 0, NN * sizeof(double)); 192 193 /* 194 * Sequential Jacobi Iteration. 195 * 196 * Note that a RAJA ReduceSum object is used to accumulate the sum 197 * for the residual. Since the loop is run sequentially, this is 198 * not strictly necessary. It is done here for consistency and 199 * comparison with other RAJA variants in this example. 200 */ 201 while (resI2 > tol * tol) { 202 203 RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange,jacobiRange), 204 [=] (RAJA::Index_type m, RAJA::Index_type n) { 205 206 double x = gridx.o + m * gridx.h; 207 double y = gridx.o + n * gridx.h; 208 209 double f = gridx.h * gridx.h 210 * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 211 212 int id = n * (N + 2) + m; 213 I[id] = 214 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] 215 + Iold[id + 1]); 216 }); 217 218 RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0); 219 RAJA::forall<RAJA::seq_exec>( 220 gridRange, [=](RAJA::Index_type k) { 221 222 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 223 Iold[k] = I[k]; 224 225 }); 226 227 resI2 = RAJA_resI2; 228 if (iteration > maxIter) { 229 printf("Jacobi: Sequential - Maxed out on iterations! \n"); 230 exit(-1); 231 } 232 iteration++; 233 } 234 computeErr(I, gridx); 235 printf("No of iterations: %d \n \n", iteration); 236 237 #if defined(RAJA_ENABLE_OPENMP) 238 printf("RAJA: OpenMP Policy - Nested ForallN \n"); 239 resI2 = 1; 240 iteration = 0; 241 memset(I, 0, NN * sizeof(double)); 242 memset(Iold, 0, NN * sizeof(double)); 243 244 /* 245 * OpenMP parallel Jacobi Iteration. 246 * 247 * ----[RAJA Policies]----------- 248 * RAJA::omp_collapse_for_exec - 249 * introduced a nested region 250 * 251 * Note that OpenMP RAJA ReduceSum object performs the reduction 252 * operation for the residual in a thread-safe manner. 253 */ 254 255 using jacobiOmpNestedPolicy = RAJA::KernelPolicy< 256 RAJA::statement::For<1, RAJA::omp_parallel_for_exec, 257 RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; 258 259 while (resI2 > tol * tol) { 260 261 RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange,jacobiRange), 262 [=] (RAJA::Index_type m, RAJA::Index_type n) { 263 264 double x = gridx.o + m * gridx.h; 265 double y = gridx.o + n * gridx.h; 266 267 double f = gridx.h * gridx.h * 268 (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 269 270 int id = n * (N + 2) + m; 271 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + 272 Iold[id - 1] + Iold[id + 1]); 273 }); 274 275 RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0); 276 277 RAJA::forall<RAJA::omp_parallel_for_exec>( gridRange, 278 [=](RAJA::Index_type k) { 279 280 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 281 Iold[k] = I[k]; 282 283 }); 284 285 resI2 = RAJA_resI2; 286 if (iteration > maxIter) { 287 printf("Jacobi: OpenMP - Maxed out on iterations! \n"); 288 exit(-1); 289 } 290 iteration++; 291 } 292 computeErr(I, gridx); 293 printf("No of iterations: %d \n \n", iteration); 294 #endif 295 296 #if defined(RAJA_ENABLE_CUDA) 297 /* 298 * CUDA Jacobi Iteration. 299 * 300 * ----[RAJA Policies]----------- 301 * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - 302 * define the mapping of loop iterations to GPU thread blocks 303 * 304 * Note that CUDA RAJA ReduceSum object performs the reduction 305 * operation for the residual in a thread-safe manner on the GPU. 306 */ 307 308 printf("RAJA: CUDA Policy - Nested ForallN \n"); 309 310 using jacobiCUDANestedPolicy = RAJA::KernelPolicy< 311 RAJA::statement::CudaKernel< 312 RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, 313 RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, 314 RAJA::statement::For<1, RAJA::cuda_thread_y_direct, 315 RAJA::statement::For<0, RAJA::cuda_thread_x_direct, 316 RAJA::statement::Lambda<0> 317 > 318 > 319 > 320 > 321 > >; 322 323 resI2 = 1; 324 iteration = 0; 325 memset(I, 0, NN * sizeof(double)); 326 memset(Iold, 0, NN * sizeof(double)); 327 328 while (resI2 > tol * tol) { 329 330 // 331 // Jacobi Iteration 332 // 333 RAJA::kernel<jacobiCUDANestedPolicy>( 334 RAJA::make_tuple(jacobiRange,jacobiRange), 335 [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) { 336 337 double x = gridx.o + m * gridx.h; 338 double y = gridx.o + n * gridx.h; 339 340 double f = gridx.h * gridx.h 341 * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 342 343 int id = n * (N + 2) + m; 344 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] 345 + Iold[id + 1]); 346 }); 347 348 // 349 // Compute residual and update Iold 350 // 351 RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0); 352 RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>( 353 gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { 354 355 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 356 Iold[k] = I[k]; 357 358 }); 359 360 resI2 = RAJA_resI2; 361 362 if (iteration > maxIter) { 363 printf("RAJA: CUDA - Maxed out on iterations! \n"); 364 exit(-1); 365 } 366 iteration++; 367 } 368 cudaDeviceSynchronize(); 369 computeErr(I, gridx); 370 printf("No of iterations: %d \n \n", iteration); 371 #endif 372 373 #if defined(RAJA_ENABLE_HIP) 374 /* 375 * HIP Jacobi Iteration. 376 * 377 * ----[RAJA Policies]----------- 378 * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - 379 * define the mapping of loop iterations to GPU thread blocks 380 * 381 * Note that HIP RAJA ReduceSum object performs the reduction 382 * operation for the residual in a thread-safe manner on the GPU. 383 */ 384 385 printf("RAJA: HIP Policy - Nested ForallN \n"); 386 387 using jacobiHIPNestedPolicy = RAJA::KernelPolicy< 388 RAJA::statement::HipKernel< 389 RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, 390 RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, 391 RAJA::statement::For<1, RAJA::hip_thread_y_direct, 392 RAJA::statement::For<0, RAJA::hip_thread_x_direct, 393 RAJA::statement::Lambda<0> 394 > 395 > 396 > 397 > 398 > >; 399 400 resI2 = 1; 401 iteration = 0; 402 memset(I, 0, NN * sizeof(double)); 403 memset(Iold, 0, NN * sizeof(double)); 404 405 double *d_I = memoryManager::allocate_gpu<double>(NN); 406 double *d_Iold = memoryManager::allocate_gpu<double>(NN); 407 hipErrchk(hipMemcpy( d_I, I, NN * sizeof(double), hipMemcpyHostToDevice)); 408 hipErrchk(hipMemcpy( d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice)); 409 410 while (resI2 > tol * tol) { 411 412 // 413 // Jacobi Iteration 414 // 415 RAJA::kernel<jacobiHIPNestedPolicy>( 416 RAJA::make_tuple(jacobiRange,jacobiRange), 417 [=] RAJA_DEVICE (RAJA::Index_type m, RAJA::Index_type n) { 418 419 double x = gridx.o + m * gridx.h; 420 double y = gridx.o + n * gridx.h; 421 422 double f = gridx.h * gridx.h 423 * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 424 425 int id = n * (N + 2) + m; 426 d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] 427 + d_Iold[id + 1]); 428 }); 429 430 // 431 // Compute residual and update Iold 432 // 433 RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0); 434 RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>( 435 gridRange, [=] RAJA_DEVICE (RAJA::Index_type k) { 436 437 RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]); 438 d_Iold[k] = d_I[k]; 439 440 }); 441 442 resI2 = RAJA_resI2; 443 444 if (iteration > maxIter) { 445 printf("RAJA: HIP - Maxed out on iterations! \n"); 446 exit(-1); 447 } 448 iteration++; 449 } 450 hipDeviceSynchronize(); 451 hipErrchk(hipMemcpy( I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost)); 452 computeErr(I, gridx); 453 printf("No of iterations: %d \n \n", iteration); 454 455 memoryManager::deallocate_gpu(d_I); 456 memoryManager::deallocate_gpu(d_Iold); 457 #endif 458 459 memoryManager::deallocate(I); 460 memoryManager::deallocate(Iold); 461 462 return 0; 463 } 464 465 // 466 // Function for the anlytic solution 467 // 468 double solution(double x, double y) 469 { 470 return x * y * exp(x - y) * (1 - x) * (1 - y); 471 } 472 473 // 474 // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf} 475 // 476 void computeErr(double *I, grid_s grid) 477 { 478 479 RAJA::RangeSegment gridRange(0, grid.n); 480 RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0); 481 482 using jacobiSeqNestedPolicy = RAJA::KernelPolicy< 483 RAJA::statement::For<1, RAJA::seq_exec, 484 RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0> > > >; 485 486 RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange,gridRange), 487 [=] (RAJA::Index_type ty, RAJA::Index_type tx) { 488 489 int id = tx + grid.n * ty; 490 double x = grid.o + tx * grid.h; 491 double y = grid.o + ty * grid.h; 492 double myErr = std::abs(I[id] - solution(x, y)); 493 tMax.max(myErr); 494 }); 495 496 double l2err = tMax; 497 printf("Max error = %lg, h = %f \n", l2err, grid.h); 498 } 499 500 /*TEST 501 502 test: 503 requires: raja !cuda 504 505 test: 506 suffix: 2 507 requires: raja cuda 508 509 TEST*/ 510