1 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 2 // Copyright (c) 2016-21, Lawrence Livermore National Security, LLC 3 // and RAJA project contributors. See the RAJA/COPYRIGHT file for details. 4 // 5 // SPDX-License-Identifier: (BSD-3-Clause) 6 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// 7 8 #include <cstdlib> 9 #include <cstdio> 10 #include <cstring> 11 12 #include <iostream> 13 #include <cmath> 14 15 #include "RAJA/RAJA.hpp" 16 17 #include "memoryManager.hpp" 18 19 /* 20 * Jacobi Example 21 * 22 * ----[Details]-------------------- 23 * This code uses a five point finite difference stencil 24 * to discretize the following boundary value problem 25 * 26 * U_xx + U_yy = f on [0,1] x [0,1]. 27 * 28 * The right-hand side is chosen to be 29 * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y). 30 * 31 * A structured grid is used to discretize the domain 32 * [0,1] x [0,1]. Values inside the domain are computed 33 * using the Jacobi method to solve the associated 34 * linear system. The scheme is invoked until the l_2 35 * difference of subsequent iterations is below a 36 * tolerance. 37 * 38 * The scheme is implemented by allocating two arrays 39 * (I, Iold) and initialized to zero. The first set of 40 * nested for loops apply an iteration of the Jacobi 41 * scheme. The scheme is only applied to the interior 42 * nodes. 43 * 44 * The second set of nested for loops is used to 45 * update Iold and compute the l_2 norm of the 46 * difference of the iterates. 47 * 48 * Computing the l_2 norm requires a reduction operation. 49 * To simplify the reduction procedure, the RAJA API 50 * introduces thread safe variables. 51 * 52 * ----[RAJA Concepts]--------------- 53 * - Forall::nested loop 54 * - RAJA Reduction 55 * 56 */ 57 58 /* 59 * ----[Constant Values]----- 60 * CUDA_BLOCK_SIZE_X - Number of threads in the 61 * x-dimension of a cuda thread block 62 * 63 * CUDA_BLOCK_SIZE_Y - Number of threads in the 64 * y-dimension of a cuda thread block 65 * 66 * CUDA_BLOCK_SIZE - Number of threads per threads block 67 */ 68 #if defined(RAJA_ENABLE_CUDA) 69 const int CUDA_BLOCK_SIZE = 256; 70 #endif 71 72 #if defined(RAJA_ENABLE_HIP) 73 const int HIP_BLOCK_SIZE = 256; 74 #endif 75 76 // 77 // Struct to hold grid info 78 // o - Origin in a cartesian dimension 79 // h - Spacing between grid points 80 // n - Number of grid points 81 // 82 struct grid_s { 83 double o, h; 84 int n; 85 }; 86 87 // 88 // ----[Functions]--------- 89 // solution - Function for the analytic solution 90 // computeErr - Displays the maximum error in the solution 91 // 92 double solution(double x, double y); 93 void computeErr(double *I, grid_s grid); 94 95 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) 96 { 97 std::cout << "Jacobi Example" << std::endl; 98 99 /* 100 * ----[Solver Parameters]------------ 101 * tol - Method terminates once the norm is less than tol 102 * N - Number of unknown gridpoints per cartesian dimension 103 * NN - Total number of gridpoints on the grid 104 * maxIter - Maximum number of iterations to be taken 105 * 106 * resI2 - Residual 107 * iteration - Iteration number 108 * grid_s - Struct with grid information for a cartesian dimension 109 */ 110 double tol = 1e-10; 111 112 int N = 50; 113 int NN = (N + 2) * (N + 2); 114 int maxIter = 100000; 115 116 double resI2; 117 int iteration; 118 119 grid_s gridx; 120 gridx.o = 0.0; 121 gridx.h = 1.0 / (N + 1.0); 122 gridx.n = N + 2; 123 124 // 125 //I, Iold - Holds iterates of Jacobi method 126 // 127 double *I = memoryManager::allocate<double>(NN); 128 double *Iold = memoryManager::allocate<double>(NN); 129 130 memset(I, 0, NN * sizeof(double)); 131 memset(Iold, 0, NN * sizeof(double)); 132 133 printf("Standard C++ Loop \n"); 134 resI2 = 1; 135 iteration = 0; 136 137 while (resI2 > tol * tol) { 138 // 139 // Jacobi Iteration 140 // 141 for (int n = 1; n <= N; ++n) { 142 for (int m = 1; m <= N; ++m) { 143 double x = gridx.o + m * gridx.h; 144 double y = gridx.o + n * gridx.h; 145 146 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 147 148 int id = n * (N + 2) + m; 149 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 150 } 151 } 152 153 // 154 // Compute residual and update Iold 155 // 156 resI2 = 0.0; 157 for (int k = 0; k < NN; k++) { 158 resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 159 Iold[k] = I[k]; 160 } 161 162 if (iteration > maxIter) { 163 printf("Standard C++ Loop - Maxed out on iterations \n"); 164 exit(-1); 165 } 166 167 iteration++; 168 } 169 computeErr(I, gridx); 170 printf("No of iterations: %d \n \n", iteration); 171 172 // 173 // RAJA loop calls may be shortened by predefining policies 174 // 175 RAJA::RangeSegment gridRange(0, NN); 176 RAJA::RangeSegment jacobiRange(1, N + 1); 177 178 using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>; 179 180 printf("RAJA: Sequential Policy - Nested ForallN \n"); 181 resI2 = 1; 182 iteration = 0; 183 memset(I, 0, NN * sizeof(double)); 184 memset(Iold, 0, NN * sizeof(double)); 185 186 /* 187 * Sequential Jacobi Iteration. 188 * 189 * Note that a RAJA ReduceSum object is used to accumulate the sum 190 * for the residual. Since the loop is run sequentially, this is 191 * not strictly necessary. It is done here for consistency and 192 * comparison with other RAJA variants in this example. 193 */ 194 while (resI2 > tol * tol) { 195 RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) { 196 double x = gridx.o + m * gridx.h; 197 double y = gridx.o + n * gridx.h; 198 199 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 200 201 int id = n * (N + 2) + m; 202 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 203 }); 204 205 RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0); 206 RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) { 207 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 208 Iold[k] = I[k]; 209 }); 210 211 resI2 = RAJA_resI2; 212 if (iteration > maxIter) { 213 printf("Jacobi: Sequential - Maxed out on iterations! \n"); 214 exit(-1); 215 } 216 iteration++; 217 } 218 computeErr(I, gridx); 219 printf("No of iterations: %d \n \n", iteration); 220 221 #if defined(RAJA_ENABLE_OPENMP) 222 printf("RAJA: OpenMP Policy - Nested ForallN \n"); 223 resI2 = 1; 224 iteration = 0; 225 memset(I, 0, NN * sizeof(double)); 226 memset(Iold, 0, NN * sizeof(double)); 227 228 /* 229 * OpenMP parallel Jacobi Iteration. 230 * 231 * ----[RAJA Policies]----------- 232 * RAJA::omp_collapse_for_exec - 233 * introduced a nested region 234 * 235 * Note that OpenMP RAJA ReduceSum object performs the reduction 236 * operation for the residual in a thread-safe manner. 237 */ 238 239 using jacobiOmpNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>; 240 241 while (resI2 > tol * tol) { 242 RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) { 243 double x = gridx.o + m * gridx.h; 244 double y = gridx.o + n * gridx.h; 245 246 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 247 248 int id = n * (N + 2) + m; 249 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 250 }); 251 252 RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0); 253 254 RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) { 255 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 256 Iold[k] = I[k]; 257 }); 258 259 resI2 = RAJA_resI2; 260 if (iteration > maxIter) { 261 printf("Jacobi: OpenMP - Maxed out on iterations! \n"); 262 exit(-1); 263 } 264 iteration++; 265 } 266 computeErr(I, gridx); 267 printf("No of iterations: %d \n \n", iteration); 268 #endif 269 270 #if defined(RAJA_ENABLE_CUDA) 271 /* 272 * CUDA Jacobi Iteration. 273 * 274 * ----[RAJA Policies]----------- 275 * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - 276 * define the mapping of loop iterations to GPU thread blocks 277 * 278 * Note that CUDA RAJA ReduceSum object performs the reduction 279 * operation for the residual in a thread-safe manner on the GPU. 280 */ 281 282 printf("RAJA: CUDA Policy - Nested ForallN \n"); 283 284 using jacobiCUDANestedPolicy = RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>; 285 286 resI2 = 1; 287 iteration = 0; 288 memset(I, 0, NN * sizeof(double)); 289 memset(Iold, 0, NN * sizeof(double)); 290 291 while (resI2 > tol * tol) { 292 // 293 // Jacobi Iteration 294 // 295 RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { 296 double x = gridx.o + m * gridx.h; 297 double y = gridx.o + n * gridx.h; 298 299 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 300 301 int id = n * (N + 2) + m; 302 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]); 303 }); 304 305 // 306 // Compute residual and update Iold 307 // 308 RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0); 309 RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { 310 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]); 311 Iold[k] = I[k]; 312 }); 313 314 resI2 = RAJA_resI2; 315 316 if (iteration > maxIter) { 317 printf("RAJA: CUDA - Maxed out on iterations! \n"); 318 exit(-1); 319 } 320 iteration++; 321 } 322 cudaDeviceSynchronize(); 323 computeErr(I, gridx); 324 printf("No of iterations: %d \n \n", iteration); 325 #endif 326 327 #if defined(RAJA_ENABLE_HIP) 328 /* 329 * HIP Jacobi Iteration. 330 * 331 * ----[RAJA Policies]----------- 332 * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec - 333 * define the mapping of loop iterations to GPU thread blocks 334 * 335 * Note that HIP RAJA ReduceSum object performs the reduction 336 * operation for the residual in a thread-safe manner on the GPU. 337 */ 338 339 printf("RAJA: HIP Policy - Nested ForallN \n"); 340 341 using jacobiHIPNestedPolicy = RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>; 342 343 resI2 = 1; 344 iteration = 0; 345 memset(I, 0, NN * sizeof(double)); 346 memset(Iold, 0, NN * sizeof(double)); 347 348 double *d_I = memoryManager::allocate_gpu<double>(NN); 349 double *d_Iold = memoryManager::allocate_gpu<double>(NN); 350 hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice)); 351 hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice)); 352 353 while (resI2 > tol * tol) { 354 // 355 // Jacobi Iteration 356 // 357 RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) { 358 double x = gridx.o + m * gridx.h; 359 double y = gridx.o + n * gridx.h; 360 361 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y)); 362 363 int id = n * (N + 2) + m; 364 d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]); 365 }); 366 367 // 368 // Compute residual and update Iold 369 // 370 RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0); 371 RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) { 372 RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]); 373 d_Iold[k] = d_I[k]; 374 }); 375 376 resI2 = RAJA_resI2; 377 378 if (iteration > maxIter) { 379 printf("RAJA: HIP - Maxed out on iterations! \n"); 380 exit(-1); 381 } 382 iteration++; 383 } 384 hipDeviceSynchronize(); 385 hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost)); 386 computeErr(I, gridx); 387 printf("No of iterations: %d \n \n", iteration); 388 389 memoryManager::deallocate_gpu(d_I); 390 memoryManager::deallocate_gpu(d_Iold); 391 #endif 392 393 memoryManager::deallocate(I); 394 memoryManager::deallocate(Iold); 395 396 return 0; 397 } 398 399 // 400 // Function for the analytic solution 401 // 402 double solution(double x, double y) 403 { 404 return x * y * exp(x - y) * (1 - x) * (1 - y); 405 } 406 407 // 408 // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf} 409 // 410 void computeErr(double *I, grid_s grid) 411 { 412 RAJA::RangeSegment gridRange(0, grid.n); 413 RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0); 414 415 using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>; 416 417 RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange), [=](RAJA::Index_type ty, RAJA::Index_type tx) { 418 int id = tx + grid.n * ty; 419 double x = grid.o + tx * grid.h; 420 double y = grid.o + ty * grid.h; 421 double myErr = std::abs(I[id] - solution(x, y)); 422 tMax.max(myErr); 423 }); 424 425 double l2err = tMax; 426 printf("Max error = %lg, h = %f \n", l2err, grid.h); 427 } 428 429 /*TEST 430 431 test: 432 requires: raja !cuda 433 filter: sed -e "/RAJA: OpenMP Policy/,+3d" 434 435 test: 436 suffix: 2 437 requires: raja cuda 438 filter: sed -e "/RAJA: OpenMP Policy/,+3d" 439 440 TEST*/ 441