tests/raja/ex1.raja.cxx

d23021a0SBarry Smith//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
d23021a0SBarry Smith// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
d23021a0SBarry Smith// and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
d23021a0SBarry Smith//
d23021a0SBarry Smith// SPDX-License-Identifier: (BSD-3-Clause)
d23021a0SBarry Smith//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
d23021a0SBarry Smith
d23021a0SBarry Smith#include <cstdlib>
d23021a0SBarry Smith#include <cstdio>
d23021a0SBarry Smith#include <cstring>
d23021a0SBarry Smith
d23021a0SBarry Smith#include <iostream>
d23021a0SBarry Smith#include <cmath>
d23021a0SBarry Smith
d23021a0SBarry Smith#include "RAJA/RAJA.hpp"
d23021a0SBarry Smith
d23021a0SBarry Smith#include "memoryManager.hpp"
d23021a0SBarry Smith
d23021a0SBarry Smith/*
d23021a0SBarry Smith * Jacobi Example
d23021a0SBarry Smith *
d23021a0SBarry Smith * ----[Details]--------------------
d23021a0SBarry Smith * This code uses a five point finite difference stencil
d23021a0SBarry Smith * to discretize the following boundary value problem
d23021a0SBarry Smith *
d23021a0SBarry Smith * U_xx + U_yy = f on [0,1] x [0,1].
d23021a0SBarry Smith *
d23021a0SBarry Smith * The right-hand side is chosen to be
d23021a0SBarry Smith * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
d23021a0SBarry Smith *
d23021a0SBarry Smith * A structured grid is used to discretize the domain
d23021a0SBarry Smith * [0,1] x [0,1]. Values inside the domain are computed
d23021a0SBarry Smith * using the Jacobi method to solve the associated
d23021a0SBarry Smith * linear system. The scheme is invoked until the l_2
d23021a0SBarry Smith * difference of subsequent iterations is below a
d23021a0SBarry Smith * tolerance.
d23021a0SBarry Smith *
 * The scheme is implemented by allocating two arrays
 * (I, Iold), both initialized to zero. The first set of
 * nested for loops applies one iteration of the Jacobi
 * scheme. The scheme is only applied to the interior
 * nodes.
d23021a0SBarry Smith *
d23021a0SBarry Smith * The second set of nested for loops is used to
d23021a0SBarry Smith * update Iold and compute the l_2 norm of the
d23021a0SBarry Smith * difference of the iterates.
d23021a0SBarry Smith *
d23021a0SBarry Smith * Computing the l_2 norm requires a reduction operation.
d23021a0SBarry Smith * To simplify the reduction procedure, the RAJA API
d23021a0SBarry Smith * introduces thread safe variables.
d23021a0SBarry Smith *
d23021a0SBarry Smith * ----[RAJA Concepts]---------------
 * - RAJA::kernel nested loops
d23021a0SBarry Smith * - RAJA Reduction
d23021a0SBarry Smith *
d23021a0SBarry Smith */
d23021a0SBarry Smith
/*
 *  ----[Constant Values]-----
 * CUDA_BLOCK_SIZE - Number of threads per CUDA thread block; used as the
 *                   block size of the RAJA::cuda_exec policy.
 *                   Only defined when RAJA_ENABLE_CUDA is set.
 *
 * HIP_BLOCK_SIZE  - Number of threads per HIP thread block; used as the
 *                   block size of the RAJA::hip_exec policy.
 *                   Only defined when RAJA_ENABLE_HIP is set.
 */
#if defined(RAJA_ENABLE_CUDA)
constexpr int CUDA_BLOCK_SIZE = 256;
#endif

#if defined(RAJA_ENABLE_HIP)
constexpr int HIP_BLOCK_SIZE = 256;
#endif
d23021a0SBarry Smith
//
//  Struct to hold grid info for one cartesian dimension
//  o - Origin in a cartesian dimension
//  h - Spacing between grid points
//  n - Number of grid points
//
//  All members default to zero so that a default-constructed grid_s never
//  carries indeterminate values.
//
struct grid_s {
  double o = 0.0;
  double h = 0.0;
  int    n = 0;
};
d23021a0SBarry Smith
d23021a0SBarry Smith//
d23021a0SBarry Smith// ----[Functions]---------
d23021a0SBarry Smith// solution   - Function for the analytic solution
d23021a0SBarry Smith// computeErr - Displays the maximum error in the solution
d23021a0SBarry Smith//
d23021a0SBarry Smithdouble solution(double x, double y);
d23021a0SBarry Smithvoid   computeErr(double *I, grid_s grid);
d23021a0SBarry Smith
d71ae5a4SJacob Faibussowitschint main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
d71ae5a4SJacob Faibussowitsch{
d23021a0SBarry Smith  std::cout << "Jacobi Example" << std::endl;
d23021a0SBarry Smith
d23021a0SBarry Smith  /*
d23021a0SBarry Smith   * ----[Solver Parameters]------------
d23021a0SBarry Smith   * tol       - Method terminates once the norm is less than tol
d23021a0SBarry Smith   * N         - Number of unknown gridpoints per cartesian dimension
d23021a0SBarry Smith   * NN        - Total number of gridpoints on the grid
d23021a0SBarry Smith   * maxIter   - Maximum number of iterations to be taken
d23021a0SBarry Smith   *
d23021a0SBarry Smith   * resI2     - Residual
d23021a0SBarry Smith   * iteration - Iteration number
d23021a0SBarry Smith   * grid_s    - Struct with grid information for a cartesian dimension
d23021a0SBarry Smith  */
d23021a0SBarry Smith  double tol = 1e-10;
d23021a0SBarry Smith
d23021a0SBarry Smith  int N       = 50;
d23021a0SBarry Smith  int NN      = (N + 2) * (N + 2);
d23021a0SBarry Smith  int maxIter = 100000;
d23021a0SBarry Smith
d23021a0SBarry Smith  double resI2;
d23021a0SBarry Smith  int    iteration;
d23021a0SBarry Smith
d23021a0SBarry Smith  grid_s gridx;
d23021a0SBarry Smith  gridx.o = 0.0;
d23021a0SBarry Smith  gridx.h = 1.0 / (N + 1.0);
d23021a0SBarry Smith  gridx.n = N + 2;
d23021a0SBarry Smith
d23021a0SBarry Smith  //
d23021a0SBarry Smith  //I, Iold - Holds iterates of Jacobi method
d23021a0SBarry Smith  //
d23021a0SBarry Smith  double *I    = memoryManager::allocate<double>(NN);
d23021a0SBarry Smith  double *Iold = memoryManager::allocate<double>(NN);
d23021a0SBarry Smith
d23021a0SBarry Smith  memset(I, 0, NN * sizeof(double));
d23021a0SBarry Smith  memset(Iold, 0, NN * sizeof(double));
d23021a0SBarry Smith
d23021a0SBarry Smith  printf("Standard  C++ Loop \n");
d23021a0SBarry Smith  resI2     = 1;
d23021a0SBarry Smith  iteration = 0;
d23021a0SBarry Smith
d23021a0SBarry Smith  while (resI2 > tol * tol) {
d23021a0SBarry Smith    //
d23021a0SBarry Smith    // Jacobi Iteration
d23021a0SBarry Smith    //
d23021a0SBarry Smith    for (int n = 1; n <= N; ++n) {
d23021a0SBarry Smith      for (int m = 1; m <= N; ++m) {
d23021a0SBarry Smith        double x = gridx.o + m * gridx.h;
d23021a0SBarry Smith        double y = gridx.o + n * gridx.h;
d23021a0SBarry Smith
9371c9d4SSatish Balay        double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
d23021a0SBarry Smith
d23021a0SBarry Smith        int id = n * (N + 2) + m;
9371c9d4SSatish Balay        I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
d23021a0SBarry Smith      }
d23021a0SBarry Smith    }
d23021a0SBarry Smith
d23021a0SBarry Smith    //
d23021a0SBarry Smith    // Compute residual and update Iold
d23021a0SBarry Smith    //
d23021a0SBarry Smith    resI2 = 0.0;
d23021a0SBarry Smith    for (int k = 0; k < NN; k++) {
d23021a0SBarry Smith      resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
d23021a0SBarry Smith      Iold[k] = I[k];
d23021a0SBarry Smith    }
d23021a0SBarry Smith
d23021a0SBarry Smith    if (iteration > maxIter) {
d23021a0SBarry Smith      printf("Standard C++ Loop - Maxed out on iterations \n");
d23021a0SBarry Smith      exit(-1);
d23021a0SBarry Smith    }
d23021a0SBarry Smith
d23021a0SBarry Smith    iteration++;
d23021a0SBarry Smith  }
d23021a0SBarry Smith  computeErr(I, gridx);
d23021a0SBarry Smith  printf("No of iterations: %d \n \n", iteration);
d23021a0SBarry Smith
d23021a0SBarry Smith  //
d23021a0SBarry Smith  // RAJA loop calls may be shortened by predefining policies
d23021a0SBarry Smith  //
d23021a0SBarry Smith  RAJA::RangeSegment gridRange(0, NN);
57508eceSPierre Jolivet  RAJA::RangeSegment jacobiRange(1, N + 1);
d23021a0SBarry Smith
9371c9d4SSatish Balay  using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
d23021a0SBarry Smith
d23021a0SBarry Smith  printf("RAJA: Sequential Policy - Nested ForallN \n");
d23021a0SBarry Smith  resI2     = 1;
d23021a0SBarry Smith  iteration = 0;
d23021a0SBarry Smith  memset(I, 0, NN * sizeof(double));
d23021a0SBarry Smith  memset(Iold, 0, NN * sizeof(double));
d23021a0SBarry Smith
d23021a0SBarry Smith  /*
d23021a0SBarry Smith   *  Sequential Jacobi Iteration.
d23021a0SBarry Smith   *
d23021a0SBarry Smith   *  Note that a RAJA ReduceSum object is used to accumulate the sum
d23021a0SBarry Smith   *  for the residual. Since the loop is run sequentially, this is
d23021a0SBarry Smith   *  not strictly necessary. It is done here for consistency and
d23021a0SBarry Smith   *  comparison with other RAJA variants in this example.
d23021a0SBarry Smith   */
d23021a0SBarry Smith  while (resI2 > tol * tol) {
9371c9d4SSatish Balay    RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
d23021a0SBarry Smith      double x = gridx.o + m * gridx.h;
d23021a0SBarry Smith      double y = gridx.o + n * gridx.h;
d23021a0SBarry Smith
9371c9d4SSatish Balay      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
d23021a0SBarry Smith
d23021a0SBarry Smith      int id = n * (N + 2) + m;
9371c9d4SSatish Balay      I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
d23021a0SBarry Smith    });
d23021a0SBarry Smith
d23021a0SBarry Smith    RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
9371c9d4SSatish Balay    RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) {
d23021a0SBarry Smith      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
d23021a0SBarry Smith      Iold[k] = I[k];
d23021a0SBarry Smith    });
d23021a0SBarry Smith
d23021a0SBarry Smith    resI2 = RAJA_resI2;
d23021a0SBarry Smith    if (iteration > maxIter) {
d23021a0SBarry Smith      printf("Jacobi: Sequential - Maxed out on iterations! \n");
d23021a0SBarry Smith      exit(-1);
d23021a0SBarry Smith    }
d23021a0SBarry Smith    iteration++;
d23021a0SBarry Smith  }
d23021a0SBarry Smith  computeErr(I, gridx);
d23021a0SBarry Smith  printf("No of iterations: %d \n \n", iteration);
d23021a0SBarry Smith
d23021a0SBarry Smith#if defined(RAJA_ENABLE_OPENMP)
d23021a0SBarry Smith  printf("RAJA: OpenMP Policy - Nested ForallN \n");
d23021a0SBarry Smith  resI2     = 1;
d23021a0SBarry Smith  iteration = 0;
d23021a0SBarry Smith  memset(I, 0, NN * sizeof(double));
d23021a0SBarry Smith  memset(Iold, 0, NN * sizeof(double));
d23021a0SBarry Smith
d23021a0SBarry Smith  /*
d23021a0SBarry Smith   *  OpenMP parallel Jacobi Iteration.
d23021a0SBarry Smith   *
d23021a0SBarry Smith   *  ----[RAJA Policies]-----------
d23021a0SBarry Smith   *  RAJA::omp_collapse_for_exec -
d23021a0SBarry Smith   *  introduced a nested region
d23021a0SBarry Smith   *
d23021a0SBarry Smith   *  Note that OpenMP RAJA ReduceSum object performs the reduction
d23021a0SBarry Smith   *  operation for the residual in a thread-safe manner.
d23021a0SBarry Smith   */
d23021a0SBarry Smith
9371c9d4SSatish Balay  using jacobiOmpNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
d23021a0SBarry Smith
d23021a0SBarry Smith  while (resI2 > tol * tol) {
9371c9d4SSatish Balay    RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
d23021a0SBarry Smith      double x = gridx.o + m * gridx.h;
d23021a0SBarry Smith      double y = gridx.o + n * gridx.h;
d23021a0SBarry Smith
9371c9d4SSatish Balay      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
d23021a0SBarry Smith
d23021a0SBarry Smith      int id = n * (N + 2) + m;
9371c9d4SSatish Balay      I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
d23021a0SBarry Smith    });
d23021a0SBarry Smith
d23021a0SBarry Smith    RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);
d23021a0SBarry Smith
9371c9d4SSatish Balay    RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) {
d23021a0SBarry Smith      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
d23021a0SBarry Smith      Iold[k] = I[k];
d23021a0SBarry Smith    });
d23021a0SBarry Smith
d23021a0SBarry Smith    resI2 = RAJA_resI2;
d23021a0SBarry Smith    if (iteration > maxIter) {
d23021a0SBarry Smith      printf("Jacobi: OpenMP - Maxed out on iterations! \n");
d23021a0SBarry Smith      exit(-1);
d23021a0SBarry Smith    }
d23021a0SBarry Smith    iteration++;
d23021a0SBarry Smith  }
d23021a0SBarry Smith  computeErr(I, gridx);
d23021a0SBarry Smith  printf("No of iterations: %d \n \n", iteration);
d23021a0SBarry Smith#endif
d23021a0SBarry Smith
d23021a0SBarry Smith#if defined(RAJA_ENABLE_CUDA)
d23021a0SBarry Smith  /*
d23021a0SBarry Smith   *  CUDA Jacobi Iteration.
d23021a0SBarry Smith   *
d23021a0SBarry Smith   *  ----[RAJA Policies]-----------
d23021a0SBarry Smith   *  RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
d23021a0SBarry Smith   *  define the mapping of loop iterations to GPU thread blocks
d23021a0SBarry Smith   *
d23021a0SBarry Smith   *  Note that CUDA RAJA ReduceSum object performs the reduction
d23021a0SBarry Smith   *  operation for the residual in a thread-safe manner on the GPU.
d23021a0SBarry Smith   */
d23021a0SBarry Smith
d23021a0SBarry Smith  printf("RAJA: CUDA Policy - Nested ForallN \n");
d23021a0SBarry Smith
9371c9d4SSatish Balay  using jacobiCUDANestedPolicy = RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
d23021a0SBarry Smith
d23021a0SBarry Smith  resI2     = 1;
d23021a0SBarry Smith  iteration = 0;
d23021a0SBarry Smith  memset(I, 0, NN * sizeof(double));
d23021a0SBarry Smith  memset(Iold, 0, NN * sizeof(double));
d23021a0SBarry Smith
d23021a0SBarry Smith  while (resI2 > tol * tol) {
d23021a0SBarry Smith    //
d23021a0SBarry Smith    // Jacobi Iteration
d23021a0SBarry Smith    //
9371c9d4SSatish Balay    RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
d23021a0SBarry Smith      double x = gridx.o + m * gridx.h;
d23021a0SBarry Smith      double y = gridx.o + n * gridx.h;
d23021a0SBarry Smith
9371c9d4SSatish Balay      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
d23021a0SBarry Smith
d23021a0SBarry Smith      int id = n * (N + 2) + m;
9371c9d4SSatish Balay      I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
d23021a0SBarry Smith    });
d23021a0SBarry Smith
d23021a0SBarry Smith    //
d23021a0SBarry Smith    // Compute residual and update Iold
d23021a0SBarry Smith    //
d23021a0SBarry Smith    RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
9371c9d4SSatish Balay    RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
d23021a0SBarry Smith      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
d23021a0SBarry Smith      Iold[k] = I[k];
d23021a0SBarry Smith    });
d23021a0SBarry Smith
d23021a0SBarry Smith    resI2 = RAJA_resI2;
d23021a0SBarry Smith
d23021a0SBarry Smith    if (iteration > maxIter) {
d23021a0SBarry Smith      printf("RAJA: CUDA - Maxed out on iterations! \n");
d23021a0SBarry Smith      exit(-1);
d23021a0SBarry Smith    }
d23021a0SBarry Smith    iteration++;
d23021a0SBarry Smith  }
d23021a0SBarry Smith  cudaDeviceSynchronize();
d23021a0SBarry Smith  computeErr(I, gridx);
d23021a0SBarry Smith  printf("No of iterations: %d \n \n", iteration);
d23021a0SBarry Smith#endif
d23021a0SBarry Smith
d23021a0SBarry Smith#if defined(RAJA_ENABLE_HIP)
d23021a0SBarry Smith  /*
d23021a0SBarry Smith   *  HIP Jacobi Iteration.
d23021a0SBarry Smith   *
d23021a0SBarry Smith   *  ----[RAJA Policies]-----------
d23021a0SBarry Smith   *  RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
d23021a0SBarry Smith   *  define the mapping of loop iterations to GPU thread blocks
d23021a0SBarry Smith   *
d23021a0SBarry Smith   *  Note that HIP RAJA ReduceSum object performs the reduction
d23021a0SBarry Smith   *  operation for the residual in a thread-safe manner on the GPU.
d23021a0SBarry Smith   */
d23021a0SBarry Smith
d23021a0SBarry Smith  printf("RAJA: HIP Policy - Nested ForallN \n");
d23021a0SBarry Smith
9371c9d4SSatish Balay  using jacobiHIPNestedPolicy = RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
d23021a0SBarry Smith
d23021a0SBarry Smith  resI2     = 1;
d23021a0SBarry Smith  iteration = 0;
d23021a0SBarry Smith  memset(I, 0, NN * sizeof(double));
d23021a0SBarry Smith  memset(Iold, 0, NN * sizeof(double));
d23021a0SBarry Smith
d23021a0SBarry Smith  double *d_I    = memoryManager::allocate_gpu<double>(NN);
d23021a0SBarry Smith  double *d_Iold = memoryManager::allocate_gpu<double>(NN);
d23021a0SBarry Smith  hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
d23021a0SBarry Smith  hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));
d23021a0SBarry Smith
d23021a0SBarry Smith  while (resI2 > tol * tol) {
d23021a0SBarry Smith    //
d23021a0SBarry Smith    // Jacobi Iteration
d23021a0SBarry Smith    //
9371c9d4SSatish Balay    RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
d23021a0SBarry Smith      double x = gridx.o + m * gridx.h;
d23021a0SBarry Smith      double y = gridx.o + n * gridx.h;
d23021a0SBarry Smith
9371c9d4SSatish Balay      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
d23021a0SBarry Smith
d23021a0SBarry Smith      int id  = n * (N + 2) + m;
9371c9d4SSatish Balay      d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]);
d23021a0SBarry Smith    });
d23021a0SBarry Smith
d23021a0SBarry Smith    //
d23021a0SBarry Smith    // Compute residual and update Iold
d23021a0SBarry Smith    //
d23021a0SBarry Smith    RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
9371c9d4SSatish Balay    RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
d23021a0SBarry Smith      RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
d23021a0SBarry Smith      d_Iold[k] = d_I[k];
d23021a0SBarry Smith    });
d23021a0SBarry Smith
d23021a0SBarry Smith    resI2 = RAJA_resI2;
d23021a0SBarry Smith
d23021a0SBarry Smith    if (iteration > maxIter) {
d23021a0SBarry Smith      printf("RAJA: HIP - Maxed out on iterations! \n");
d23021a0SBarry Smith      exit(-1);
d23021a0SBarry Smith    }
d23021a0SBarry Smith    iteration++;
d23021a0SBarry Smith  }
d23021a0SBarry Smith  hipDeviceSynchronize();
d23021a0SBarry Smith  hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
d23021a0SBarry Smith  computeErr(I, gridx);
d23021a0SBarry Smith  printf("No of iterations: %d \n \n", iteration);
d23021a0SBarry Smith
d23021a0SBarry Smith  memoryManager::deallocate_gpu(d_I);
d23021a0SBarry Smith  memoryManager::deallocate_gpu(d_Iold);
d23021a0SBarry Smith#endif
d23021a0SBarry Smith
d23021a0SBarry Smith  memoryManager::deallocate(I);
d23021a0SBarry Smith  memoryManager::deallocate(Iold);
d23021a0SBarry Smith
d23021a0SBarry Smith  return 0;
d23021a0SBarry Smith}
d23021a0SBarry Smith
//
// Function for the analytic solution
//
// x, y - coordinates of the evaluation point
//
// Returns x * y * exp(x - y) * (1 - x) * (1 - y), which is zero whenever
// x or y equals 0 or 1.
//
double solution(double x, double y)
{
  return x * y * std::exp(x - y) * (1 - x) * (1 - y);
}
d23021a0SBarry Smith
d23021a0SBarry Smith//
d23021a0SBarry Smith// Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
d23021a0SBarry Smith//
d71ae5a4SJacob Faibussowitschvoid computeErr(double *I, grid_s grid)
d71ae5a4SJacob Faibussowitsch{
d23021a0SBarry Smith  RAJA::RangeSegment                        gridRange(0, grid.n);
d23021a0SBarry Smith  RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);
d23021a0SBarry Smith
9371c9d4SSatish Balay  using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
d23021a0SBarry Smith
9371c9d4SSatish Balay  RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange), [=](RAJA::Index_type ty, RAJA::Index_type tx) {
d23021a0SBarry Smith    int    id    = tx + grid.n * ty;
d23021a0SBarry Smith    double x     = grid.o + tx * grid.h;
d23021a0SBarry Smith    double y     = grid.o + ty * grid.h;
d23021a0SBarry Smith    double myErr = std::abs(I[id] - solution(x, y));
d23021a0SBarry Smith    tMax.max(myErr);
d23021a0SBarry Smith  });
d23021a0SBarry Smith
d23021a0SBarry Smith  double l2err = tMax;
d23021a0SBarry Smith  printf("Max error = %lg, h = %f \n", l2err, grid.h);
d23021a0SBarry Smith}
d23021a0SBarry Smith
d23021a0SBarry Smith/*TEST
d23021a0SBarry Smith
d23021a0SBarry Smith    test:
d23021a0SBarry Smith      requires: raja !cuda
*d2b7c016SPierre Jolivet      filter: sed -e "/RAJA: OpenMP Policy/,+3d"
d23021a0SBarry Smith
d23021a0SBarry Smith    test:
d23021a0SBarry Smith      suffix: 2
d23021a0SBarry Smith      requires: raja cuda
*d2b7c016SPierre Jolivet      filter: sed -e "/RAJA: OpenMP Policy/,+3d"
d23021a0SBarry Smith
d23021a0SBarry SmithTEST*/