1d16a9515SJunchao Zhang static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n"; 2d16a9515SJunchao Zhang 3d16a9515SJunchao Zhang /* 4d16a9515SJunchao Zhang On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos: 5d16a9515SJunchao Zhang Time (sec.), on Mar. 1, 2022 6d16a9515SJunchao Zhang ------------------------------------------ 7d16a9515SJunchao Zhang n PETSc C Kokkos 8d16a9515SJunchao Zhang ------------------------------------------ 9d16a9515SJunchao Zhang 32 4.6464E-05 4.7451E-05 1.6880E-04 10d16a9515SJunchao Zhang 64 2.5654E-04 2.5164E-04 5.6780E-04 11d16a9515SJunchao Zhang 128 1.9383E-03 1.8878E-03 4.7938E-03 12d16a9515SJunchao Zhang 256 1.4450E-02 1.3619E-02 3.7778E-02 13d16a9515SJunchao Zhang 512 1.1580E-01 1.1551E-01 2.8428E-01 14d16a9515SJunchao Zhang 1024 1.4179E+00 1.3772E+00 2.2873E+00 15d16a9515SJunchao Zhang 16d16a9515SJunchao Zhang Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc 17d16a9515SJunchao Zhang */ 18d16a9515SJunchao Zhang 19d16a9515SJunchao Zhang #include <petscdmda_kokkos.hpp> 20d16a9515SJunchao Zhang #include <petscdm.h> 21d16a9515SJunchao Zhang #include <petscdmda.h> 22d16a9515SJunchao Zhang 23d16a9515SJunchao Zhang using namespace Kokkos; 24d16a9515SJunchao Zhang using PetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<PetscScalar***,Kokkos::LayoutRight,Kokkos::HostSpace>; 25d16a9515SJunchao Zhang using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar***, Kokkos::LayoutRight,Kokkos::HostSpace>; 26d16a9515SJunchao Zhang 27d16a9515SJunchao Zhang /* PETSc multi-dimensional array access */ 28d16a9515SJunchao Zhang static PetscErrorCode Update1(DM da,const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime) 29d16a9515SJunchao Zhang { 30d16a9515SJunchao Zhang PetscInt it,i,j,k; 31d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0,tend; 32d16a9515SJunchao Zhang PetscInt xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs; 33d16a9515SJunchao Zhang 34d16a9515SJunchao Zhang PetscFunctionBegin; 35*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm)); 36*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm)); 37d16a9515SJunchao Zhang for (it=0; it<nwarm+nloop; it++) { 38*5f80ce2aSJacob Faibussowitsch if (it == nwarm) CHKERRQ(PetscTime(&tstart)); 39d16a9515SJunchao Zhang for (k=zs; k<zs+zm; k++) { 40d16a9515SJunchao Zhang for (j=ys; j<ys+ym; j++) { 41d16a9515SJunchao Zhang for (i=xs; i<xs+xm; i++) { 42d16a9515SJunchao Zhang y1[k][j][i] = 6*x1[k][j][i] - x1[k-1][j][i] - x1[k][j-1][i] - x1[k][j][i-1] 43d16a9515SJunchao Zhang - x1[k+1][j][i] - x1[k][j+1][i] - x1[k][j][i+1]; 44d16a9515SJunchao Zhang } 45d16a9515SJunchao Zhang } 46d16a9515SJunchao Zhang } 47d16a9515SJunchao Zhang } 48*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTime(&tend)); 49d16a9515SJunchao Zhang *avgTime = (tend - tstart)/nloop; 50*5f80ce2aSJacob Faibussowitsch PetscFunctionReturn(0); 51d16a9515SJunchao Zhang } 52d16a9515SJunchao Zhang 53d16a9515SJunchao Zhang /* C multi-dimensional array access */ 54d16a9515SJunchao Zhang static PetscErrorCode Update2(DM da,const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime) 55d16a9515SJunchao Zhang { 56d16a9515SJunchao Zhang PetscInt it,i,j,k; 57d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0,tend; 58d16a9515SJunchao Zhang PetscInt xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs; 59d16a9515SJunchao Zhang 60d16a9515SJunchao Zhang PetscFunctionBegin; 61*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm)); 62*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm)); 63d16a9515SJunchao Zhang #define X2(k,j,i) x2[(k-gzs)*gym*gxm+(j-gys)*gxm+(i-gxs)] 64d16a9515SJunchao Zhang #define Y2(k,j,i) y2[(k-zs)*ym*xm+(j-ys)*xm+(i-xs)] 65d16a9515SJunchao Zhang for (it=0; it<nwarm+nloop; it++) { 66*5f80ce2aSJacob Faibussowitsch if (it == nwarm) CHKERRQ(PetscTime(&tstart)); 67d16a9515SJunchao Zhang for (k=zs; k<zs+zm; k++) { 68d16a9515SJunchao Zhang for (j=ys; j<ys+ym; j++) { 69d16a9515SJunchao Zhang for (i=xs; i<xs+xm; i++) { 70d16a9515SJunchao Zhang Y2(k,j,i) = 6*X2(k,j,i) - X2(k-1,j,i) - X2(k,j-1,i) - X2(k,j,i-1) 71d16a9515SJunchao Zhang - X2(k+1,j,i) - X2(k,j+1,i) - X2(k,j,i+1); 72d16a9515SJunchao Zhang } 73d16a9515SJunchao Zhang } 74d16a9515SJunchao Zhang } 75d16a9515SJunchao Zhang } 76*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTime(&tend)); 77d16a9515SJunchao Zhang *avgTime = (tend - tstart)/nloop; 78d16a9515SJunchao Zhang #undef X2 79d16a9515SJunchao Zhang #undef Y2 80*5f80ce2aSJacob Faibussowitsch PetscFunctionReturn(0); 81d16a9515SJunchao Zhang } 82d16a9515SJunchao Zhang 83d16a9515SJunchao Zhang int main(int argc,char **argv) 84d16a9515SJunchao Zhang { 85d16a9515SJunchao Zhang PetscErrorCode ierr; 86d16a9515SJunchao Zhang DM da; 87d16a9515SJunchao Zhang PetscInt xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs; 88d16a9515SJunchao Zhang PetscInt dof = 1,sw = 1; 89d16a9515SJunchao Zhang DMBoundaryType bx = DM_BOUNDARY_PERIODIC,by = DM_BOUNDARY_PERIODIC,bz = DM_BOUNDARY_PERIODIC; 90d16a9515SJunchao Zhang DMDAStencilType st = DMDA_STENCIL_STAR; 91d16a9515SJunchao Zhang Vec x,y; /* local/global vectors of the da */ 92d16a9515SJunchao Zhang PetscRandom rctx; 93d16a9515SJunchao Zhang const PetscScalar ***x1; 94d16a9515SJunchao Zhang PetscScalar ***y1; /* arrays of g, ll */ 95d16a9515SJunchao Zhang const PetscScalar *x2; 96d16a9515SJunchao Zhang PetscScalar *y2; 97d16a9515SJunchao Zhang ConstPetscScalarKokkosOffsetView3D x3; 98d16a9515SJunchao Zhang PetscScalarKokkosOffsetView3D y3; 99d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0,tend = 0.0,avgTime = 0.0; 100d16a9515SJunchao Zhang PetscInt nwarm = 2, nloop = 10; 101d16a9515SJunchao Zhang PetscInt min = 32, max = 32*8; /* min and max sizes of the grids to sample */ 102d16a9515SJunchao Zhang 103d16a9515SJunchao Zhang ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr; 104*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscRandomCreate(PETSC_COMM_WORLD,&rctx)); 105*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsGetInt(NULL,NULL,"-min",&min,NULL)); 106*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsGetInt(NULL,NULL,"-max",&max,NULL)); 107d16a9515SJunchao Zhang 108d16a9515SJunchao Zhang for (PetscInt len=min; len<=max; len=len*2) { 109*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDACreate3d(PETSC_COMM_WORLD,bx,by,bz,st,len,len,len,PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE,dof,sw,0,0,0,&da)); 110*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMSetFromOptions(da)); 111*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMSetUp(da)); 112d16a9515SJunchao Zhang 113*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm)); 114*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm)); 115*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMCreateLocalVector(da,&x)); /* Create local x and global y */ 116*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMCreateGlobalVector(da,&y)); 117d16a9515SJunchao Zhang 118d16a9515SJunchao Zhang /* Access with petsc multi-dimensional arrays */ 119*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecSetRandom(x,rctx)); 120*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecSet(y,0.0)); 121*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAVecGetArrayRead(da,x,&x1)); 122*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAVecGetArray(da,y,&y1)); 123*5f80ce2aSJacob Faibussowitsch CHKERRQ(Update1(da,x1,y1,nwarm,nloop,&avgTime)); 124*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAVecRestoreArrayRead(da,x,&x1)); 125*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAVecRestoreArray(da,y,&y1)); 126*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTime(&tend)); 127*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- PETSc average time = %e\n",len,avgTime)); 128d16a9515SJunchao Zhang 129d16a9515SJunchao Zhang /* Access with C multi-dimensional arrays */ 130*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecSetRandom(x,rctx)); 131*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecSet(y,0.0)); 132*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetArrayRead(x,&x2)); 133*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecGetArray(y,&y2)); 134*5f80ce2aSJacob Faibussowitsch CHKERRQ(Update2(da,x2,y2,nwarm,nloop,&avgTime)); 135*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreArrayRead(x,&x2)); 136*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecRestoreArray(y,&y2)); 137*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- C average time = %e\n",len,avgTime)); 138d16a9515SJunchao Zhang 139d16a9515SJunchao Zhang /* Access with Kokkos multi-dimensional OffsetViews */ 140*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecSet(y,0.0)); 141*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecSetRandom(x,rctx)); 142*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAVecGetKokkosOffsetView(da,x,&x3)); 143*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAVecGetKokkosOffsetView(da,y,&y3)); 144d16a9515SJunchao Zhang 145d16a9515SJunchao Zhang for (PetscInt it=0; it<nwarm+nloop; it++) { 146*5f80ce2aSJacob Faibussowitsch if (it == nwarm) CHKERRQ(PetscTime(&tstart)); 147d16a9515SJunchao Zhang Kokkos::parallel_for("stencil",MDRangePolicy<Kokkos::DefaultHostExecutionSpace,Rank<3,Iterate::Right,Iterate::Right>>({zs,ys,xs},{zs+zm,ys+ym,xs+xm}), 148d16a9515SJunchao Zhang KOKKOS_LAMBDA(PetscInt k,PetscInt j,PetscInt i) { 149d16a9515SJunchao Zhang y3(k,j,i) = 6*x3(k,j,i) - x3(k-1,j,i) - x3(k,j-1,i) - x3(k,j,i-1) 150d16a9515SJunchao Zhang - x3(k+1,j,i) - x3(k,j+1,i) - x3(k,j,i+1); 151d16a9515SJunchao Zhang }); 152d16a9515SJunchao Zhang } 153*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTime(&tend)); 154*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAVecRestoreKokkosOffsetView(da,x,&x3)); 155*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDAVecRestoreKokkosOffsetView(da,y,&y3)); 156d16a9515SJunchao Zhang avgTime = (tend - tstart)/nloop; 157*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- Kokkos average time = %e\n",len,avgTime)); 158d16a9515SJunchao Zhang 159*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecDestroy(&x)); 160*5f80ce2aSJacob Faibussowitsch CHKERRQ(VecDestroy(&y)); 161*5f80ce2aSJacob Faibussowitsch CHKERRQ(DMDestroy(&da)); 162d16a9515SJunchao Zhang } 163*5f80ce2aSJacob Faibussowitsch CHKERRQ(PetscRandomDestroy(&rctx)); 164d16a9515SJunchao Zhang ierr = PetscFinalize(); 165d16a9515SJunchao Zhang return ierr; 166d16a9515SJunchao Zhang } 167d16a9515SJunchao Zhang 168d16a9515SJunchao Zhang /*TEST 169d16a9515SJunchao Zhang build: 170d16a9515SJunchao Zhang requires: kokkos_kernels 171d16a9515SJunchao Zhang 172d16a9515SJunchao Zhang test: 173d16a9515SJunchao Zhang suffix: 1 174d16a9515SJunchao Zhang requires: kokkos_kernels 175d16a9515SJunchao Zhang args: -min 32 -max 32 -dm_vec_type kokkos 176d16a9515SJunchao Zhang filter: grep -v "time" 177d16a9515SJunchao Zhang 178d16a9515SJunchao Zhang TEST*/ 179