1*d16a9515SJunchao Zhang static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n"; 2*d16a9515SJunchao Zhang 3*d16a9515SJunchao Zhang /* 4*d16a9515SJunchao Zhang On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos: 5*d16a9515SJunchao Zhang Time (sec.), on Mar. 1, 2022 6*d16a9515SJunchao Zhang ------------------------------------------ 7*d16a9515SJunchao Zhang n PETSc C Kokkos 8*d16a9515SJunchao Zhang ------------------------------------------ 9*d16a9515SJunchao Zhang 32 4.6464E-05 4.7451E-05 1.6880E-04 10*d16a9515SJunchao Zhang 64 2.5654E-04 2.5164E-04 5.6780E-04 11*d16a9515SJunchao Zhang 128 1.9383E-03 1.8878E-03 4.7938E-03 12*d16a9515SJunchao Zhang 256 1.4450E-02 1.3619E-02 3.7778E-02 13*d16a9515SJunchao Zhang 512 1.1580E-01 1.1551E-01 2.8428E-01 14*d16a9515SJunchao Zhang 1024 1.4179E+00 1.3772E+00 2.2873E+00 15*d16a9515SJunchao Zhang 16*d16a9515SJunchao Zhang Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc 17*d16a9515SJunchao Zhang */ 18*d16a9515SJunchao Zhang 19*d16a9515SJunchao Zhang #include <petscdmda_kokkos.hpp> 20*d16a9515SJunchao Zhang #include <petscdm.h> 21*d16a9515SJunchao Zhang #include <petscdmda.h> 22*d16a9515SJunchao Zhang 23*d16a9515SJunchao Zhang using namespace Kokkos; 24*d16a9515SJunchao Zhang using PetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<PetscScalar***,Kokkos::LayoutRight,Kokkos::HostSpace>; 25*d16a9515SJunchao Zhang using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar***, Kokkos::LayoutRight,Kokkos::HostSpace>; 26*d16a9515SJunchao Zhang 27*d16a9515SJunchao Zhang /* PETSc multi-dimensional array access */ 28*d16a9515SJunchao Zhang static PetscErrorCode Update1(DM da,const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime) 29*d16a9515SJunchao Zhang { 30*d16a9515SJunchao Zhang PetscErrorCode ierr; 31*d16a9515SJunchao Zhang PetscInt it,i,j,k; 32*d16a9515SJunchao Zhang PetscLogDouble tstart=0.0,tend; 33*d16a9515SJunchao Zhang PetscInt xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs; 34*d16a9515SJunchao Zhang 35*d16a9515SJunchao Zhang PetscFunctionBegin; 36*d16a9515SJunchao Zhang ierr = DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm);CHKERRQ(ierr); 37*d16a9515SJunchao Zhang ierr = DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm);CHKERRQ(ierr); 38*d16a9515SJunchao Zhang for (it=0; it<nwarm+nloop; it++) { 39*d16a9515SJunchao Zhang if (it == nwarm) {ierr = PetscTime(&tstart);CHKERRQ(ierr);} 40*d16a9515SJunchao Zhang for (k=zs; k<zs+zm; k++) { 41*d16a9515SJunchao Zhang for (j=ys; j<ys+ym; j++) { 42*d16a9515SJunchao Zhang for (i=xs; i<xs+xm; i++) { 43*d16a9515SJunchao Zhang y1[k][j][i] = 6*x1[k][j][i] - x1[k-1][j][i] - x1[k][j-1][i] - x1[k][j][i-1] 44*d16a9515SJunchao Zhang - x1[k+1][j][i] - x1[k][j+1][i] - x1[k][j][i+1]; 45*d16a9515SJunchao Zhang } 46*d16a9515SJunchao Zhang } 47*d16a9515SJunchao Zhang } 48*d16a9515SJunchao Zhang } 49*d16a9515SJunchao Zhang ierr = PetscTime(&tend);CHKERRQ(ierr); 50*d16a9515SJunchao Zhang *avgTime = (tend - tstart)/nloop; 51*d16a9515SJunchao Zhang PetscFunctionReturn(ierr); 52*d16a9515SJunchao Zhang } 53*d16a9515SJunchao Zhang 54*d16a9515SJunchao Zhang /* C multi-dimensional array access */ 55*d16a9515SJunchao Zhang static PetscErrorCode Update2(DM da,const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime) 56*d16a9515SJunchao Zhang { 57*d16a9515SJunchao Zhang PetscErrorCode ierr; 58*d16a9515SJunchao Zhang PetscInt it,i,j,k; 59*d16a9515SJunchao Zhang PetscLogDouble tstart=0.0,tend; 60*d16a9515SJunchao Zhang PetscInt xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs; 61*d16a9515SJunchao Zhang 62*d16a9515SJunchao Zhang PetscFunctionBegin; 63*d16a9515SJunchao Zhang ierr = DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm);CHKERRQ(ierr); 64*d16a9515SJunchao Zhang ierr = DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm);CHKERRQ(ierr); 65*d16a9515SJunchao Zhang #define X2(k,j,i) x2[(k-gzs)*gym*gxm+(j-gys)*gxm+(i-gxs)] 66*d16a9515SJunchao Zhang #define Y2(k,j,i) y2[(k-zs)*ym*xm+(j-ys)*xm+(i-xs)] 67*d16a9515SJunchao Zhang for (it=0; it<nwarm+nloop; it++) { 68*d16a9515SJunchao Zhang if (it == nwarm) {ierr = PetscTime(&tstart);CHKERRQ(ierr);} 69*d16a9515SJunchao Zhang for (k=zs; k<zs+zm; k++) { 70*d16a9515SJunchao Zhang for (j=ys; j<ys+ym; j++) { 71*d16a9515SJunchao Zhang for (i=xs; i<xs+xm; i++) { 72*d16a9515SJunchao Zhang Y2(k,j,i) = 6*X2(k,j,i) - X2(k-1,j,i) - X2(k,j-1,i) - X2(k,j,i-1) 73*d16a9515SJunchao Zhang - X2(k+1,j,i) - X2(k,j+1,i) - X2(k,j,i+1); 74*d16a9515SJunchao Zhang } 75*d16a9515SJunchao Zhang } 76*d16a9515SJunchao Zhang } 77*d16a9515SJunchao Zhang } 78*d16a9515SJunchao Zhang ierr = PetscTime(&tend);CHKERRQ(ierr); 79*d16a9515SJunchao Zhang *avgTime = (tend - tstart)/nloop; 80*d16a9515SJunchao Zhang #undef X2 81*d16a9515SJunchao Zhang #undef Y2 82*d16a9515SJunchao Zhang PetscFunctionReturn(ierr); 83*d16a9515SJunchao Zhang } 84*d16a9515SJunchao Zhang 85*d16a9515SJunchao Zhang int main(int argc,char **argv) 86*d16a9515SJunchao Zhang { 87*d16a9515SJunchao Zhang PetscErrorCode ierr; 88*d16a9515SJunchao Zhang DM da; 89*d16a9515SJunchao Zhang PetscInt xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs; 90*d16a9515SJunchao Zhang PetscInt dof = 1,sw = 1; 91*d16a9515SJunchao Zhang DMBoundaryType bx = DM_BOUNDARY_PERIODIC,by = DM_BOUNDARY_PERIODIC,bz = DM_BOUNDARY_PERIODIC; 92*d16a9515SJunchao Zhang DMDAStencilType st = DMDA_STENCIL_STAR; 93*d16a9515SJunchao Zhang Vec x,y; /* local/global vectors of the da */ 94*d16a9515SJunchao Zhang PetscRandom rctx; 95*d16a9515SJunchao Zhang const PetscScalar ***x1; 96*d16a9515SJunchao Zhang PetscScalar ***y1; /* arrays of g, ll */ 97*d16a9515SJunchao Zhang const PetscScalar *x2; 98*d16a9515SJunchao Zhang PetscScalar *y2; 99*d16a9515SJunchao Zhang ConstPetscScalarKokkosOffsetView3D x3; 100*d16a9515SJunchao Zhang PetscScalarKokkosOffsetView3D y3; 101*d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0,tend = 0.0,avgTime = 0.0; 102*d16a9515SJunchao Zhang PetscInt nwarm = 2, nloop = 10; 103*d16a9515SJunchao Zhang PetscInt min = 32, max = 32*8; /* min and max sizes of the grids to sample */ 104*d16a9515SJunchao Zhang 105*d16a9515SJunchao Zhang ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr; 106*d16a9515SJunchao Zhang ierr = PetscRandomCreate(PETSC_COMM_WORLD,&rctx);CHKERRQ(ierr); 107*d16a9515SJunchao Zhang ierr = PetscOptionsGetInt(NULL,NULL,"-min",&min,NULL);CHKERRQ(ierr); 108*d16a9515SJunchao Zhang ierr = PetscOptionsGetInt(NULL,NULL,"-max",&max,NULL);CHKERRQ(ierr); 109*d16a9515SJunchao Zhang 110*d16a9515SJunchao Zhang for (PetscInt len=min; len<=max; len=len*2) { 111*d16a9515SJunchao Zhang ierr = DMDACreate3d(PETSC_COMM_WORLD,bx,by,bz,st,len,len,len,PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE,dof,sw,0,0,0,&da);CHKERRQ(ierr); 112*d16a9515SJunchao Zhang ierr = DMSetFromOptions(da);CHKERRQ(ierr); 113*d16a9515SJunchao Zhang ierr = DMSetUp(da);CHKERRQ(ierr); 114*d16a9515SJunchao Zhang 115*d16a9515SJunchao Zhang ierr = DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm);CHKERRQ(ierr); 116*d16a9515SJunchao Zhang ierr = DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm);CHKERRQ(ierr); 117*d16a9515SJunchao Zhang ierr = DMCreateLocalVector(da,&x);CHKERRQ(ierr); /* Create local x and global y */ 118*d16a9515SJunchao Zhang ierr = DMCreateGlobalVector(da,&y);CHKERRQ(ierr); 119*d16a9515SJunchao Zhang 120*d16a9515SJunchao Zhang /* Access with petsc multi-dimensional arrays */ 121*d16a9515SJunchao Zhang ierr = VecSetRandom(x,rctx);CHKERRQ(ierr); 122*d16a9515SJunchao Zhang ierr = VecSet(y,0.0);CHKERRQ(ierr); 123*d16a9515SJunchao Zhang ierr = DMDAVecGetArrayRead(da,x,&x1);CHKERRQ(ierr); 124*d16a9515SJunchao Zhang ierr = DMDAVecGetArray(da,y,&y1);CHKERRQ(ierr); 125*d16a9515SJunchao Zhang ierr = Update1(da,x1,y1,nwarm,nloop,&avgTime);CHKERRQ(ierr); 126*d16a9515SJunchao Zhang ierr = DMDAVecRestoreArrayRead(da,x,&x1);CHKERRQ(ierr); 127*d16a9515SJunchao Zhang ierr = DMDAVecRestoreArray(da,y,&y1);CHKERRQ(ierr); 128*d16a9515SJunchao Zhang ierr = PetscTime(&tend);CHKERRQ(ierr); 129*d16a9515SJunchao Zhang ierr = PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- PETSc average time = %e\n",len,avgTime);CHKERRQ(ierr); 130*d16a9515SJunchao Zhang 131*d16a9515SJunchao Zhang /* Access with C multi-dimensional arrays */ 132*d16a9515SJunchao Zhang ierr = VecSetRandom(x,rctx);CHKERRQ(ierr); 133*d16a9515SJunchao Zhang ierr = VecSet(y,0.0);CHKERRQ(ierr); 134*d16a9515SJunchao Zhang ierr = VecGetArrayRead(x,&x2);CHKERRQ(ierr); 135*d16a9515SJunchao Zhang ierr = VecGetArray(y,&y2);CHKERRQ(ierr); 136*d16a9515SJunchao Zhang ierr = Update2(da,x2,y2,nwarm,nloop,&avgTime);CHKERRQ(ierr); 137*d16a9515SJunchao Zhang ierr = VecRestoreArrayRead(x,&x2);CHKERRQ(ierr); 138*d16a9515SJunchao Zhang ierr = VecRestoreArray(y,&y2);CHKERRQ(ierr); 139*d16a9515SJunchao Zhang ierr = PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- C average time = %e\n",len,avgTime);CHKERRQ(ierr); 140*d16a9515SJunchao Zhang 141*d16a9515SJunchao Zhang /* Access with Kokkos multi-dimensional OffsetViews */ 142*d16a9515SJunchao Zhang ierr = VecSet(y,0.0);CHKERRQ(ierr); 143*d16a9515SJunchao Zhang ierr = VecSetRandom(x,rctx);CHKERRQ(ierr); 144*d16a9515SJunchao Zhang ierr = DMDAVecGetKokkosOffsetView(da,x,&x3);CHKERRQ(ierr); 145*d16a9515SJunchao Zhang ierr = DMDAVecGetKokkosOffsetView(da,y,&y3);CHKERRQ(ierr); 146*d16a9515SJunchao Zhang 147*d16a9515SJunchao Zhang for (PetscInt it=0; it<nwarm+nloop; it++) { 148*d16a9515SJunchao Zhang if (it == nwarm) {ierr = PetscTime(&tstart);CHKERRQ(ierr);} 149*d16a9515SJunchao Zhang Kokkos::parallel_for("stencil",MDRangePolicy<Kokkos::DefaultHostExecutionSpace,Rank<3,Iterate::Right,Iterate::Right>>({zs,ys,xs},{zs+zm,ys+ym,xs+xm}), 150*d16a9515SJunchao Zhang KOKKOS_LAMBDA(PetscInt k,PetscInt j,PetscInt i) { 151*d16a9515SJunchao Zhang y3(k,j,i) = 6*x3(k,j,i) - x3(k-1,j,i) - x3(k,j-1,i) - x3(k,j,i-1) 152*d16a9515SJunchao Zhang - x3(k+1,j,i) - x3(k,j+1,i) - x3(k,j,i+1); 153*d16a9515SJunchao Zhang }); 154*d16a9515SJunchao Zhang } 155*d16a9515SJunchao Zhang ierr = PetscTime(&tend);CHKERRQ(ierr); 156*d16a9515SJunchao Zhang ierr = DMDAVecRestoreKokkosOffsetView(da,x,&x3);CHKERRQ(ierr); 157*d16a9515SJunchao Zhang ierr = DMDAVecRestoreKokkosOffsetView(da,y,&y3);CHKERRQ(ierr); 158*d16a9515SJunchao Zhang avgTime = (tend - tstart)/nloop; 159*d16a9515SJunchao Zhang ierr = PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- Kokkos average time = %e\n",len,avgTime);CHKERRQ(ierr); 160*d16a9515SJunchao Zhang 161*d16a9515SJunchao Zhang ierr = VecDestroy(&x);CHKERRQ(ierr); 162*d16a9515SJunchao Zhang ierr = VecDestroy(&y);CHKERRQ(ierr); 163*d16a9515SJunchao Zhang ierr = DMDestroy(&da);CHKERRQ(ierr); 164*d16a9515SJunchao Zhang } 165*d16a9515SJunchao Zhang ierr = PetscRandomDestroy(&rctx);CHKERRQ(ierr); 166*d16a9515SJunchao Zhang ierr = PetscFinalize(); 167*d16a9515SJunchao Zhang return ierr; 168*d16a9515SJunchao Zhang } 169*d16a9515SJunchao Zhang 170*d16a9515SJunchao Zhang /*TEST 171*d16a9515SJunchao Zhang build: 172*d16a9515SJunchao Zhang requires: kokkos_kernels 173*d16a9515SJunchao Zhang 174*d16a9515SJunchao Zhang test: 175*d16a9515SJunchao Zhang suffix: 1 176*d16a9515SJunchao Zhang requires: kokkos_kernels 177*d16a9515SJunchao Zhang args: -min 32 -max 32 -dm_vec_type kokkos 178*d16a9515SJunchao Zhang filter: grep -v "time" 179*d16a9515SJunchao Zhang 180*d16a9515SJunchao Zhang TEST*/ 181