1 static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n"; 2 3 /* 4 On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos: 5 Time (sec.), on Mar. 1, 2022 6 ------------------------------------------ 7 n PETSc C Kokkos 8 ------------------------------------------ 9 32 4.6464E-05 4.7451E-05 1.6880E-04 10 64 2.5654E-04 2.5164E-04 5.6780E-04 11 128 1.9383E-03 1.8878E-03 4.7938E-03 12 256 1.4450E-02 1.3619E-02 3.7778E-02 13 512 1.1580E-01 1.1551E-01 2.8428E-01 14 1024 1.4179E+00 1.3772E+00 2.2873E+00 15 16 Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc 17 */ 18 19 #include <petscdmda_kokkos.hpp> 20 #include <petscdm.h> 21 #include <petscdmda.h> 22 23 using namespace Kokkos; 24 using PetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<PetscScalar***,Kokkos::LayoutRight,Kokkos::HostSpace>; 25 using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar***, Kokkos::LayoutRight,Kokkos::HostSpace>; 26 27 /* PETSc multi-dimensional array access */ 28 static PetscErrorCode Update1(DM da,const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime) 29 { 30 PetscInt it,i,j,k; 31 PetscLogDouble tstart = 0.0,tend; 32 PetscInt xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs; 33 34 PetscFunctionBegin; 35 CHKERRQ(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm)); 36 CHKERRQ(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm)); 37 for (it=0; it<nwarm+nloop; it++) { 38 if (it == nwarm) CHKERRQ(PetscTime(&tstart)); 39 for (k=zs; k<zs+zm; k++) { 40 for (j=ys; j<ys+ym; j++) { 41 for (i=xs; i<xs+xm; i++) { 42 y1[k][j][i] = 6*x1[k][j][i] - x1[k-1][j][i] - x1[k][j-1][i] - x1[k][j][i-1] 43 - x1[k+1][j][i] - x1[k][j+1][i] - x1[k][j][i+1]; 44 } 45 } 46 } 47 } 48 CHKERRQ(PetscTime(&tend)); 49 *avgTime = (tend - tstart)/nloop; 50 PetscFunctionReturn(0); 51 } 52 53 /* C multi-dimensional array access */ 54 static PetscErrorCode Update2(DM da,const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime) 55 { 56 PetscInt it,i,j,k; 57 PetscLogDouble tstart = 0.0,tend; 58 PetscInt xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs; 59 60 PetscFunctionBegin; 61 CHKERRQ(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm)); 62 CHKERRQ(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm)); 63 #define X2(k,j,i) x2[(k-gzs)*gym*gxm+(j-gys)*gxm+(i-gxs)] 64 #define Y2(k,j,i) y2[(k-zs)*ym*xm+(j-ys)*xm+(i-xs)] 65 for (it=0; it<nwarm+nloop; it++) { 66 if (it == nwarm) CHKERRQ(PetscTime(&tstart)); 67 for (k=zs; k<zs+zm; k++) { 68 for (j=ys; j<ys+ym; j++) { 69 for (i=xs; i<xs+xm; i++) { 70 Y2(k,j,i) = 6*X2(k,j,i) - X2(k-1,j,i) - X2(k,j-1,i) - X2(k,j,i-1) 71 - X2(k+1,j,i) - X2(k,j+1,i) - X2(k,j,i+1); 72 } 73 } 74 } 75 } 76 CHKERRQ(PetscTime(&tend)); 77 *avgTime = (tend - tstart)/nloop; 78 #undef X2 79 #undef Y2 80 PetscFunctionReturn(0); 81 } 82 83 int main(int argc,char **argv) 84 { 85 PetscErrorCode ierr; 86 DM da; 87 PetscInt xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs; 88 PetscInt dof = 1,sw = 1; 89 DMBoundaryType bx = DM_BOUNDARY_PERIODIC,by = DM_BOUNDARY_PERIODIC,bz = DM_BOUNDARY_PERIODIC; 90 DMDAStencilType st = DMDA_STENCIL_STAR; 91 Vec x,y; /* local/global vectors of the da */ 92 PetscRandom rctx; 93 const PetscScalar ***x1; 94 PetscScalar ***y1; /* arrays of g, ll */ 95 const PetscScalar *x2; 96 PetscScalar *y2; 97 ConstPetscScalarKokkosOffsetView3D x3; 98 PetscScalarKokkosOffsetView3D y3; 99 PetscLogDouble tstart = 0.0,tend = 0.0,avgTime = 0.0; 100 PetscInt nwarm = 2, nloop = 10; 101 PetscInt min = 32, max = 32*8; /* min and max sizes of the grids to sample */ 102 103 ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr; 104 CHKERRQ(PetscRandomCreate(PETSC_COMM_WORLD,&rctx)); 105 CHKERRQ(PetscOptionsGetInt(NULL,NULL,"-min",&min,NULL)); 106 CHKERRQ(PetscOptionsGetInt(NULL,NULL,"-max",&max,NULL)); 107 108 for (PetscInt len=min; len<=max; len=len*2) { 109 CHKERRQ(DMDACreate3d(PETSC_COMM_WORLD,bx,by,bz,st,len,len,len,PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE,dof,sw,0,0,0,&da)); 110 CHKERRQ(DMSetFromOptions(da)); 111 CHKERRQ(DMSetUp(da)); 112 113 CHKERRQ(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm)); 114 CHKERRQ(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm)); 115 CHKERRQ(DMCreateLocalVector(da,&x)); /* Create local x and global y */ 116 CHKERRQ(DMCreateGlobalVector(da,&y)); 117 118 /* Access with petsc multi-dimensional arrays */ 119 CHKERRQ(VecSetRandom(x,rctx)); 120 CHKERRQ(VecSet(y,0.0)); 121 CHKERRQ(DMDAVecGetArrayRead(da,x,&x1)); 122 CHKERRQ(DMDAVecGetArray(da,y,&y1)); 123 CHKERRQ(Update1(da,x1,y1,nwarm,nloop,&avgTime)); 124 CHKERRQ(DMDAVecRestoreArrayRead(da,x,&x1)); 125 CHKERRQ(DMDAVecRestoreArray(da,y,&y1)); 126 CHKERRQ(PetscTime(&tend)); 127 CHKERRQ(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- PETSc average time = %e\n",len,avgTime)); 128 129 /* Access with C multi-dimensional arrays */ 130 CHKERRQ(VecSetRandom(x,rctx)); 131 CHKERRQ(VecSet(y,0.0)); 132 CHKERRQ(VecGetArrayRead(x,&x2)); 133 CHKERRQ(VecGetArray(y,&y2)); 134 CHKERRQ(Update2(da,x2,y2,nwarm,nloop,&avgTime)); 135 CHKERRQ(VecRestoreArrayRead(x,&x2)); 136 CHKERRQ(VecRestoreArray(y,&y2)); 137 CHKERRQ(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- C average time = %e\n",len,avgTime)); 138 139 /* Access with Kokkos multi-dimensional OffsetViews */ 140 CHKERRQ(VecSet(y,0.0)); 141 CHKERRQ(VecSetRandom(x,rctx)); 142 CHKERRQ(DMDAVecGetKokkosOffsetView(da,x,&x3)); 143 CHKERRQ(DMDAVecGetKokkosOffsetView(da,y,&y3)); 144 145 for (PetscInt it=0; it<nwarm+nloop; it++) { 146 if (it == nwarm) CHKERRQ(PetscTime(&tstart)); 147 Kokkos::parallel_for("stencil",MDRangePolicy<Kokkos::DefaultHostExecutionSpace,Rank<3,Iterate::Right,Iterate::Right>>({zs,ys,xs},{zs+zm,ys+ym,xs+xm}), 148 KOKKOS_LAMBDA(PetscInt k,PetscInt j,PetscInt i) { 149 y3(k,j,i) = 6*x3(k,j,i) - x3(k-1,j,i) - x3(k,j-1,i) - x3(k,j,i-1) 150 - x3(k+1,j,i) - x3(k,j+1,i) - x3(k,j,i+1); 151 }); 152 } 153 CHKERRQ(PetscTime(&tend)); 154 CHKERRQ(DMDAVecRestoreKokkosOffsetView(da,x,&x3)); 155 CHKERRQ(DMDAVecRestoreKokkosOffsetView(da,y,&y3)); 156 avgTime = (tend - tstart)/nloop; 157 CHKERRQ(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- Kokkos average time = %e\n",len,avgTime)); 158 159 CHKERRQ(VecDestroy(&x)); 160 CHKERRQ(VecDestroy(&y)); 161 CHKERRQ(DMDestroy(&da)); 162 } 163 CHKERRQ(PetscRandomDestroy(&rctx)); 164 ierr = PetscFinalize(); 165 return ierr; 166 } 167 168 /*TEST 169 build: 170 requires: kokkos_kernels 171 172 test: 173 suffix: 1 174 requires: kokkos_kernels 175 args: -min 32 -max 32 -dm_vec_type kokkos 176 filter: grep -v "time" 177 178 TEST*/ 179