xref: /petsc/src/dm/tests/ex2k.kokkos.cxx (revision d16a9515106478b7b2e0795e6c1f226f55ebdc51)
1*d16a9515SJunchao Zhang static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";
2*d16a9515SJunchao Zhang 
3*d16a9515SJunchao Zhang /*
4*d16a9515SJunchao Zhang   On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
5*d16a9515SJunchao Zhang            Time (sec.), on Mar. 1, 2022
6*d16a9515SJunchao Zhang   ------------------------------------------
7*d16a9515SJunchao Zhang   n     PETSc          C          Kokkos
8*d16a9515SJunchao Zhang   ------------------------------------------
9*d16a9515SJunchao Zhang   32    4.6464E-05  4.7451E-05  1.6880E-04
10*d16a9515SJunchao Zhang   64    2.5654E-04  2.5164E-04  5.6780E-04
11*d16a9515SJunchao Zhang   128   1.9383E-03  1.8878E-03  4.7938E-03
12*d16a9515SJunchao Zhang   256   1.4450E-02  1.3619E-02  3.7778E-02
13*d16a9515SJunchao Zhang   512   1.1580E-01  1.1551E-01  2.8428E-01
14*d16a9515SJunchao Zhang   1024  1.4179E+00  1.3772E+00  2.2873E+00
15*d16a9515SJunchao Zhang 
16*d16a9515SJunchao Zhang   Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc
17*d16a9515SJunchao Zhang */
18*d16a9515SJunchao Zhang 
19*d16a9515SJunchao Zhang #include <petscdmda_kokkos.hpp>
20*d16a9515SJunchao Zhang #include <petscdm.h>
21*d16a9515SJunchao Zhang #include <petscdmda.h>
22*d16a9515SJunchao Zhang 
23*d16a9515SJunchao Zhang using namespace Kokkos;
24*d16a9515SJunchao Zhang using PetscScalarKokkosOffsetView3D      = Kokkos::Experimental::OffsetView<PetscScalar***,Kokkos::LayoutRight,Kokkos::HostSpace>;
25*d16a9515SJunchao Zhang using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar***, Kokkos::LayoutRight,Kokkos::HostSpace>;
26*d16a9515SJunchao Zhang 
27*d16a9515SJunchao Zhang /* PETSc multi-dimensional array access */
28*d16a9515SJunchao Zhang static PetscErrorCode Update1(DM da,const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime)
29*d16a9515SJunchao Zhang {
30*d16a9515SJunchao Zhang   PetscErrorCode    ierr;
31*d16a9515SJunchao Zhang   PetscInt          it,i,j,k;
32*d16a9515SJunchao Zhang   PetscLogDouble    tstart=0.0,tend;
33*d16a9515SJunchao Zhang   PetscInt          xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs;
34*d16a9515SJunchao Zhang 
35*d16a9515SJunchao Zhang   PetscFunctionBegin;
36*d16a9515SJunchao Zhang   ierr = DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm);CHKERRQ(ierr);
37*d16a9515SJunchao Zhang   ierr = DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm);CHKERRQ(ierr);
38*d16a9515SJunchao Zhang   for (it=0; it<nwarm+nloop; it++) {
39*d16a9515SJunchao Zhang     if (it == nwarm) {ierr = PetscTime(&tstart);CHKERRQ(ierr);}
40*d16a9515SJunchao Zhang     for (k=zs; k<zs+zm; k++) {
41*d16a9515SJunchao Zhang       for (j=ys; j<ys+ym; j++) {
42*d16a9515SJunchao Zhang         for (i=xs; i<xs+xm; i++) {
43*d16a9515SJunchao Zhang           y1[k][j][i] = 6*x1[k][j][i] - x1[k-1][j][i] - x1[k][j-1][i] - x1[k][j][i-1]
44*d16a9515SJunchao Zhang                                       - x1[k+1][j][i] - x1[k][j+1][i] - x1[k][j][i+1];
45*d16a9515SJunchao Zhang         }
46*d16a9515SJunchao Zhang       }
47*d16a9515SJunchao Zhang     }
48*d16a9515SJunchao Zhang   }
49*d16a9515SJunchao Zhang   ierr    = PetscTime(&tend);CHKERRQ(ierr);
50*d16a9515SJunchao Zhang   *avgTime = (tend - tstart)/nloop;
51*d16a9515SJunchao Zhang   PetscFunctionReturn(ierr);
52*d16a9515SJunchao Zhang }
53*d16a9515SJunchao Zhang 
54*d16a9515SJunchao Zhang /* C multi-dimensional array access */
55*d16a9515SJunchao Zhang static PetscErrorCode Update2(DM da,const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime)
56*d16a9515SJunchao Zhang {
57*d16a9515SJunchao Zhang   PetscErrorCode    ierr;
58*d16a9515SJunchao Zhang   PetscInt          it,i,j,k;
59*d16a9515SJunchao Zhang   PetscLogDouble    tstart=0.0,tend;
60*d16a9515SJunchao Zhang   PetscInt          xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs;
61*d16a9515SJunchao Zhang 
62*d16a9515SJunchao Zhang   PetscFunctionBegin;
63*d16a9515SJunchao Zhang   ierr = DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm);CHKERRQ(ierr);
64*d16a9515SJunchao Zhang   ierr = DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm);CHKERRQ(ierr);
65*d16a9515SJunchao Zhang #define X2(k,j,i) x2[(k-gzs)*gym*gxm+(j-gys)*gxm+(i-gxs)]
66*d16a9515SJunchao Zhang #define Y2(k,j,i) y2[(k-zs)*ym*xm+(j-ys)*xm+(i-xs)]
67*d16a9515SJunchao Zhang   for (it=0; it<nwarm+nloop; it++) {
68*d16a9515SJunchao Zhang     if (it == nwarm) {ierr = PetscTime(&tstart);CHKERRQ(ierr);}
69*d16a9515SJunchao Zhang     for (k=zs; k<zs+zm; k++) {
70*d16a9515SJunchao Zhang       for (j=ys; j<ys+ym; j++) {
71*d16a9515SJunchao Zhang         for (i=xs; i<xs+xm; i++) {
72*d16a9515SJunchao Zhang           Y2(k,j,i) = 6*X2(k,j,i) - X2(k-1,j,i) - X2(k,j-1,i) - X2(k,j,i-1)
73*d16a9515SJunchao Zhang                                   - X2(k+1,j,i) - X2(k,j+1,i) - X2(k,j,i+1);
74*d16a9515SJunchao Zhang         }
75*d16a9515SJunchao Zhang       }
76*d16a9515SJunchao Zhang     }
77*d16a9515SJunchao Zhang   }
78*d16a9515SJunchao Zhang   ierr    = PetscTime(&tend);CHKERRQ(ierr);
79*d16a9515SJunchao Zhang   *avgTime = (tend - tstart)/nloop;
80*d16a9515SJunchao Zhang #undef X2
81*d16a9515SJunchao Zhang #undef Y2
82*d16a9515SJunchao Zhang   PetscFunctionReturn(ierr);
83*d16a9515SJunchao Zhang }
84*d16a9515SJunchao Zhang 
85*d16a9515SJunchao Zhang int main(int argc,char **argv)
86*d16a9515SJunchao Zhang {
87*d16a9515SJunchao Zhang   PetscErrorCode                       ierr;
88*d16a9515SJunchao Zhang   DM                                   da;
89*d16a9515SJunchao Zhang   PetscInt                             xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs;
90*d16a9515SJunchao Zhang   PetscInt                             dof = 1,sw = 1;
91*d16a9515SJunchao Zhang   DMBoundaryType                       bx = DM_BOUNDARY_PERIODIC,by = DM_BOUNDARY_PERIODIC,bz = DM_BOUNDARY_PERIODIC;
92*d16a9515SJunchao Zhang   DMDAStencilType                      st = DMDA_STENCIL_STAR;
93*d16a9515SJunchao Zhang   Vec                                  x,y; /* local/global vectors of the da */
94*d16a9515SJunchao Zhang   PetscRandom                          rctx;
95*d16a9515SJunchao Zhang   const PetscScalar                    ***x1;
96*d16a9515SJunchao Zhang   PetscScalar                          ***y1; /* arrays of g, ll */
97*d16a9515SJunchao Zhang   const PetscScalar                    *x2;
98*d16a9515SJunchao Zhang   PetscScalar                          *y2;
99*d16a9515SJunchao Zhang   ConstPetscScalarKokkosOffsetView3D   x3;
100*d16a9515SJunchao Zhang   PetscScalarKokkosOffsetView3D        y3;
101*d16a9515SJunchao Zhang   PetscLogDouble                       tstart = 0.0,tend = 0.0,avgTime = 0.0;
102*d16a9515SJunchao Zhang   PetscInt                             nwarm = 2, nloop = 10;
103*d16a9515SJunchao Zhang   PetscInt                             min = 32, max = 32*8; /* min and max sizes of the grids to sample */
104*d16a9515SJunchao Zhang 
105*d16a9515SJunchao Zhang   ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
106*d16a9515SJunchao Zhang   ierr = PetscRandomCreate(PETSC_COMM_WORLD,&rctx);CHKERRQ(ierr);
107*d16a9515SJunchao Zhang   ierr = PetscOptionsGetInt(NULL,NULL,"-min",&min,NULL);CHKERRQ(ierr);
108*d16a9515SJunchao Zhang   ierr = PetscOptionsGetInt(NULL,NULL,"-max",&max,NULL);CHKERRQ(ierr);
109*d16a9515SJunchao Zhang 
110*d16a9515SJunchao Zhang   for (PetscInt len=min; len<=max; len=len*2) {
111*d16a9515SJunchao Zhang     ierr = DMDACreate3d(PETSC_COMM_WORLD,bx,by,bz,st,len,len,len,PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE,dof,sw,0,0,0,&da);CHKERRQ(ierr);
112*d16a9515SJunchao Zhang     ierr = DMSetFromOptions(da);CHKERRQ(ierr);
113*d16a9515SJunchao Zhang     ierr = DMSetUp(da);CHKERRQ(ierr);
114*d16a9515SJunchao Zhang 
115*d16a9515SJunchao Zhang     ierr = DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm);CHKERRQ(ierr);
116*d16a9515SJunchao Zhang     ierr = DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm);CHKERRQ(ierr);
117*d16a9515SJunchao Zhang     ierr = DMCreateLocalVector(da,&x);CHKERRQ(ierr); /* Create local x and global y */
118*d16a9515SJunchao Zhang     ierr = DMCreateGlobalVector(da,&y);CHKERRQ(ierr);
119*d16a9515SJunchao Zhang 
120*d16a9515SJunchao Zhang     /* Access with petsc multi-dimensional arrays */
121*d16a9515SJunchao Zhang     ierr = VecSetRandom(x,rctx);CHKERRQ(ierr);
122*d16a9515SJunchao Zhang     ierr = VecSet(y,0.0);CHKERRQ(ierr);
123*d16a9515SJunchao Zhang     ierr = DMDAVecGetArrayRead(da,x,&x1);CHKERRQ(ierr);
124*d16a9515SJunchao Zhang     ierr = DMDAVecGetArray(da,y,&y1);CHKERRQ(ierr);
125*d16a9515SJunchao Zhang     ierr = Update1(da,x1,y1,nwarm,nloop,&avgTime);CHKERRQ(ierr);
126*d16a9515SJunchao Zhang     ierr = DMDAVecRestoreArrayRead(da,x,&x1);CHKERRQ(ierr);
127*d16a9515SJunchao Zhang     ierr = DMDAVecRestoreArray(da,y,&y1);CHKERRQ(ierr);
128*d16a9515SJunchao Zhang     ierr = PetscTime(&tend);CHKERRQ(ierr);
129*d16a9515SJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- PETSc average time  = %e\n",len,avgTime);CHKERRQ(ierr);
130*d16a9515SJunchao Zhang 
131*d16a9515SJunchao Zhang     /* Access with C multi-dimensional arrays */
132*d16a9515SJunchao Zhang     ierr = VecSetRandom(x,rctx);CHKERRQ(ierr);
133*d16a9515SJunchao Zhang     ierr = VecSet(y,0.0);CHKERRQ(ierr);
134*d16a9515SJunchao Zhang     ierr = VecGetArrayRead(x,&x2);CHKERRQ(ierr);
135*d16a9515SJunchao Zhang     ierr = VecGetArray(y,&y2);CHKERRQ(ierr);
136*d16a9515SJunchao Zhang     ierr = Update2(da,x2,y2,nwarm,nloop,&avgTime);CHKERRQ(ierr);
137*d16a9515SJunchao Zhang     ierr = VecRestoreArrayRead(x,&x2);CHKERRQ(ierr);
138*d16a9515SJunchao Zhang     ierr = VecRestoreArray(y,&y2);CHKERRQ(ierr);
139*d16a9515SJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- C average time      = %e\n",len,avgTime);CHKERRQ(ierr);
140*d16a9515SJunchao Zhang 
141*d16a9515SJunchao Zhang     /* Access with Kokkos multi-dimensional OffsetViews */
142*d16a9515SJunchao Zhang     ierr = VecSet(y,0.0);CHKERRQ(ierr);
143*d16a9515SJunchao Zhang     ierr = VecSetRandom(x,rctx);CHKERRQ(ierr);
144*d16a9515SJunchao Zhang     ierr = DMDAVecGetKokkosOffsetView(da,x,&x3);CHKERRQ(ierr);
145*d16a9515SJunchao Zhang     ierr = DMDAVecGetKokkosOffsetView(da,y,&y3);CHKERRQ(ierr);
146*d16a9515SJunchao Zhang 
147*d16a9515SJunchao Zhang     for (PetscInt it=0; it<nwarm+nloop; it++) {
148*d16a9515SJunchao Zhang       if (it == nwarm) {ierr = PetscTime(&tstart);CHKERRQ(ierr);}
149*d16a9515SJunchao Zhang       Kokkos::parallel_for("stencil",MDRangePolicy<Kokkos::DefaultHostExecutionSpace,Rank<3,Iterate::Right,Iterate::Right>>({zs,ys,xs},{zs+zm,ys+ym,xs+xm}),
150*d16a9515SJunchao Zhang         KOKKOS_LAMBDA(PetscInt k,PetscInt j,PetscInt i) {
151*d16a9515SJunchao Zhang         y3(k,j,i) = 6*x3(k,j,i) - x3(k-1,j,i) - x3(k,j-1,i) - x3(k,j,i-1)
152*d16a9515SJunchao Zhang                                 - x3(k+1,j,i) - x3(k,j+1,i) - x3(k,j,i+1);
153*d16a9515SJunchao Zhang       });
154*d16a9515SJunchao Zhang     }
155*d16a9515SJunchao Zhang     ierr = PetscTime(&tend);CHKERRQ(ierr);
156*d16a9515SJunchao Zhang     ierr = DMDAVecRestoreKokkosOffsetView(da,x,&x3);CHKERRQ(ierr);
157*d16a9515SJunchao Zhang     ierr = DMDAVecRestoreKokkosOffsetView(da,y,&y3);CHKERRQ(ierr);
158*d16a9515SJunchao Zhang     avgTime = (tend - tstart)/nloop;
159*d16a9515SJunchao Zhang     ierr = PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- Kokkos average time = %e\n",len,avgTime);CHKERRQ(ierr);
160*d16a9515SJunchao Zhang 
161*d16a9515SJunchao Zhang     ierr = VecDestroy(&x);CHKERRQ(ierr);
162*d16a9515SJunchao Zhang     ierr = VecDestroy(&y);CHKERRQ(ierr);
163*d16a9515SJunchao Zhang     ierr = DMDestroy(&da);CHKERRQ(ierr);
164*d16a9515SJunchao Zhang   }
165*d16a9515SJunchao Zhang   ierr = PetscRandomDestroy(&rctx);CHKERRQ(ierr);
166*d16a9515SJunchao Zhang   ierr = PetscFinalize();
167*d16a9515SJunchao Zhang   return ierr;
168*d16a9515SJunchao Zhang }
169*d16a9515SJunchao Zhang 
170*d16a9515SJunchao Zhang /*TEST
171*d16a9515SJunchao Zhang   build:
172*d16a9515SJunchao Zhang     requires: kokkos_kernels
173*d16a9515SJunchao Zhang 
174*d16a9515SJunchao Zhang   test:
175*d16a9515SJunchao Zhang     suffix: 1
176*d16a9515SJunchao Zhang     requires: kokkos_kernels
177*d16a9515SJunchao Zhang     args: -min 32 -max 32 -dm_vec_type kokkos
178*d16a9515SJunchao Zhang     filter: grep -v "time"
179*d16a9515SJunchao Zhang 
180*d16a9515SJunchao Zhang TEST*/
181