xref: /petsc/src/dm/tests/ex2k.kokkos.cxx (revision 5f80ce2ab25dff0f4601e710601cbbcecf323266)
1d16a9515SJunchao Zhang static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";
2d16a9515SJunchao Zhang 
3d16a9515SJunchao Zhang /*
4d16a9515SJunchao Zhang   On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
5d16a9515SJunchao Zhang            Time (sec.), on Mar. 1, 2022
6d16a9515SJunchao Zhang   ------------------------------------------
7d16a9515SJunchao Zhang   n     PETSc          C          Kokkos
8d16a9515SJunchao Zhang   ------------------------------------------
9d16a9515SJunchao Zhang   32    4.6464E-05  4.7451E-05  1.6880E-04
10d16a9515SJunchao Zhang   64    2.5654E-04  2.5164E-04  5.6780E-04
11d16a9515SJunchao Zhang   128   1.9383E-03  1.8878E-03  4.7938E-03
12d16a9515SJunchao Zhang   256   1.4450E-02  1.3619E-02  3.7778E-02
13d16a9515SJunchao Zhang   512   1.1580E-01  1.1551E-01  2.8428E-01
14d16a9515SJunchao Zhang   1024  1.4179E+00  1.3772E+00  2.2873E+00
15d16a9515SJunchao Zhang 
16d16a9515SJunchao Zhang   Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc
17d16a9515SJunchao Zhang */
18d16a9515SJunchao Zhang 
19d16a9515SJunchao Zhang #include <petscdmda_kokkos.hpp>
20d16a9515SJunchao Zhang #include <petscdm.h>
21d16a9515SJunchao Zhang #include <petscdmda.h>
22d16a9515SJunchao Zhang 
23d16a9515SJunchao Zhang using namespace Kokkos;
24d16a9515SJunchao Zhang using PetscScalarKokkosOffsetView3D      = Kokkos::Experimental::OffsetView<PetscScalar***,Kokkos::LayoutRight,Kokkos::HostSpace>;
25d16a9515SJunchao Zhang using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar***, Kokkos::LayoutRight,Kokkos::HostSpace>;
26d16a9515SJunchao Zhang 
27d16a9515SJunchao Zhang /* PETSc multi-dimensional array access */
28d16a9515SJunchao Zhang static PetscErrorCode Update1(DM da,const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime)
29d16a9515SJunchao Zhang {
30d16a9515SJunchao Zhang   PetscInt       it,i,j,k;
31d16a9515SJunchao Zhang   PetscLogDouble tstart = 0.0,tend;
32d16a9515SJunchao Zhang   PetscInt       xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs;
33d16a9515SJunchao Zhang 
34d16a9515SJunchao Zhang   PetscFunctionBegin;
35*5f80ce2aSJacob Faibussowitsch   CHKERRQ(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm));
36*5f80ce2aSJacob Faibussowitsch   CHKERRQ(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm));
37d16a9515SJunchao Zhang   for (it=0; it<nwarm+nloop; it++) {
38*5f80ce2aSJacob Faibussowitsch     if (it == nwarm) CHKERRQ(PetscTime(&tstart));
39d16a9515SJunchao Zhang     for (k=zs; k<zs+zm; k++) {
40d16a9515SJunchao Zhang       for (j=ys; j<ys+ym; j++) {
41d16a9515SJunchao Zhang         for (i=xs; i<xs+xm; i++) {
42d16a9515SJunchao Zhang           y1[k][j][i] = 6*x1[k][j][i] - x1[k-1][j][i] - x1[k][j-1][i] - x1[k][j][i-1]
43d16a9515SJunchao Zhang                                       - x1[k+1][j][i] - x1[k][j+1][i] - x1[k][j][i+1];
44d16a9515SJunchao Zhang         }
45d16a9515SJunchao Zhang       }
46d16a9515SJunchao Zhang     }
47d16a9515SJunchao Zhang   }
48*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTime(&tend));
49d16a9515SJunchao Zhang   *avgTime = (tend - tstart)/nloop;
50*5f80ce2aSJacob Faibussowitsch   PetscFunctionReturn(0);
51d16a9515SJunchao Zhang }
52d16a9515SJunchao Zhang 
53d16a9515SJunchao Zhang /* C multi-dimensional array access */
54d16a9515SJunchao Zhang static PetscErrorCode Update2(DM da,const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime)
55d16a9515SJunchao Zhang {
56d16a9515SJunchao Zhang   PetscInt       it,i,j,k;
57d16a9515SJunchao Zhang   PetscLogDouble tstart = 0.0,tend;
58d16a9515SJunchao Zhang   PetscInt       xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs;
59d16a9515SJunchao Zhang 
60d16a9515SJunchao Zhang   PetscFunctionBegin;
61*5f80ce2aSJacob Faibussowitsch   CHKERRQ(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm));
62*5f80ce2aSJacob Faibussowitsch   CHKERRQ(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm));
63d16a9515SJunchao Zhang #define X2(k,j,i) x2[(k-gzs)*gym*gxm+(j-gys)*gxm+(i-gxs)]
64d16a9515SJunchao Zhang #define Y2(k,j,i) y2[(k-zs)*ym*xm+(j-ys)*xm+(i-xs)]
65d16a9515SJunchao Zhang   for (it=0; it<nwarm+nloop; it++) {
66*5f80ce2aSJacob Faibussowitsch     if (it == nwarm) CHKERRQ(PetscTime(&tstart));
67d16a9515SJunchao Zhang     for (k=zs; k<zs+zm; k++) {
68d16a9515SJunchao Zhang       for (j=ys; j<ys+ym; j++) {
69d16a9515SJunchao Zhang         for (i=xs; i<xs+xm; i++) {
70d16a9515SJunchao Zhang           Y2(k,j,i) = 6*X2(k,j,i) - X2(k-1,j,i) - X2(k,j-1,i) - X2(k,j,i-1)
71d16a9515SJunchao Zhang                                   - X2(k+1,j,i) - X2(k,j+1,i) - X2(k,j,i+1);
72d16a9515SJunchao Zhang         }
73d16a9515SJunchao Zhang       }
74d16a9515SJunchao Zhang     }
75d16a9515SJunchao Zhang   }
76*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTime(&tend));
77d16a9515SJunchao Zhang   *avgTime = (tend - tstart)/nloop;
78d16a9515SJunchao Zhang #undef X2
79d16a9515SJunchao Zhang #undef Y2
80*5f80ce2aSJacob Faibussowitsch   PetscFunctionReturn(0);
81d16a9515SJunchao Zhang }
82d16a9515SJunchao Zhang 
83d16a9515SJunchao Zhang int main(int argc,char **argv)
84d16a9515SJunchao Zhang {
85d16a9515SJunchao Zhang   PetscErrorCode                       ierr;
86d16a9515SJunchao Zhang   DM                                   da;
87d16a9515SJunchao Zhang   PetscInt                             xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs;
88d16a9515SJunchao Zhang   PetscInt                             dof = 1,sw = 1;
89d16a9515SJunchao Zhang   DMBoundaryType                       bx = DM_BOUNDARY_PERIODIC,by = DM_BOUNDARY_PERIODIC,bz = DM_BOUNDARY_PERIODIC;
90d16a9515SJunchao Zhang   DMDAStencilType                      st = DMDA_STENCIL_STAR;
91d16a9515SJunchao Zhang   Vec                                  x,y; /* local/global vectors of the da */
92d16a9515SJunchao Zhang   PetscRandom                          rctx;
93d16a9515SJunchao Zhang   const PetscScalar                    ***x1;
94d16a9515SJunchao Zhang   PetscScalar                          ***y1; /* arrays of g, ll */
95d16a9515SJunchao Zhang   const PetscScalar                    *x2;
96d16a9515SJunchao Zhang   PetscScalar                          *y2;
97d16a9515SJunchao Zhang   ConstPetscScalarKokkosOffsetView3D   x3;
98d16a9515SJunchao Zhang   PetscScalarKokkosOffsetView3D        y3;
99d16a9515SJunchao Zhang   PetscLogDouble                       tstart = 0.0,tend = 0.0,avgTime = 0.0;
100d16a9515SJunchao Zhang   PetscInt                             nwarm = 2, nloop = 10;
101d16a9515SJunchao Zhang   PetscInt                             min = 32, max = 32*8; /* min and max sizes of the grids to sample */
102d16a9515SJunchao Zhang 
103d16a9515SJunchao Zhang   ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
104*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscRandomCreate(PETSC_COMM_WORLD,&rctx));
105*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsGetInt(NULL,NULL,"-min",&min,NULL));
106*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsGetInt(NULL,NULL,"-max",&max,NULL));
107d16a9515SJunchao Zhang 
108d16a9515SJunchao Zhang   for (PetscInt len=min; len<=max; len=len*2) {
109*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDACreate3d(PETSC_COMM_WORLD,bx,by,bz,st,len,len,len,PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE,dof,sw,0,0,0,&da));
110*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMSetFromOptions(da));
111*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMSetUp(da));
112d16a9515SJunchao Zhang 
113*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm));
114*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm));
115*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMCreateLocalVector(da,&x)); /* Create local x and global y */
116*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMCreateGlobalVector(da,&y));
117d16a9515SJunchao Zhang 
118d16a9515SJunchao Zhang     /* Access with petsc multi-dimensional arrays */
119*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecSetRandom(x,rctx));
120*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecSet(y,0.0));
121*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAVecGetArrayRead(da,x,&x1));
122*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAVecGetArray(da,y,&y1));
123*5f80ce2aSJacob Faibussowitsch     CHKERRQ(Update1(da,x1,y1,nwarm,nloop,&avgTime));
124*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAVecRestoreArrayRead(da,x,&x1));
125*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAVecRestoreArray(da,y,&y1));
126*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscTime(&tend));
127*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- PETSc average time  = %e\n",len,avgTime));
128d16a9515SJunchao Zhang 
129d16a9515SJunchao Zhang     /* Access with C multi-dimensional arrays */
130*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecSetRandom(x,rctx));
131*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecSet(y,0.0));
132*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecGetArrayRead(x,&x2));
133*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecGetArray(y,&y2));
134*5f80ce2aSJacob Faibussowitsch     CHKERRQ(Update2(da,x2,y2,nwarm,nloop,&avgTime));
135*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecRestoreArrayRead(x,&x2));
136*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecRestoreArray(y,&y2));
137*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- C average time      = %e\n",len,avgTime));
138d16a9515SJunchao Zhang 
139d16a9515SJunchao Zhang     /* Access with Kokkos multi-dimensional OffsetViews */
140*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecSet(y,0.0));
141*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecSetRandom(x,rctx));
142*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAVecGetKokkosOffsetView(da,x,&x3));
143*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAVecGetKokkosOffsetView(da,y,&y3));
144d16a9515SJunchao Zhang 
145d16a9515SJunchao Zhang     for (PetscInt it=0; it<nwarm+nloop; it++) {
146*5f80ce2aSJacob Faibussowitsch       if (it == nwarm) CHKERRQ(PetscTime(&tstart));
147d16a9515SJunchao Zhang       Kokkos::parallel_for("stencil",MDRangePolicy<Kokkos::DefaultHostExecutionSpace,Rank<3,Iterate::Right,Iterate::Right>>({zs,ys,xs},{zs+zm,ys+ym,xs+xm}),
148d16a9515SJunchao Zhang         KOKKOS_LAMBDA(PetscInt k,PetscInt j,PetscInt i) {
149d16a9515SJunchao Zhang         y3(k,j,i) = 6*x3(k,j,i) - x3(k-1,j,i) - x3(k,j-1,i) - x3(k,j,i-1)
150d16a9515SJunchao Zhang                                 - x3(k+1,j,i) - x3(k,j+1,i) - x3(k,j,i+1);
151d16a9515SJunchao Zhang       });
152d16a9515SJunchao Zhang     }
153*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscTime(&tend));
154*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAVecRestoreKokkosOffsetView(da,x,&x3));
155*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDAVecRestoreKokkosOffsetView(da,y,&y3));
156d16a9515SJunchao Zhang     avgTime = (tend - tstart)/nloop;
157*5f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- Kokkos average time = %e\n",len,avgTime));
158d16a9515SJunchao Zhang 
159*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecDestroy(&x));
160*5f80ce2aSJacob Faibussowitsch     CHKERRQ(VecDestroy(&y));
161*5f80ce2aSJacob Faibussowitsch     CHKERRQ(DMDestroy(&da));
162d16a9515SJunchao Zhang   }
163*5f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscRandomDestroy(&rctx));
164d16a9515SJunchao Zhang   ierr = PetscFinalize();
165d16a9515SJunchao Zhang   return ierr;
166d16a9515SJunchao Zhang }
167d16a9515SJunchao Zhang 
168d16a9515SJunchao Zhang /*TEST
169d16a9515SJunchao Zhang   build:
170d16a9515SJunchao Zhang     requires: kokkos_kernels
171d16a9515SJunchao Zhang 
172d16a9515SJunchao Zhang   test:
173d16a9515SJunchao Zhang     suffix: 1
174d16a9515SJunchao Zhang     requires: kokkos_kernels
175d16a9515SJunchao Zhang     args: -min 32 -max 32 -dm_vec_type kokkos
176d16a9515SJunchao Zhang     filter: grep -v "time"
177d16a9515SJunchao Zhang 
178d16a9515SJunchao Zhang TEST*/
179