xref: /petsc/src/dm/tests/ex2k.kokkos.cxx (revision ffa8c5705e8ab2cf85ee1d14dbe507a6e2eb5283)
1 static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";
2 
3 /*
4   On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
5            Time (sec.), on Mar. 1, 2022
6   ------------------------------------------
7   n     PETSc          C          Kokkos
8   ------------------------------------------
9   32    4.6464E-05  4.7451E-05  1.6880E-04
10   64    2.5654E-04  2.5164E-04  5.6780E-04
11   128   1.9383E-03  1.8878E-03  4.7938E-03
12   256   1.4450E-02  1.3619E-02  3.7778E-02
13   512   1.1580E-01  1.1551E-01  2.8428E-01
14   1024  1.4179E+00  1.3772E+00  2.2873E+00
15 
16   Overall, C is between 2% slower and 5% faster than PETSc, while Kokkos is 1.6x-3.6x slower than PETSc
17 */
18 
19 #include <petscdmda_kokkos.hpp>
20 #include <petscdm.h>
21 #include <petscdmda.h>
22 
23 using namespace Kokkos;
24 using PetscScalarKokkosOffsetView3D      = Kokkos::Experimental::OffsetView<PetscScalar***,Kokkos::LayoutRight,Kokkos::HostSpace>;
25 using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar***, Kokkos::LayoutRight,Kokkos::HostSpace>;
26 
27 /* PETSc multi-dimensional array access */
28 static PetscErrorCode Update1(DM da,const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime)
29 {
30   PetscInt       it,i,j,k;
31   PetscLogDouble tstart = 0.0,tend;
32   PetscInt       xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs;
33 
34   PetscFunctionBegin;
35   PetscCall(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm));
36   PetscCall(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm));
37   for (it=0; it<nwarm+nloop; it++) {
38     if (it == nwarm) PetscCall(PetscTime(&tstart));
39     for (k=zs; k<zs+zm; k++) {
40       for (j=ys; j<ys+ym; j++) {
41         for (i=xs; i<xs+xm; i++) {
42           y1[k][j][i] = 6*x1[k][j][i] - x1[k-1][j][i] - x1[k][j-1][i] - x1[k][j][i-1]
43                                       - x1[k+1][j][i] - x1[k][j+1][i] - x1[k][j][i+1];
44         }
45       }
46     }
47   }
48   PetscCall(PetscTime(&tend));
49   *avgTime = (tend - tstart)/nloop;
50   PetscFunctionReturn(0);
51 }
52 
53 /* C multi-dimensional array access */
54 static PetscErrorCode Update2(DM da,const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm,PetscInt nloop,PetscLogDouble *avgTime)
55 {
56   PetscInt       it,i,j,k;
57   PetscLogDouble tstart = 0.0,tend;
58   PetscInt       xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs;
59 
60   PetscFunctionBegin;
61   PetscCall(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm));
62   PetscCall(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm));
63 #define X2(k,j,i) x2[(k-gzs)*gym*gxm+(j-gys)*gxm+(i-gxs)]
64 #define Y2(k,j,i) y2[(k-zs)*ym*xm+(j-ys)*xm+(i-xs)]
65   for (it=0; it<nwarm+nloop; it++) {
66     if (it == nwarm) PetscCall(PetscTime(&tstart));
67     for (k=zs; k<zs+zm; k++) {
68       for (j=ys; j<ys+ym; j++) {
69         for (i=xs; i<xs+xm; i++) {
70           Y2(k,j,i) = 6*X2(k,j,i) - X2(k-1,j,i) - X2(k,j-1,i) - X2(k,j,i-1)
71                                   - X2(k+1,j,i) - X2(k,j+1,i) - X2(k,j,i+1);
72         }
73       }
74     }
75   }
76   PetscCall(PetscTime(&tend));
77   *avgTime = (tend - tstart)/nloop;
78 #undef X2
79 #undef Y2
80   PetscFunctionReturn(0);
81 }
82 
83 int main(int argc,char **argv)
84 {
85   DM                                   da;
86   PetscInt                             xm,ym,zm,xs,ys,zs,gxm,gym,gzm,gxs,gys,gzs;
87   PetscInt                             dof = 1,sw = 1;
88   DMBoundaryType                       bx = DM_BOUNDARY_PERIODIC,by = DM_BOUNDARY_PERIODIC,bz = DM_BOUNDARY_PERIODIC;
89   DMDAStencilType                      st = DMDA_STENCIL_STAR;
90   Vec                                  x,y; /* local/global vectors of the da */
91   PetscRandom                          rctx;
92   const PetscScalar                    ***x1;
93   PetscScalar                          ***y1; /* arrays of g, ll */
94   const PetscScalar                    *x2;
95   PetscScalar                          *y2;
96   ConstPetscScalarKokkosOffsetView3D   x3;
97   PetscScalarKokkosOffsetView3D        y3;
98   PetscLogDouble                       tstart = 0.0,tend = 0.0,avgTime = 0.0;
99   PetscInt                             nwarm = 2, nloop = 10;
100   PetscInt                             min = 32, max = 32*8; /* min and max sizes of the grids to sample */
101 
102   PetscCall(PetscInitialize(&argc,&argv,(char*)0,help));
103   PetscCall(PetscRandomCreate(PETSC_COMM_WORLD,&rctx));
104   PetscCall(PetscOptionsGetInt(NULL,NULL,"-min",&min,NULL));
105   PetscCall(PetscOptionsGetInt(NULL,NULL,"-max",&max,NULL));
106 
107   for (PetscInt len=min; len<=max; len=len*2) {
108     PetscCall(DMDACreate3d(PETSC_COMM_WORLD,bx,by,bz,st,len,len,len,PETSC_DECIDE,PETSC_DECIDE,PETSC_DECIDE,dof,sw,0,0,0,&da));
109     PetscCall(DMSetFromOptions(da));
110     PetscCall(DMSetUp(da));
111 
112     PetscCall(DMDAGetCorners(da,&xs,&ys,&zs,&xm,&ym,&zm));
113     PetscCall(DMDAGetGhostCorners(da,&gxs,&gys,&gzs,&gxm,&gym,&gzm));
114     PetscCall(DMCreateLocalVector(da,&x)); /* Create local x and global y */
115     PetscCall(DMCreateGlobalVector(da,&y));
116 
117     /* Access with petsc multi-dimensional arrays */
118     PetscCall(VecSetRandom(x,rctx));
119     PetscCall(VecSet(y,0.0));
120     PetscCall(DMDAVecGetArrayRead(da,x,&x1));
121     PetscCall(DMDAVecGetArray(da,y,&y1));
122     PetscCall(Update1(da,x1,y1,nwarm,nloop,&avgTime));
123     PetscCall(DMDAVecRestoreArrayRead(da,x,&x1));
124     PetscCall(DMDAVecRestoreArray(da,y,&y1));
125     PetscCall(PetscTime(&tend));
126     PetscCall(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- PETSc average time  = %e\n",len,avgTime));
127 
128     /* Access with C multi-dimensional arrays */
129     PetscCall(VecSetRandom(x,rctx));
130     PetscCall(VecSet(y,0.0));
131     PetscCall(VecGetArrayRead(x,&x2));
132     PetscCall(VecGetArray(y,&y2));
133     PetscCall(Update2(da,x2,y2,nwarm,nloop,&avgTime));
134     PetscCall(VecRestoreArrayRead(x,&x2));
135     PetscCall(VecRestoreArray(y,&y2));
136     PetscCall(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- C average time      = %e\n",len,avgTime));
137 
138     /* Access with Kokkos multi-dimensional OffsetViews */
139     PetscCall(VecSet(y,0.0));
140     PetscCall(VecSetRandom(x,rctx));
141     PetscCall(DMDAVecGetKokkosOffsetView(da,x,&x3));
142     PetscCall(DMDAVecGetKokkosOffsetView(da,y,&y3));
143 
144     for (PetscInt it=0; it<nwarm+nloop; it++) {
145       if (it == nwarm) PetscCall(PetscTime(&tstart));
146       Kokkos::parallel_for("stencil",MDRangePolicy<Kokkos::DefaultHostExecutionSpace,Rank<3,Iterate::Right,Iterate::Right>>({zs,ys,xs},{zs+zm,ys+ym,xs+xm}),
147         KOKKOS_LAMBDA(PetscInt k,PetscInt j,PetscInt i) {
148         y3(k,j,i) = 6*x3(k,j,i) - x3(k-1,j,i) - x3(k,j-1,i) - x3(k,j,i-1)
149                                 - x3(k+1,j,i) - x3(k,j+1,i) - x3(k,j,i+1);
150       });
151     }
152     PetscCall(PetscTime(&tend));
153     PetscCall(DMDAVecRestoreKokkosOffsetView(da,x,&x3));
154     PetscCall(DMDAVecRestoreKokkosOffsetView(da,y,&y3));
155     avgTime = (tend - tstart)/nloop;
156     PetscCall(PetscPrintf(PETSC_COMM_WORLD,"%4d^3 -- Kokkos average time = %e\n",len,avgTime));
157 
158     PetscCall(VecDestroy(&x));
159     PetscCall(VecDestroy(&y));
160     PetscCall(DMDestroy(&da));
161   }
162   PetscCall(PetscRandomDestroy(&rctx));
163   PetscCall(PetscFinalize());
164   return 0;
165 }
166 
167 /*TEST
168   build:
169     requires: kokkos_kernels
170 
171   test:
172     suffix: 1
173     requires: kokkos_kernels
174     args: -min 32 -max 32 -dm_vec_type kokkos
175     filter: grep -v "time"
176 
177 TEST*/
178