static char help[] = "Benchmarking various methods of accessing DMDA vectors on the host\n\n";

/*
  On a machine with AMD EPYC-7452 CPUs, we collected the following timings using one MPI rank and a serial-only Kokkos build:
           Time (sec.), on Mar. 1, 2022
  ------------------------------------------
  n     PETSc          C          Kokkos
  ------------------------------------------
  32    4.6464E-05  4.7451E-05  1.6880E-04
  64    2.5654E-04  2.5164E-04  5.6780E-04
  128   1.9383E-03  1.8878E-03  4.7938E-03
  256   1.4450E-02  1.3619E-02  3.7778E-02
  512   1.1580E-01  1.1551E-01  2.8428E-01
  1024  1.4179E+00  1.3772E+00  2.2873E+00

  Overall, C ranges from about 2% slower to 5% faster than PETSc, while Kokkos is 1.6x to 3.6x slower than PETSc.
*/
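
/*
  Three implementations of the same 7-point stencil sweep are compared:
    Update1 - PETSc multi-dimensional array access via DMDAVecGetArray()
    Update2 - plain C indexing into the flat arrays from VecGetArray()
    main()  - Kokkos OffsetViews obtained with DMDAVecGetKokkosOffsetView()
  The range of cubic grid sizes is controlled by the -min and -max options parsed below.
*/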

#include <petscdmda_kokkos.hpp>
#include <petscdm.h>
#include <petscdmda.h>

using Kokkos::Iterate;
using Kokkos::MDRangePolicy;
using Kokkos::Rank;
using PetscScalarKokkosOffsetView3D      = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;
using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;

/* PETSc multi-dimensional array access */
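/*
  Computes y = 6*x - (sum of the six star-stencil neighbors) over the locally
  owned grid points. x is a ghosted local array, so the k/j/i +- 1 accesses are
  valid on the boundary of the owned region (the DMDA is periodic with stencil
  width 1).
*/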
static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime) {
  PetscInt       it, i, j, k;
  PetscLogDouble tstart = 0.0, tend;
  PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

  PetscFunctionBegin;
  PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
  PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
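  /* Do nwarm untimed warm-up sweeps, then time the next nloop sweeps; tstart
     is taken when it reaches nwarm, so warm-up is excluded from the average */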
  for (it = 0; it < nwarm + nloop; it++) {
    if (it == nwarm) PetscCall(PetscTime(&tstart));
    for (k = zs; k < zs + zm; k++) {
      for (j = ys; j < ys + ym; j++) {
        for (i = xs; i < xs + xm; i++) { y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1]; }
      }
    }
  }
  PetscCall(PetscTime(&tend));
  *avgTime = (tend - tstart) / nloop;
  PetscFunctionReturn(0);
}

/* C multi-dimensional array access */
static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime) {
  PetscInt       it, i, j, k;
  PetscLogDouble tstart = 0.0, tend;
  PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;

  PetscFunctionBegin;
  PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
  PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
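/* X2 indexes the ghosted local array, so it uses the ghost corners and widths
   (gzs, gym, gxm, ...); Y2 indexes the global array, which stores only the
   locally owned part, so it uses the owned corners and widths (zs, ym, xm, ...) */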
#define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)]
#define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)]
  for (it = 0; it < nwarm + nloop; it++) {
    if (it == nwarm) PetscCall(PetscTime(&tstart));
    for (k = zs; k < zs + zm; k++) {
      for (j = ys; j < ys + ym; j++) {
        for (i = xs; i < xs + xm; i++) { Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1); }
      }
    }
  }
  PetscCall(PetscTime(&tend));
  *avgTime = (tend - tstart) / nloop;
#undef X2
#undef Y2
  PetscFunctionReturn(0);
}

int main(int argc, char **argv) {
  DM                                 da;
  PetscInt                           xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
  PetscInt                           dof = 1, sw = 1;
  DMBoundaryType                     bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC;
  DMDAStencilType                    st = DMDA_STENCIL_STAR;
  Vec                                x, y; /* x: local vector, y: global vector of the da */
  PetscRandom                        rctx;
  const PetscScalar               ***x1;
  PetscScalar                     ***y1; /* PETSc array views of x and y */
  const PetscScalar                 *x2;
  PetscScalar                       *y2;
  ConstPetscScalarKokkosOffsetView3D x3;
  PetscScalarKokkosOffsetView3D      y3;
  PetscLogDouble                     tstart = 0.0, tend = 0.0, avgTime = 0.0;
  PetscInt                           nwarm = 2, nloop = 10;
  PetscInt                           min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
  PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL));

  for (PetscInt len = min; len <= max; len = len * 2) {
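    /* A len^3 grid, periodic in all three directions, star stencil of width sw = 1, dof = 1 unknown per grid point */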
    PetscCall(DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da));
    PetscCall(DMSetFromOptions(da));
    PetscCall(DMSetUp(da));

    PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
    PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
    PetscCall(DMCreateLocalVector(da, &x)); /* Create local x and global y */
    PetscCall(DMCreateGlobalVector(da, &y));
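    /* x must be a local (ghosted) vector since the stencil reads neighbors of
       points on the boundary of the owned region; y stores only owned points */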

    /* Access with PETSc multi-dimensional arrays */
    PetscCall(VecSetRandom(x, rctx));
    PetscCall(VecSet(y, 0.0));
    PetscCall(DMDAVecGetArrayRead(da, x, &x1));
    PetscCall(DMDAVecGetArray(da, y, &y1));
    PetscCall(Update1(da, x1, y1, nwarm, nloop, &avgTime));
    PetscCall(DMDAVecRestoreArrayRead(da, x, &x1));
    PetscCall(DMDAVecRestoreArray(da, y, &y1));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4" PetscInt_FMT "^3 -- PETSc average time  = %e\n", len, avgTime));

    /* Access with C multi-dimensional arrays */
    PetscCall(VecSetRandom(x, rctx));
    PetscCall(VecSet(y, 0.0));
    PetscCall(VecGetArrayRead(x, &x2));
    PetscCall(VecGetArray(y, &y2));
    PetscCall(Update2(da, x2, y2, nwarm, nloop, &avgTime));
    PetscCall(VecRestoreArrayRead(x, &x2));
    PetscCall(VecRestoreArray(y, &y2));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4" PetscInt_FMT "^3 -- C average time      = %e\n", len, avgTime));

    /* Access with Kokkos multi-dimensional OffsetViews */
    PetscCall(VecSet(y, 0.0));
    PetscCall(VecSetRandom(x, rctx));
    PetscCall(DMDAVecGetKokkosOffsetView(da, x, &x3));
    PetscCall(DMDAVecGetKokkosOffsetView(da, y, &y3));
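    /* The OffsetViews are indexed with the same global (k, j, i) indices as the
       PETSc arrays above: x3 begins at the ghost corners and y3 at the owned
       corners, so no manual offset arithmetic is needed */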

    for (PetscInt it = 0; it < nwarm + nloop; it++) {
      if (it == nwarm) PetscCall(PetscTime(&tstart));
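      /* Iterate::Right makes i the fastest-varying index, matching the
         LayoutRight views, so the innermost accesses are contiguous */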
      Kokkos::parallel_for(
        "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}),
        KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); });
    }
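    /* The host backends used here typically run parallel_for synchronously; an
       asynchronous backend would need a Kokkos::fence() before stopping the timer */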
    PetscCall(PetscTime(&tend));
    PetscCall(DMDAVecRestoreKokkosOffsetView(da, x, &x3));
    PetscCall(DMDAVecRestoreKokkosOffsetView(da, y, &y3));
    avgTime = (tend - tstart) / nloop;
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4" PetscInt_FMT "^3 -- Kokkos average time = %e\n", len, avgTime));

    PetscCall(VecDestroy(&x));
    PetscCall(VecDestroy(&y));
    PetscCall(DMDestroy(&da));
  }
  PetscCall(PetscRandomDestroy(&rctx));
  PetscCall(PetscFinalize());
  return 0;
}

/*TEST
  build:
    requires: kokkos_kernels

  test:
    suffix: 1
    requires: kokkos_kernels
    args: -min 32 -max 32 -dm_vec_type kokkos
    filter: grep -v "time"

TEST*/
170