xref: /petsc/src/dm/tests/ex2k.kokkos.cxx (revision 309cecaa259e96d4f066f69fae1d7c5cc1b2c213)
1d16a9515SJunchao Zhang static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";
2d16a9515SJunchao Zhang 
3d16a9515SJunchao Zhang /*
4d16a9515SJunchao Zhang   On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
5d16a9515SJunchao Zhang            Time (sec.), on Mar. 1, 2022
6d16a9515SJunchao Zhang   ------------------------------------------
7d16a9515SJunchao Zhang   n     PETSc          C          Kokkos
8d16a9515SJunchao Zhang   ------------------------------------------
9d16a9515SJunchao Zhang   32    4.6464E-05  4.7451E-05  1.6880E-04
10d16a9515SJunchao Zhang   64    2.5654E-04  2.5164E-04  5.6780E-04
11d16a9515SJunchao Zhang   128   1.9383E-03  1.8878E-03  4.7938E-03
12d16a9515SJunchao Zhang   256   1.4450E-02  1.3619E-02  3.7778E-02
13d16a9515SJunchao Zhang   512   1.1580E-01  1.1551E-01  2.8428E-01
14d16a9515SJunchao Zhang   1024  1.4179E+00  1.3772E+00  2.2873E+00
15d16a9515SJunchao Zhang 
16d16a9515SJunchao Zhang   Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc
17d16a9515SJunchao Zhang */
18d16a9515SJunchao Zhang 
19d16a9515SJunchao Zhang #include <petscdmda_kokkos.hpp>
20d16a9515SJunchao Zhang #include <petscdm.h>
21d16a9515SJunchao Zhang #include <petscdmda.h>
22d16a9515SJunchao Zhang 
23209362dfSJunchao Zhang using Kokkos::Iterate;
24209362dfSJunchao Zhang using Kokkos::MDRangePolicy;
25209362dfSJunchao Zhang using Kokkos::Rank;
26d16a9515SJunchao Zhang using PetscScalarKokkosOffsetView3D      = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;
27d16a9515SJunchao Zhang using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>;
28d16a9515SJunchao Zhang 
29d16a9515SJunchao Zhang /* PETSc multi-dimensional array access */
30d71ae5a4SJacob Faibussowitsch static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
31d71ae5a4SJacob Faibussowitsch {
32d16a9515SJunchao Zhang   PetscInt       it, i, j, k;
33d16a9515SJunchao Zhang   PetscLogDouble tstart = 0.0, tend;
34d16a9515SJunchao Zhang   PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
35d16a9515SJunchao Zhang 
36d16a9515SJunchao Zhang   PetscFunctionBegin;
379566063dSJacob Faibussowitsch   PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
389566063dSJacob Faibussowitsch   PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
39d16a9515SJunchao Zhang   for (it = 0; it < nwarm + nloop; it++) {
409566063dSJacob Faibussowitsch     if (it == nwarm) PetscCall(PetscTime(&tstart));
41d16a9515SJunchao Zhang     for (k = zs; k < zs + zm; k++) {
42d16a9515SJunchao Zhang       for (j = ys; j < ys + ym; j++) {
43ad540459SPierre Jolivet         for (i = xs; i < xs + xm; i++) y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1];
44d16a9515SJunchao Zhang       }
45d16a9515SJunchao Zhang     }
46d16a9515SJunchao Zhang   }
479566063dSJacob Faibussowitsch   PetscCall(PetscTime(&tend));
48d16a9515SJunchao Zhang   *avgTime = (tend - tstart) / nloop;
493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
50d16a9515SJunchao Zhang }
51d16a9515SJunchao Zhang 
52d16a9515SJunchao Zhang /* C multi-dimensional array access */
53d71ae5a4SJacob Faibussowitsch static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
54d71ae5a4SJacob Faibussowitsch {
55d16a9515SJunchao Zhang   PetscInt       it, i, j, k;
56d16a9515SJunchao Zhang   PetscLogDouble tstart = 0.0, tend;
57d16a9515SJunchao Zhang   PetscInt       xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
58d16a9515SJunchao Zhang 
59d16a9515SJunchao Zhang   PetscFunctionBegin;
609566063dSJacob Faibussowitsch   PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
619566063dSJacob Faibussowitsch   PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
62d16a9515SJunchao Zhang #define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)]
63d16a9515SJunchao Zhang #define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)]
64d16a9515SJunchao Zhang   for (it = 0; it < nwarm + nloop; it++) {
659566063dSJacob Faibussowitsch     if (it == nwarm) PetscCall(PetscTime(&tstart));
66d16a9515SJunchao Zhang     for (k = zs; k < zs + zm; k++) {
67d16a9515SJunchao Zhang       for (j = ys; j < ys + ym; j++) {
68ad540459SPierre Jolivet         for (i = xs; i < xs + xm; i++) Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1);
69d16a9515SJunchao Zhang       }
70d16a9515SJunchao Zhang     }
71d16a9515SJunchao Zhang   }
729566063dSJacob Faibussowitsch   PetscCall(PetscTime(&tend));
73d16a9515SJunchao Zhang   *avgTime = (tend - tstart) / nloop;
74d16a9515SJunchao Zhang #undef X2
75d16a9515SJunchao Zhang #undef Y2
763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
77d16a9515SJunchao Zhang }
78d16a9515SJunchao Zhang 
79d71ae5a4SJacob Faibussowitsch int main(int argc, char **argv)
80d71ae5a4SJacob Faibussowitsch {
81d16a9515SJunchao Zhang   DM                                 da;
82d16a9515SJunchao Zhang   PetscInt                           xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
83d16a9515SJunchao Zhang   PetscInt                           dof = 1, sw = 1;
84d16a9515SJunchao Zhang   DMBoundaryType                     bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC;
85d16a9515SJunchao Zhang   DMDAStencilType                    st = DMDA_STENCIL_STAR;
86d16a9515SJunchao Zhang   Vec                                x, y; /* local/global vectors of the da */
87d16a9515SJunchao Zhang   PetscRandom                        rctx;
88d16a9515SJunchao Zhang   const PetscScalar               ***x1;
89d16a9515SJunchao Zhang   PetscScalar                     ***y1; /* arrays of g, ll */
90d16a9515SJunchao Zhang   const PetscScalar                 *x2;
91d16a9515SJunchao Zhang   PetscScalar                       *y2;
92d16a9515SJunchao Zhang   ConstPetscScalarKokkosOffsetView3D x3;
93d16a9515SJunchao Zhang   PetscScalarKokkosOffsetView3D      y3;
94d16a9515SJunchao Zhang   PetscLogDouble                     tstart = 0.0, tend = 0.0, avgTime = 0.0;
95d16a9515SJunchao Zhang   PetscInt                           nwarm = 2, nloop = 10;
96d16a9515SJunchao Zhang   PetscInt                           min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */
97d16a9515SJunchao Zhang 
98327415f7SBarry Smith   PetscFunctionBeginUser;
999566063dSJacob Faibussowitsch   PetscCall(PetscInitialize(&argc, &argv, (char *)0, help));
1009566063dSJacob Faibussowitsch   PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
1019566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL));
1029566063dSJacob Faibussowitsch   PetscCall(PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL));
103d16a9515SJunchao Zhang 
104d16a9515SJunchao Zhang   for (PetscInt len = min; len <= max; len = len * 2) {
1059566063dSJacob Faibussowitsch     PetscCall(DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da));
1069566063dSJacob Faibussowitsch     PetscCall(DMSetFromOptions(da));
1079566063dSJacob Faibussowitsch     PetscCall(DMSetUp(da));
108d16a9515SJunchao Zhang 
1099566063dSJacob Faibussowitsch     PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
1109566063dSJacob Faibussowitsch     PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
1119566063dSJacob Faibussowitsch     PetscCall(DMCreateLocalVector(da, &x)); /* Create local x and global y */
1129566063dSJacob Faibussowitsch     PetscCall(DMCreateGlobalVector(da, &y));
113d16a9515SJunchao Zhang 
114d16a9515SJunchao Zhang     /* Access with petsc multi-dimensional arrays */
1159566063dSJacob Faibussowitsch     PetscCall(VecSetRandom(x, rctx));
1169566063dSJacob Faibussowitsch     PetscCall(VecSet(y, 0.0));
1179566063dSJacob Faibussowitsch     PetscCall(DMDAVecGetArrayRead(da, x, &x1));
1189566063dSJacob Faibussowitsch     PetscCall(DMDAVecGetArray(da, y, &y1));
1199566063dSJacob Faibussowitsch     PetscCall(Update1(da, x1, y1, nwarm, nloop, &avgTime));
1209566063dSJacob Faibussowitsch     PetscCall(DMDAVecRestoreArrayRead(da, x, &x1));
1219566063dSJacob Faibussowitsch     PetscCall(DMDAVecRestoreArray(da, y, &y1));
1229566063dSJacob Faibussowitsch     PetscCall(PetscTime(&tend));
123*309cecaaSJunchao Zhang     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- PETSc average time  = %e\n", static_cast<int>(len), avgTime));
124d16a9515SJunchao Zhang 
125d16a9515SJunchao Zhang     /* Access with C multi-dimensional arrays */
1269566063dSJacob Faibussowitsch     PetscCall(VecSetRandom(x, rctx));
1279566063dSJacob Faibussowitsch     PetscCall(VecSet(y, 0.0));
1289566063dSJacob Faibussowitsch     PetscCall(VecGetArrayRead(x, &x2));
1299566063dSJacob Faibussowitsch     PetscCall(VecGetArray(y, &y2));
1309566063dSJacob Faibussowitsch     PetscCall(Update2(da, x2, y2, nwarm, nloop, &avgTime));
1319566063dSJacob Faibussowitsch     PetscCall(VecRestoreArrayRead(x, &x2));
1329566063dSJacob Faibussowitsch     PetscCall(VecRestoreArray(y, &y2));
133*309cecaaSJunchao Zhang     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- C average time      = %e\n", static_cast<int>(len), avgTime));
134d16a9515SJunchao Zhang 
135d16a9515SJunchao Zhang     /* Access with Kokkos multi-dimensional OffsetViews */
1369566063dSJacob Faibussowitsch     PetscCall(VecSet(y, 0.0));
1379566063dSJacob Faibussowitsch     PetscCall(VecSetRandom(x, rctx));
1389566063dSJacob Faibussowitsch     PetscCall(DMDAVecGetKokkosOffsetView(da, x, &x3));
1399566063dSJacob Faibussowitsch     PetscCall(DMDAVecGetKokkosOffsetView(da, y, &y3));
140d16a9515SJunchao Zhang 
141d16a9515SJunchao Zhang     for (PetscInt it = 0; it < nwarm + nloop; it++) {
1429566063dSJacob Faibussowitsch       if (it == nwarm) PetscCall(PetscTime(&tstart));
1439371c9d4SSatish Balay       Kokkos::parallel_for(
1449371c9d4SSatish Balay         "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}),
1459371c9d4SSatish Balay         KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); });
146d16a9515SJunchao Zhang     }
1479566063dSJacob Faibussowitsch     PetscCall(PetscTime(&tend));
1489566063dSJacob Faibussowitsch     PetscCall(DMDAVecRestoreKokkosOffsetView(da, x, &x3));
1499566063dSJacob Faibussowitsch     PetscCall(DMDAVecRestoreKokkosOffsetView(da, y, &y3));
150d16a9515SJunchao Zhang     avgTime = (tend - tstart) / nloop;
151*309cecaaSJunchao Zhang     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- Kokkos average time = %e\n", static_cast<int>(len), avgTime));
152d16a9515SJunchao Zhang 
1539566063dSJacob Faibussowitsch     PetscCall(VecDestroy(&x));
1549566063dSJacob Faibussowitsch     PetscCall(VecDestroy(&y));
1559566063dSJacob Faibussowitsch     PetscCall(DMDestroy(&da));
156d16a9515SJunchao Zhang   }
1579566063dSJacob Faibussowitsch   PetscCall(PetscRandomDestroy(&rctx));
1589566063dSJacob Faibussowitsch   PetscCall(PetscFinalize());
159b122ec5aSJacob Faibussowitsch   return 0;
160d16a9515SJunchao Zhang }
161d16a9515SJunchao Zhang 
162d16a9515SJunchao Zhang /*TEST
163d16a9515SJunchao Zhang   build:
164d16a9515SJunchao Zhang     requires: kokkos_kernels
165d16a9515SJunchao Zhang 
166d16a9515SJunchao Zhang   test:
167d16a9515SJunchao Zhang     suffix: 1
168d16a9515SJunchao Zhang     requires: kokkos_kernels
169d16a9515SJunchao Zhang     args: -min 32 -max 32 -dm_vec_type kokkos
170d16a9515SJunchao Zhang     filter: grep -v "time"
171d16a9515SJunchao Zhang 
172d16a9515SJunchao Zhang TEST*/
173