1d16a9515SJunchao Zhang static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n"; 2d16a9515SJunchao Zhang 3d16a9515SJunchao Zhang /* 4d16a9515SJunchao Zhang On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos: 5d16a9515SJunchao Zhang Time (sec.), on Mar. 1, 2022 6d16a9515SJunchao Zhang ------------------------------------------ 7d16a9515SJunchao Zhang n PETSc C Kokkos 8d16a9515SJunchao Zhang ------------------------------------------ 9d16a9515SJunchao Zhang 32 4.6464E-05 4.7451E-05 1.6880E-04 10d16a9515SJunchao Zhang 64 2.5654E-04 2.5164E-04 5.6780E-04 11d16a9515SJunchao Zhang 128 1.9383E-03 1.8878E-03 4.7938E-03 12d16a9515SJunchao Zhang 256 1.4450E-02 1.3619E-02 3.7778E-02 13d16a9515SJunchao Zhang 512 1.1580E-01 1.1551E-01 2.8428E-01 14d16a9515SJunchao Zhang 1024 1.4179E+00 1.3772E+00 2.2873E+00 15d16a9515SJunchao Zhang 16d16a9515SJunchao Zhang Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc 17d16a9515SJunchao Zhang */ 18d16a9515SJunchao Zhang 19d16a9515SJunchao Zhang #include <petscdmda_kokkos.hpp> 20d16a9515SJunchao Zhang #include <petscdm.h> 21d16a9515SJunchao Zhang #include <petscdmda.h> 22d16a9515SJunchao Zhang 23209362dfSJunchao Zhang using Kokkos::Iterate; 24209362dfSJunchao Zhang using Kokkos::MDRangePolicy; 25209362dfSJunchao Zhang using Kokkos::Rank; 26d16a9515SJunchao Zhang using PetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>; 27d16a9515SJunchao Zhang using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>; 28d16a9515SJunchao Zhang 29d16a9515SJunchao Zhang /* PETSc multi-dimensional array access */ 30d71ae5a4SJacob Faibussowitsch static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime) 31d71ae5a4SJacob Faibussowitsch { 32d16a9515SJunchao Zhang PetscInt it, i, j, k; 33d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0, tend; 34d16a9515SJunchao Zhang PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs; 35d16a9515SJunchao Zhang 36d16a9515SJunchao Zhang PetscFunctionBegin; 379566063dSJacob Faibussowitsch PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm)); 389566063dSJacob Faibussowitsch PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm)); 39d16a9515SJunchao Zhang for (it = 0; it < nwarm + nloop; it++) { 409566063dSJacob Faibussowitsch if (it == nwarm) PetscCall(PetscTime(&tstart)); 41d16a9515SJunchao Zhang for (k = zs; k < zs + zm; k++) { 42d16a9515SJunchao Zhang for (j = ys; j < ys + ym; j++) { 43ad540459SPierre Jolivet for (i = xs; i < xs + xm; i++) y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1]; 44d16a9515SJunchao Zhang } 45d16a9515SJunchao Zhang } 46d16a9515SJunchao Zhang } 479566063dSJacob Faibussowitsch PetscCall(PetscTime(&tend)); 48d16a9515SJunchao Zhang *avgTime = (tend - tstart) / nloop; 493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 50d16a9515SJunchao Zhang } 51d16a9515SJunchao Zhang 52d16a9515SJunchao Zhang /* C multi-dimensional array access */ 53d71ae5a4SJacob Faibussowitsch static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime) 54d71ae5a4SJacob Faibussowitsch { 55d16a9515SJunchao Zhang PetscInt it, i, j, k; 56d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0, tend; 57d16a9515SJunchao Zhang PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs; 58d16a9515SJunchao Zhang 59d16a9515SJunchao Zhang PetscFunctionBegin; 609566063dSJacob Faibussowitsch PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm)); 619566063dSJacob Faibussowitsch PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm)); 62d16a9515SJunchao Zhang #define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)] 63d16a9515SJunchao Zhang #define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)] 64d16a9515SJunchao Zhang for (it = 0; it < nwarm + nloop; it++) { 659566063dSJacob Faibussowitsch if (it == nwarm) PetscCall(PetscTime(&tstart)); 66d16a9515SJunchao Zhang for (k = zs; k < zs + zm; k++) { 67d16a9515SJunchao Zhang for (j = ys; j < ys + ym; j++) { 68ad540459SPierre Jolivet for (i = xs; i < xs + xm; i++) Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1); 69d16a9515SJunchao Zhang } 70d16a9515SJunchao Zhang } 71d16a9515SJunchao Zhang } 729566063dSJacob Faibussowitsch PetscCall(PetscTime(&tend)); 73d16a9515SJunchao Zhang *avgTime = (tend - tstart) / nloop; 74d16a9515SJunchao Zhang #undef X2 75d16a9515SJunchao Zhang #undef Y2 763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 77d16a9515SJunchao Zhang } 78d16a9515SJunchao Zhang 79d71ae5a4SJacob Faibussowitsch int main(int argc, char **argv) 80d71ae5a4SJacob Faibussowitsch { 81d16a9515SJunchao Zhang DM da; 82d16a9515SJunchao Zhang PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs; 83d16a9515SJunchao Zhang PetscInt dof = 1, sw = 1; 84d16a9515SJunchao Zhang DMBoundaryType bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC; 85d16a9515SJunchao Zhang DMDAStencilType st = DMDA_STENCIL_STAR; 86d16a9515SJunchao Zhang Vec x, y; /* local/global vectors of the da */ 87d16a9515SJunchao Zhang PetscRandom rctx; 88d16a9515SJunchao Zhang const PetscScalar ***x1; 89d16a9515SJunchao Zhang PetscScalar ***y1; /* arrays of g, ll */ 90d16a9515SJunchao Zhang const PetscScalar *x2; 91d16a9515SJunchao Zhang PetscScalar *y2; 92d16a9515SJunchao Zhang ConstPetscScalarKokkosOffsetView3D x3; 93d16a9515SJunchao Zhang PetscScalarKokkosOffsetView3D y3; 94d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0, tend = 0.0, avgTime = 0.0; 95d16a9515SJunchao Zhang PetscInt nwarm = 2, nloop = 10; 96d16a9515SJunchao Zhang PetscInt min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */ 97d16a9515SJunchao Zhang 98327415f7SBarry Smith PetscFunctionBeginUser; 999566063dSJacob Faibussowitsch PetscCall(PetscInitialize(&argc, &argv, (char *)0, help)); 1009566063dSJacob Faibussowitsch PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx)); 1019566063dSJacob Faibussowitsch PetscCall(PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL)); 1029566063dSJacob Faibussowitsch PetscCall(PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL)); 103d16a9515SJunchao Zhang 104d16a9515SJunchao Zhang for (PetscInt len = min; len <= max; len = len * 2) { 1059566063dSJacob Faibussowitsch PetscCall(DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da)); 1069566063dSJacob Faibussowitsch PetscCall(DMSetFromOptions(da)); 1079566063dSJacob Faibussowitsch PetscCall(DMSetUp(da)); 108d16a9515SJunchao Zhang 1099566063dSJacob Faibussowitsch PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm)); 1109566063dSJacob Faibussowitsch PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm)); 1119566063dSJacob Faibussowitsch PetscCall(DMCreateLocalVector(da, &x)); /* Create local x and global y */ 1129566063dSJacob Faibussowitsch PetscCall(DMCreateGlobalVector(da, &y)); 113d16a9515SJunchao Zhang 114d16a9515SJunchao Zhang /* Access with petsc multi-dimensional arrays */ 1159566063dSJacob Faibussowitsch PetscCall(VecSetRandom(x, rctx)); 1169566063dSJacob Faibussowitsch PetscCall(VecSet(y, 0.0)); 1179566063dSJacob Faibussowitsch PetscCall(DMDAVecGetArrayRead(da, x, &x1)); 1189566063dSJacob Faibussowitsch PetscCall(DMDAVecGetArray(da, y, &y1)); 1199566063dSJacob Faibussowitsch PetscCall(Update1(da, x1, y1, nwarm, nloop, &avgTime)); 1209566063dSJacob Faibussowitsch PetscCall(DMDAVecRestoreArrayRead(da, x, &x1)); 1219566063dSJacob Faibussowitsch PetscCall(DMDAVecRestoreArray(da, y, &y1)); 1229566063dSJacob Faibussowitsch PetscCall(PetscTime(&tend)); 123*309cecaaSJunchao Zhang PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- PETSc average time = %e\n", static_cast<int>(len), avgTime)); 124d16a9515SJunchao Zhang 125d16a9515SJunchao Zhang /* Access with C multi-dimensional arrays */ 1269566063dSJacob Faibussowitsch PetscCall(VecSetRandom(x, rctx)); 1279566063dSJacob Faibussowitsch PetscCall(VecSet(y, 0.0)); 1289566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(x, &x2)); 1299566063dSJacob Faibussowitsch PetscCall(VecGetArray(y, &y2)); 1309566063dSJacob Faibussowitsch PetscCall(Update2(da, x2, y2, nwarm, nloop, &avgTime)); 1319566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(x, &x2)); 1329566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(y, &y2)); 133*309cecaaSJunchao Zhang PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- C average time = %e\n", static_cast<int>(len), avgTime)); 134d16a9515SJunchao Zhang 135d16a9515SJunchao Zhang /* Access with Kokkos multi-dimensional OffsetViews */ 1369566063dSJacob Faibussowitsch PetscCall(VecSet(y, 0.0)); 1379566063dSJacob Faibussowitsch PetscCall(VecSetRandom(x, rctx)); 1389566063dSJacob Faibussowitsch PetscCall(DMDAVecGetKokkosOffsetView(da, x, &x3)); 1399566063dSJacob Faibussowitsch PetscCall(DMDAVecGetKokkosOffsetView(da, y, &y3)); 140d16a9515SJunchao Zhang 141d16a9515SJunchao Zhang for (PetscInt it = 0; it < nwarm + nloop; it++) { 1429566063dSJacob Faibussowitsch if (it == nwarm) PetscCall(PetscTime(&tstart)); 1439371c9d4SSatish Balay Kokkos::parallel_for( 1449371c9d4SSatish Balay "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}), 1459371c9d4SSatish Balay KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); }); 146d16a9515SJunchao Zhang } 1479566063dSJacob Faibussowitsch PetscCall(PetscTime(&tend)); 1489566063dSJacob Faibussowitsch PetscCall(DMDAVecRestoreKokkosOffsetView(da, x, &x3)); 1499566063dSJacob Faibussowitsch PetscCall(DMDAVecRestoreKokkosOffsetView(da, y, &y3)); 150d16a9515SJunchao Zhang avgTime = (tend - tstart) / nloop; 151*309cecaaSJunchao Zhang PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- Kokkos average time = %e\n", static_cast<int>(len), avgTime)); 152d16a9515SJunchao Zhang 1539566063dSJacob Faibussowitsch PetscCall(VecDestroy(&x)); 1549566063dSJacob Faibussowitsch PetscCall(VecDestroy(&y)); 1559566063dSJacob Faibussowitsch PetscCall(DMDestroy(&da)); 156d16a9515SJunchao Zhang } 1579566063dSJacob Faibussowitsch PetscCall(PetscRandomDestroy(&rctx)); 1589566063dSJacob Faibussowitsch PetscCall(PetscFinalize()); 159b122ec5aSJacob Faibussowitsch return 0; 160d16a9515SJunchao Zhang } 161d16a9515SJunchao Zhang 162d16a9515SJunchao Zhang /*TEST 163d16a9515SJunchao Zhang build: 164d16a9515SJunchao Zhang requires: kokkos_kernels 165d16a9515SJunchao Zhang 166d16a9515SJunchao Zhang test: 167d16a9515SJunchao Zhang suffix: 1 168d16a9515SJunchao Zhang requires: kokkos_kernels 169d16a9515SJunchao Zhang args: -min 32 -max 32 -dm_vec_type kokkos 170d16a9515SJunchao Zhang filter: grep -v "time" 171d16a9515SJunchao Zhang 172d16a9515SJunchao Zhang TEST*/ 173