xref: /petsc/src/sys/objects/device/tests/ex1k.kokkos.cxx (revision 896e5da26b7afa0f7fe0b6f47059786607b5d79f)
1*896e5da2SSatish Balay static char help[] = "Benchmarking device kernel launch time\n";
2*896e5da2SSatish Balay /*
3*896e5da2SSatish Balay   Running example on Summit at OLCF:
4*896e5da2SSatish Balay   # run with total 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS
5*896e5da2SSatish Balay   $ jsrun -n1 -a1 -c7 -g1 -r1  ./ex1k
6*896e5da2SSatish Balay   Average asynchronous device kernel launch time = 4.86 microseconds
7*896e5da2SSatish Balay   Average synchronous device kernel launch time  = 12.83 microseconds
8*896e5da2SSatish Balay 
9*896e5da2SSatish Balay   Frontier@OLCF
10*896e5da2SSatish Balay   $ srun -n1 -c32 --cpu-bind=threads --gpus-per-node=8 --gpu-bind=closest ./ex1k
11*896e5da2SSatish Balay   Average asynchronous device kernel launch time = 1.88 microseconds
12*896e5da2SSatish Balay   Average synchronous device kernel launch time  = 7.78 microseconds
13*896e5da2SSatish Balay 
14*896e5da2SSatish Balay   Aurora@ALCF
15*896e5da2SSatish Balay   $ mpirun -n 1 ./ex1k
16*896e5da2SSatish Balay   Average asynchronous device kernel launch time = 3.34 microseconds
17*896e5da2SSatish Balay   Average synchronous device kernel launch time  = 6.24 microseconds
18*896e5da2SSatish Balay 
19*896e5da2SSatish Balay   Perlmutter@NERSC
20*896e5da2SSatish Balay   $ srun -n 1 --gpus-per-task=1 ./ex1k
21*896e5da2SSatish Balay   Average asynchronous device kernel launch time = 2.31 microseconds
22*896e5da2SSatish Balay   Average synchronous device kernel launch time  = 7.13 microseconds
23*896e5da2SSatish Balay */
24*896e5da2SSatish Balay 
25*896e5da2SSatish Balay #include <petscsys.h>
26*896e5da2SSatish Balay #include <petsc_kokkos.hpp>
27*896e5da2SSatish Balay 
28*896e5da2SSatish Balay int main(int argc, char **argv)
29*896e5da2SSatish Balay {
30*896e5da2SSatish Balay   PetscInt       i, n = 100000, N = 256;
31*896e5da2SSatish Balay   PetscLogDouble tstart, tend, time;
32*896e5da2SSatish Balay 
33*896e5da2SSatish Balay   PetscFunctionBeginUser;
34*896e5da2SSatish Balay   PetscCall(PetscInitialize(&argc, &argv, nullptr, help));
35*896e5da2SSatish Balay   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));
36*896e5da2SSatish Balay   PetscCall(PetscKokkosInitializeCheck());
37*896e5da2SSatish Balay   {
38*896e5da2SSatish Balay     Kokkos::DefaultExecutionSpace                      exec = PetscGetKokkosExecutionSpace();
39*896e5da2SSatish Balay     Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace> policy(exec, 0, N);
40*896e5da2SSatish Balay 
41*896e5da2SSatish Balay     PetscCallCXX(exec.fence()); // Initialize device runtime to get more accurate timing below
42*896e5da2SSatish Balay     // Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one
43*896e5da2SSatish Balay     PetscCall(PetscTime(&tstart));
44*896e5da2SSatish Balay     for (i = 0; i < n; i++) PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
45*896e5da2SSatish Balay     PetscCall(PetscTime(&tend));
46*896e5da2SSatish Balay     PetscCallCXX(exec.fence());
47*896e5da2SSatish Balay     time = (tend - tstart) * 1e6 / n;
48*896e5da2SSatish Balay     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average asynchronous device kernel launch time = %.2f microseconds\n", time));
49*896e5da2SSatish Balay 
50*896e5da2SSatish Balay     // Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed
51*896e5da2SSatish Balay     PetscCall(PetscTime(&tstart));
52*896e5da2SSatish Balay     for (i = 0; i < n; i++) {
53*896e5da2SSatish Balay       PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
54*896e5da2SSatish Balay       PetscCallCXX(exec.fence());
55*896e5da2SSatish Balay     }
56*896e5da2SSatish Balay     PetscCall(PetscTime(&tend));
57*896e5da2SSatish Balay     time = (tend - tstart) * 1e6 / n;
58*896e5da2SSatish Balay     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average synchronous device kernel launch time  = %.2f microseconds\n", time));
59*896e5da2SSatish Balay   }
60*896e5da2SSatish Balay 
61*896e5da2SSatish Balay   PetscCall(PetscFinalize());
62*896e5da2SSatish Balay   return 0;
63*896e5da2SSatish Balay }
64*896e5da2SSatish Balay 
65*896e5da2SSatish Balay /*TEST
66*896e5da2SSatish Balay   test:
67*896e5da2SSatish Balay     requires: kokkos
68*896e5da2SSatish Balay     args: -n 2
69*896e5da2SSatish Balay     output_file: output/empty.out
70*896e5da2SSatish Balay     filter: grep "DOES_NOT_EXIST"
71*896e5da2SSatish Balay 
72*896e5da2SSatish Balay TEST*/
73