1*896e5da2SSatish Balay static char help[] = "Benchmarking device kernel launch time\n"; 2*896e5da2SSatish Balay /* 3*896e5da2SSatish Balay Running example on Summit at OLCF: 4*896e5da2SSatish Balay # run with total 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS 5*896e5da2SSatish Balay $ jsrun -n1 -a1 -c7 -g1 -r1 ./ex1k 6*896e5da2SSatish Balay Average asynchronous device kernel launch time = 4.86 microseconds 7*896e5da2SSatish Balay Average synchronous device kernel launch time = 12.83 microseconds 8*896e5da2SSatish Balay 9*896e5da2SSatish Balay Frontier@OLCF 10*896e5da2SSatish Balay $ srun -n1 -c32 --cpu-bind=threads --gpus-per-node=8 --gpu-bind=closest ./ex1k 11*896e5da2SSatish Balay Average asynchronous device kernel launch time = 1.88 microseconds 12*896e5da2SSatish Balay Average synchronous device kernel launch time = 7.78 microseconds 13*896e5da2SSatish Balay 14*896e5da2SSatish Balay Aurora@ALCF 15*896e5da2SSatish Balay $ mpirun -n 1 ./ex1k 16*896e5da2SSatish Balay Average asynchronous device kernel launch time = 3.34 microseconds 17*896e5da2SSatish Balay Average synchronous device kernel launch time = 6.24 microseconds 18*896e5da2SSatish Balay 19*896e5da2SSatish Balay Perlmutter@NERSC 20*896e5da2SSatish Balay $ srun -n 1 --gpus-per-task=1 ./ex1k 21*896e5da2SSatish Balay Average asynchronous device kernel launch time = 2.31 microseconds 22*896e5da2SSatish Balay Average synchronous device kernel launch time = 7.13 microseconds 23*896e5da2SSatish Balay */ 24*896e5da2SSatish Balay 25*896e5da2SSatish Balay #include <petscsys.h> 26*896e5da2SSatish Balay #include <petsc_kokkos.hpp> 27*896e5da2SSatish Balay 28*896e5da2SSatish Balay int main(int argc, char **argv) 29*896e5da2SSatish Balay { 30*896e5da2SSatish Balay PetscInt i, n = 100000, N = 256; 31*896e5da2SSatish Balay PetscLogDouble tstart, tend, time; 32*896e5da2SSatish Balay 33*896e5da2SSatish Balay PetscFunctionBeginUser; 34*896e5da2SSatish Balay PetscCall(PetscInitialize(&argc, &argv, nullptr, help)); 35*896e5da2SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL)); 36*896e5da2SSatish Balay PetscCall(PetscKokkosInitializeCheck()); 37*896e5da2SSatish Balay { 38*896e5da2SSatish Balay Kokkos::DefaultExecutionSpace exec = PetscGetKokkosExecutionSpace(); 39*896e5da2SSatish Balay Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace> policy(exec, 0, N); 40*896e5da2SSatish Balay 41*896e5da2SSatish Balay PetscCallCXX(exec.fence()); // Initialize device runtime to get more accurate timing below 42*896e5da2SSatish Balay // Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one 43*896e5da2SSatish Balay PetscCall(PetscTime(&tstart)); 44*896e5da2SSatish Balay for (i = 0; i < n; i++) PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){})); 45*896e5da2SSatish Balay PetscCall(PetscTime(&tend)); 46*896e5da2SSatish Balay PetscCallCXX(exec.fence()); 47*896e5da2SSatish Balay time = (tend - tstart) * 1e6 / n; 48*896e5da2SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average asynchronous device kernel launch time = %.2f microseconds\n", time)); 49*896e5da2SSatish Balay 50*896e5da2SSatish Balay // Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed 51*896e5da2SSatish Balay PetscCall(PetscTime(&tstart)); 52*896e5da2SSatish Balay for (i = 0; i < n; i++) { 53*896e5da2SSatish Balay PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){})); 54*896e5da2SSatish Balay PetscCallCXX(exec.fence()); 55*896e5da2SSatish Balay } 56*896e5da2SSatish Balay PetscCall(PetscTime(&tend)); 57*896e5da2SSatish Balay time = (tend - tstart) * 1e6 / n; 58*896e5da2SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average synchronous device kernel launch time = %.2f microseconds\n", time)); 59*896e5da2SSatish Balay } 60*896e5da2SSatish Balay 61*896e5da2SSatish Balay PetscCall(PetscFinalize()); 62*896e5da2SSatish Balay return 0; 63*896e5da2SSatish Balay } 64*896e5da2SSatish Balay 65*896e5da2SSatish Balay /*TEST 66*896e5da2SSatish Balay test: 67*896e5da2SSatish Balay requires: kokkos 68*896e5da2SSatish Balay args: -n 2 69*896e5da2SSatish Balay output_file: output/empty.out 70*896e5da2SSatish Balay filter: grep "DOES_NOT_EXIST" 71*896e5da2SSatish Balay 72*896e5da2SSatish Balay TEST*/ 73