xref: /petsc/src/vec/is/sf/tests/ex4k.kokkos.cxx (revision a90d8e383a2827d476809587898a1fbbc9581506)
/* Help text handed to PetscInitialize() in main(). */
1*a90d8e38SSatish Balay static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";
2*a90d8e38SSatish Balay 
3*a90d8e38SSatish Balay /*
4*a90d8e38SSatish Balay   This is a simple test to measure the latency of MPI communication.
5*a90d8e38SSatish Balay   The test is run with two processes.  The first process sends a message
6*a90d8e38SSatish Balay   to the second process, and after having received the message, the second
7*a90d8e38SSatish Balay   process sends a message back to the first process once.  This is repeated
8*a90d8e38SSatish Balay   a number of times.  The latency is defined as half of the round-trip time.
9*a90d8e38SSatish Balay 
10*a90d8e38SSatish Balay   It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).
11*a90d8e38SSatish Balay 
12*a90d8e38SSatish Balay   Usage: mpirun -n 2 ./ex4k -mtype <type>
13*a90d8e38SSatish Balay   Other arguments have a default value that is also used in osu_latency.
14*a90d8e38SSatish Balay 
15*a90d8e38SSatish Balay   Examples:
16*a90d8e38SSatish Balay 
17*a90d8e38SSatish Balay   On Summit at OLCF:
18*a90d8e38SSatish Balay     jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex4k  -mtype kokkos
19*a90d8e38SSatish Balay 
20*a90d8e38SSatish Balay   On Crusher at OLCF:
21*a90d8e38SSatish Balay     srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex4k -mtype kokkos
22*a90d8e38SSatish Balay */
23*a90d8e38SSatish Balay #include <petscsf.h>
24*a90d8e38SSatish Balay #include <Kokkos_Core.hpp>
25*a90d8e38SSatish Balay 
26*a90d8e38SSatish Balay /* Same values as OSU microbenchmarks */
/* Timed round trips per message size while messages are small (used when -loopSmall is not given) */
27*a90d8e38SSatish Balay #define LAT_LOOP_SMALL     10000
/* Untimed warm-up round trips per message size while messages are small */
28*a90d8e38SSatish Balay #define LAT_SKIP_SMALL     100
/* Timed round trips per message size once the message exceeds LARGE_MESSAGE_SIZE */
29*a90d8e38SSatish Balay #define LAT_LOOP_LARGE     1000
/* Untimed warm-up round trips per message size once the message exceeds LARGE_MESSAGE_SIZE */
30*a90d8e38SSatish Balay #define LAT_SKIP_LARGE     10
/* Threshold in bytes; main() compares sizeof(PetscScalar) * n against it */
31*a90d8e38SSatish Balay #define LARGE_MESSAGE_SIZE 8192
32*a90d8e38SSatish Balay 
33*a90d8e38SSatish Balay int main(int argc, char **argv)
34*a90d8e38SSatish Balay {
35*a90d8e38SSatish Balay   PetscSF        sf[64];
36*a90d8e38SSatish Balay   PetscLogDouble t_start = 0, t_end = 0, time[64];
37*a90d8e38SSatish Balay   PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
38*a90d8e38SSatish Balay   PetscInt       maxn = 512 * 1024; /* max 4M bytes messages */
39*a90d8e38SSatish Balay   PetscSFNode   *iremote;
40*a90d8e38SSatish Balay   PetscMPIInt    rank, size;
41*a90d8e38SSatish Balay   PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
42*a90d8e38SSatish Balay   size_t         msgsize;
43*a90d8e38SSatish Balay   PetscMemType   mtype       = PETSC_MEMTYPE_HOST;
44*a90d8e38SSatish Balay   char           mstring[16] = {0};
45*a90d8e38SSatish Balay   PetscBool      set;
46*a90d8e38SSatish Balay   PetscInt       skipSmall = -1, loopSmall = -1;
47*a90d8e38SSatish Balay   MPI_Op         op = MPI_REPLACE;
48*a90d8e38SSatish Balay 
49*a90d8e38SSatish Balay   PetscFunctionBeginUser;
50*a90d8e38SSatish Balay   Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc
51*a90d8e38SSatish Balay   PetscCall(PetscInitialize(&argc, &argv, NULL, help));
52*a90d8e38SSatish Balay   PetscCall(PetscKokkosInitializeCheck());
53*a90d8e38SSatish Balay 
54*a90d8e38SSatish Balay   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
55*a90d8e38SSatish Balay   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
56*a90d8e38SSatish Balay   PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");
57*a90d8e38SSatish Balay 
58*a90d8e38SSatish Balay   PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
59*a90d8e38SSatish Balay   PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
60*a90d8e38SSatish Balay   PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));
61*a90d8e38SSatish Balay 
62*a90d8e38SSatish Balay   PetscCall(PetscMalloc1(maxn, &iremote));
63*a90d8e38SSatish Balay   PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
64*a90d8e38SSatish Balay   if (set) {
65*a90d8e38SSatish Balay     PetscBool isHost, isKokkos;
66*a90d8e38SSatish Balay     PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
67*a90d8e38SSatish Balay     PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));
68*a90d8e38SSatish Balay     if (isHost) mtype = PETSC_MEMTYPE_HOST;
69*a90d8e38SSatish Balay     else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS;
70*a90d8e38SSatish Balay     else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
71*a90d8e38SSatish Balay   }
72*a90d8e38SSatish Balay 
73*a90d8e38SSatish Balay   if (mtype == PETSC_MEMTYPE_HOST) {
74*a90d8e38SSatish Balay     PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata));
75*a90d8e38SSatish Balay   } else {
76*a90d8e38SSatish Balay     PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
77*a90d8e38SSatish Balay     PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
78*a90d8e38SSatish Balay   }
79*a90d8e38SSatish Balay   PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
80*a90d8e38SSatish Balay   for (i = 0; i < maxn; i++) {
81*a90d8e38SSatish Balay     pbuf[i] = 123.0;
82*a90d8e38SSatish Balay     ebuf[i] = 456.0;
83*a90d8e38SSatish Balay   }
84*a90d8e38SSatish Balay 
85*a90d8e38SSatish Balay   for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
86*a90d8e38SSatish Balay     PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
87*a90d8e38SSatish Balay     PetscCall(PetscSFSetFromOptions(sf[i]));
88*a90d8e38SSatish Balay     if (rank == 0) {
89*a90d8e38SSatish Balay       nroots  = n;
90*a90d8e38SSatish Balay       nleaves = 0;
91*a90d8e38SSatish Balay     } else {
92*a90d8e38SSatish Balay       nroots  = 0;
93*a90d8e38SSatish Balay       nleaves = n;
94*a90d8e38SSatish Balay       for (j = 0; j < nleaves; j++) {
95*a90d8e38SSatish Balay         iremote[j].rank  = 0;
96*a90d8e38SSatish Balay         iremote[j].index = j;
97*a90d8e38SSatish Balay       }
98*a90d8e38SSatish Balay     }
99*a90d8e38SSatish Balay     PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
100*a90d8e38SSatish Balay     PetscCall(PetscSFSetUp(sf[i]));
101*a90d8e38SSatish Balay   }
102*a90d8e38SSatish Balay 
103*a90d8e38SSatish Balay   if (loopSmall > 0) {
104*a90d8e38SSatish Balay     nskip = skipSmall;
105*a90d8e38SSatish Balay     niter = loopSmall;
106*a90d8e38SSatish Balay   } else {
107*a90d8e38SSatish Balay     nskip = LAT_SKIP_SMALL;
108*a90d8e38SSatish Balay     niter = LAT_LOOP_SMALL;
109*a90d8e38SSatish Balay   }
110*a90d8e38SSatish Balay 
111*a90d8e38SSatish Balay   for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
112*a90d8e38SSatish Balay     msgsize = sizeof(PetscScalar) * n;
113*a90d8e38SSatish Balay     if (mtype == PETSC_MEMTYPE_HOST) {
114*a90d8e38SSatish Balay       PetscCall(PetscArraycpy(rootdata, pbuf, n));
115*a90d8e38SSatish Balay       PetscCall(PetscArraycpy(leafdata, ebuf, n));
116*a90d8e38SSatish Balay     } else {
117*a90d8e38SSatish Balay       Kokkos::View<PetscScalar *>                          dst1((PetscScalar *)rootdata, n);
118*a90d8e38SSatish Balay       Kokkos::View<PetscScalar *>                          dst2((PetscScalar *)leafdata, n);
119*a90d8e38SSatish Balay       Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n);
120*a90d8e38SSatish Balay       Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n);
121*a90d8e38SSatish Balay       PetscCallCXX(Kokkos::deep_copy(dst1, src1));
122*a90d8e38SSatish Balay       PetscCallCXX(Kokkos::deep_copy(dst2, src2));
123*a90d8e38SSatish Balay     }
124*a90d8e38SSatish Balay 
125*a90d8e38SSatish Balay     if (msgsize > LARGE_MESSAGE_SIZE) {
126*a90d8e38SSatish Balay       nskip = LAT_SKIP_LARGE;
127*a90d8e38SSatish Balay       niter = LAT_LOOP_LARGE;
128*a90d8e38SSatish Balay     }
129*a90d8e38SSatish Balay     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
130*a90d8e38SSatish Balay 
131*a90d8e38SSatish Balay     for (i = 0; i < niter + nskip; i++) {
132*a90d8e38SSatish Balay       if (i == nskip) {
133*a90d8e38SSatish Balay         PetscCallCXX(Kokkos::fence());
134*a90d8e38SSatish Balay         PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
135*a90d8e38SSatish Balay         t_start = MPI_Wtime();
136*a90d8e38SSatish Balay       }
137*a90d8e38SSatish Balay       PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
138*a90d8e38SSatish Balay       PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
139*a90d8e38SSatish Balay       PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
140*a90d8e38SSatish Balay       PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
141*a90d8e38SSatish Balay     }
142*a90d8e38SSatish Balay     PetscCallCXX(Kokkos::fence());
143*a90d8e38SSatish Balay     PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
144*a90d8e38SSatish Balay     t_end   = MPI_Wtime();
145*a90d8e38SSatish Balay     time[j] = (t_end - t_start) * 1e6 / (niter * 2);
146*a90d8e38SSatish Balay   }
147*a90d8e38SSatish Balay 
148*a90d8e38SSatish Balay   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
149*a90d8e38SSatish Balay   for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
150*a90d8e38SSatish Balay     PetscCall(PetscSFDestroy(&sf[j]));
151*a90d8e38SSatish Balay     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
152*a90d8e38SSatish Balay   }
153*a90d8e38SSatish Balay   PetscCall(PetscFree2(pbuf, ebuf));
154*a90d8e38SSatish Balay   if (mtype == PETSC_MEMTYPE_HOST) {
155*a90d8e38SSatish Balay     PetscCall(PetscFree2(rootdata, leafdata));
156*a90d8e38SSatish Balay   } else {
157*a90d8e38SSatish Balay     PetscCallCXX(Kokkos::kokkos_free(rootdata));
158*a90d8e38SSatish Balay     PetscCallCXX(Kokkos::kokkos_free(leafdata));
159*a90d8e38SSatish Balay   }
160*a90d8e38SSatish Balay   PetscCall(PetscFree(iremote));
161*a90d8e38SSatish Balay   PetscCall(PetscFinalize());
162*a90d8e38SSatish Balay   Kokkos::finalize();
163*a90d8e38SSatish Balay   return 0;
164*a90d8e38SSatish Balay }
165*a90d8e38SSatish Balay 
166*a90d8e38SSatish Balay /*TEST
167*a90d8e38SSatish Balay   testset:
168*a90d8e38SSatish Balay     requires: kokkos
169*a90d8e38SSatish Balay     # use small numbers to make the test cheap
170*a90d8e38SSatish Balay     args: -maxn 4 -skipSmall 1 -loopSmall 1
171*a90d8e38SSatish Balay     filter: grep "DOES_NOT_EXIST"
172*a90d8e38SSatish Balay     output_file: output/empty.out
173*a90d8e38SSatish Balay     nsize: 2
174*a90d8e38SSatish Balay 
175*a90d8e38SSatish Balay     test:
176*a90d8e38SSatish Balay       args: -mtype {{host kokkos}}
177*a90d8e38SSatish Balay 
178*a90d8e38SSatish Balay     test:
179*a90d8e38SSatish Balay       requires: cuda mpi_gpu_aware mpix_stream
180*a90d8e38SSatish Balay       suffix: mpix
181*a90d8e38SSatish Balay       # MPICH doesn't reserve VCI, and per MPICH developers only 1 VCI is needed for GPU
182*a90d8e38SSatish Balay       env: MPIR_CVAR_CH4_RESERVE_VCIS=1
183*a90d8e38SSatish Balay       args: -mtype kokkos -sf_use_stream_aware_mpi 1
184*a90d8e38SSatish Balay 
185*a90d8e38SSatish Balay TEST*/
186