1*a90d8e38SSatish Balay static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n"; 2*a90d8e38SSatish Balay 3*a90d8e38SSatish Balay /* 4*a90d8e38SSatish Balay This is a simple test to measure the latency of MPI communication. 5*a90d8e38SSatish Balay The test is run with two processes. The first process sends a message 6*a90d8e38SSatish Balay to the second process, and after having received the message, the second 7*a90d8e38SSatish Balay process sends a message back to the first process once. The is repeated 8*a90d8e38SSatish Balay a number of times. The latency is defined as half time of the round-trip. 9*a90d8e38SSatish Balay 10*a90d8e38SSatish Balay It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/). 11*a90d8e38SSatish Balay 12*a90d8e38SSatish Balay Usage: mpirun -n 2 ./ex4k -mtype <type> 13*a90d8e38SSatish Balay Other arguments have a default value that is also used in osu_latency. 14*a90d8e38SSatish Balay 15*a90d8e38SSatish Balay Examples: 16*a90d8e38SSatish Balay 17*a90d8e38SSatish Balay On Summit at OLCF: 18*a90d8e38SSatish Balay jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex4k -mtype kokkos 19*a90d8e38SSatish Balay 20*a90d8e38SSatish Balay On Crusher at OLCF: 21*a90d8e38SSatish Balay srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex4k -mtype kokkos 22*a90d8e38SSatish Balay */ 23*a90d8e38SSatish Balay #include <petscsf.h> 24*a90d8e38SSatish Balay #include <Kokkos_Core.hpp> 25*a90d8e38SSatish Balay 26*a90d8e38SSatish Balay /* Same values as OSU microbenchmarks */ 27*a90d8e38SSatish Balay #define LAT_LOOP_SMALL 10000 28*a90d8e38SSatish Balay #define LAT_SKIP_SMALL 100 29*a90d8e38SSatish Balay #define LAT_LOOP_LARGE 1000 30*a90d8e38SSatish Balay #define LAT_SKIP_LARGE 10 31*a90d8e38SSatish Balay #define LARGE_MESSAGE_SIZE 8192 32*a90d8e38SSatish Balay 33*a90d8e38SSatish Balay int main(int argc, char **argv) 34*a90d8e38SSatish Balay { 35*a90d8e38SSatish Balay PetscSF sf[64]; 36*a90d8e38SSatish Balay PetscLogDouble t_start = 0, t_end = 0, time[64]; 37*a90d8e38SSatish Balay PetscInt i, j, n, nroots, nleaves, niter = 100, nskip = 10; 38*a90d8e38SSatish Balay PetscInt maxn = 512 * 1024; /* max 4M bytes messages */ 39*a90d8e38SSatish Balay PetscSFNode *iremote; 40*a90d8e38SSatish Balay PetscMPIInt rank, size; 41*a90d8e38SSatish Balay PetscScalar *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf; 42*a90d8e38SSatish Balay size_t msgsize; 43*a90d8e38SSatish Balay PetscMemType mtype = PETSC_MEMTYPE_HOST; 44*a90d8e38SSatish Balay char mstring[16] = {0}; 45*a90d8e38SSatish Balay PetscBool set; 46*a90d8e38SSatish Balay PetscInt skipSmall = -1, loopSmall = -1; 47*a90d8e38SSatish Balay MPI_Op op = MPI_REPLACE; 48*a90d8e38SSatish Balay 49*a90d8e38SSatish Balay PetscFunctionBeginUser; 50*a90d8e38SSatish Balay Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc 51*a90d8e38SSatish Balay PetscCall(PetscInitialize(&argc, &argv, NULL, help)); 52*a90d8e38SSatish Balay PetscCall(PetscKokkosInitializeCheck()); 53*a90d8e38SSatish Balay 54*a90d8e38SSatish Balay PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size)); 55*a90d8e38SSatish Balay PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank)); 56*a90d8e38SSatish Balay PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes"); 57*a90d8e38SSatish Balay 58*a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */ 59*a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL)); 60*a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL)); 61*a90d8e38SSatish Balay 62*a90d8e38SSatish Balay PetscCall(PetscMalloc1(maxn, &iremote)); 63*a90d8e38SSatish Balay PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set)); 64*a90d8e38SSatish Balay if (set) { 65*a90d8e38SSatish Balay PetscBool isHost, isKokkos; 66*a90d8e38SSatish Balay PetscCall(PetscStrcasecmp(mstring, "host", &isHost)); 67*a90d8e38SSatish Balay PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos)); 68*a90d8e38SSatish Balay if (isHost) mtype = PETSC_MEMTYPE_HOST; 69*a90d8e38SSatish Balay else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS; 70*a90d8e38SSatish Balay else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring); 71*a90d8e38SSatish Balay } 72*a90d8e38SSatish Balay 73*a90d8e38SSatish Balay if (mtype == PETSC_MEMTYPE_HOST) { 74*a90d8e38SSatish Balay PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata)); 75*a90d8e38SSatish Balay } else { 76*a90d8e38SSatish Balay PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn)); 77*a90d8e38SSatish Balay PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn)); 78*a90d8e38SSatish Balay } 79*a90d8e38SSatish Balay PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf)); 80*a90d8e38SSatish Balay for (i = 0; i < maxn; i++) { 81*a90d8e38SSatish Balay pbuf[i] = 123.0; 82*a90d8e38SSatish Balay ebuf[i] = 456.0; 83*a90d8e38SSatish Balay } 84*a90d8e38SSatish Balay 85*a90d8e38SSatish Balay for (n = 1, i = 0; n <= maxn; n *= 2, i++) { 86*a90d8e38SSatish Balay PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i])); 87*a90d8e38SSatish Balay PetscCall(PetscSFSetFromOptions(sf[i])); 88*a90d8e38SSatish Balay if (rank == 0) { 89*a90d8e38SSatish Balay nroots = n; 90*a90d8e38SSatish Balay nleaves = 0; 91*a90d8e38SSatish Balay } else { 92*a90d8e38SSatish Balay nroots = 0; 93*a90d8e38SSatish Balay nleaves = n; 94*a90d8e38SSatish Balay for (j = 0; j < nleaves; j++) { 95*a90d8e38SSatish Balay iremote[j].rank = 0; 96*a90d8e38SSatish Balay iremote[j].index = j; 97*a90d8e38SSatish Balay } 98*a90d8e38SSatish Balay } 99*a90d8e38SSatish Balay PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES)); 100*a90d8e38SSatish Balay PetscCall(PetscSFSetUp(sf[i])); 101*a90d8e38SSatish Balay } 102*a90d8e38SSatish Balay 103*a90d8e38SSatish Balay if (loopSmall > 0) { 104*a90d8e38SSatish Balay nskip = skipSmall; 105*a90d8e38SSatish Balay niter = loopSmall; 106*a90d8e38SSatish Balay } else { 107*a90d8e38SSatish Balay nskip = LAT_SKIP_SMALL; 108*a90d8e38SSatish Balay niter = LAT_LOOP_SMALL; 109*a90d8e38SSatish Balay } 110*a90d8e38SSatish Balay 111*a90d8e38SSatish Balay for (n = 1, j = 0; n <= maxn; n *= 2, j++) { 112*a90d8e38SSatish Balay msgsize = sizeof(PetscScalar) * n; 113*a90d8e38SSatish Balay if (mtype == PETSC_MEMTYPE_HOST) { 114*a90d8e38SSatish Balay PetscCall(PetscArraycpy(rootdata, pbuf, n)); 115*a90d8e38SSatish Balay PetscCall(PetscArraycpy(leafdata, ebuf, n)); 116*a90d8e38SSatish Balay } else { 117*a90d8e38SSatish Balay Kokkos::View<PetscScalar *> dst1((PetscScalar *)rootdata, n); 118*a90d8e38SSatish Balay Kokkos::View<PetscScalar *> dst2((PetscScalar *)leafdata, n); 119*a90d8e38SSatish Balay Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n); 120*a90d8e38SSatish Balay Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n); 121*a90d8e38SSatish Balay PetscCallCXX(Kokkos::deep_copy(dst1, src1)); 122*a90d8e38SSatish Balay PetscCallCXX(Kokkos::deep_copy(dst2, src2)); 123*a90d8e38SSatish Balay } 124*a90d8e38SSatish Balay 125*a90d8e38SSatish Balay if (msgsize > LARGE_MESSAGE_SIZE) { 126*a90d8e38SSatish Balay nskip = LAT_SKIP_LARGE; 127*a90d8e38SSatish Balay niter = LAT_LOOP_LARGE; 128*a90d8e38SSatish Balay } 129*a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 130*a90d8e38SSatish Balay 131*a90d8e38SSatish Balay for (i = 0; i < niter + nskip; i++) { 132*a90d8e38SSatish Balay if (i == nskip) { 133*a90d8e38SSatish Balay PetscCallCXX(Kokkos::fence()); 134*a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 135*a90d8e38SSatish Balay t_start = MPI_Wtime(); 136*a90d8e38SSatish Balay } 137*a90d8e38SSatish Balay PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op)); 138*a90d8e38SSatish Balay PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op)); 139*a90d8e38SSatish Balay PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op)); 140*a90d8e38SSatish Balay PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op)); 141*a90d8e38SSatish Balay } 142*a90d8e38SSatish Balay PetscCallCXX(Kokkos::fence()); 143*a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 144*a90d8e38SSatish Balay t_end = MPI_Wtime(); 145*a90d8e38SSatish Balay time[j] = (t_end - t_start) * 1e6 / (niter * 2); 146*a90d8e38SSatish Balay } 147*a90d8e38SSatish Balay 148*a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device")); 149*a90d8e38SSatish Balay for (n = 1, j = 0; n <= maxn; n *= 2, j++) { 150*a90d8e38SSatish Balay PetscCall(PetscSFDestroy(&sf[j])); 151*a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j])); 152*a90d8e38SSatish Balay } 153*a90d8e38SSatish Balay PetscCall(PetscFree2(pbuf, ebuf)); 154*a90d8e38SSatish Balay if (mtype == PETSC_MEMTYPE_HOST) { 155*a90d8e38SSatish Balay PetscCall(PetscFree2(rootdata, leafdata)); 156*a90d8e38SSatish Balay } else { 157*a90d8e38SSatish Balay PetscCallCXX(Kokkos::kokkos_free(rootdata)); 158*a90d8e38SSatish Balay PetscCallCXX(Kokkos::kokkos_free(leafdata)); 159*a90d8e38SSatish Balay } 160*a90d8e38SSatish Balay PetscCall(PetscFree(iremote)); 161*a90d8e38SSatish Balay PetscCall(PetscFinalize()); 162*a90d8e38SSatish Balay Kokkos::finalize(); 163*a90d8e38SSatish Balay return 0; 164*a90d8e38SSatish Balay } 165*a90d8e38SSatish Balay 166*a90d8e38SSatish Balay /*TEST 167*a90d8e38SSatish Balay testset: 168*a90d8e38SSatish Balay requires: kokkos 169*a90d8e38SSatish Balay # use small numbers to make the test cheap 170*a90d8e38SSatish Balay args: -maxn 4 -skipSmall 1 -loopSmall 1 171*a90d8e38SSatish Balay filter: grep "DOES_NOT_EXIST" 172*a90d8e38SSatish Balay output_file: output/empty.out 173*a90d8e38SSatish Balay nsize: 2 174*a90d8e38SSatish Balay 175*a90d8e38SSatish Balay test: 176*a90d8e38SSatish Balay args: -mtype {{host kokkos}} 177*a90d8e38SSatish Balay 178*a90d8e38SSatish Balay test: 179*a90d8e38SSatish Balay requires: cuda mpi_gpu_aware mpix_stream 180*a90d8e38SSatish Balay suffix: mpix 181*a90d8e38SSatish Balay # MPICH doesn't reserve VCI, and per MPICH developers only 1 VCI is needed for GPU 182*a90d8e38SSatish Balay env: MPIR_CVAR_CH4_RESERVE_VCIS=1 183*a90d8e38SSatish Balay args: -mtype kokkos -sf_use_stream_aware_mpi 1 184*a90d8e38SSatish Balay 185*a90d8e38SSatish Balay TEST*/ 186