1a90d8e38SSatish Balay static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n"; 2a90d8e38SSatish Balay 3a90d8e38SSatish Balay /* 4a90d8e38SSatish Balay This is a simple test to measure the latency of MPI communication. 5a90d8e38SSatish Balay The test is run with two processes. The first process sends a message 6a90d8e38SSatish Balay to the second process, and after having received the message, the second 7a90d8e38SSatish Balay process sends a message back to the first process once. The is repeated 8a90d8e38SSatish Balay a number of times. The latency is defined as half time of the round-trip. 9a90d8e38SSatish Balay 10a90d8e38SSatish Balay It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/). 11a90d8e38SSatish Balay 12*a8cf87e0SJunchao Zhang Usage: mpiexec -n 2 ./ex4k -mtype <type> 13a90d8e38SSatish Balay Other arguments have a default value that is also used in osu_latency. 14a90d8e38SSatish Balay 15a90d8e38SSatish Balay Examples: 16a90d8e38SSatish Balay 17a90d8e38SSatish Balay On Summit at OLCF: 18a90d8e38SSatish Balay jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex4k -mtype kokkos 19a90d8e38SSatish Balay 20a90d8e38SSatish Balay On Crusher at OLCF: 21a90d8e38SSatish Balay srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex4k -mtype kokkos 22a90d8e38SSatish Balay */ 23a90d8e38SSatish Balay #include <petscsf.h> 24a90d8e38SSatish Balay #include <Kokkos_Core.hpp> 25a90d8e38SSatish Balay 26a90d8e38SSatish Balay /* Same values as OSU microbenchmarks */ 27a90d8e38SSatish Balay #define LAT_LOOP_SMALL 10000 28a90d8e38SSatish Balay #define LAT_SKIP_SMALL 100 29a90d8e38SSatish Balay #define LAT_LOOP_LARGE 1000 30a90d8e38SSatish Balay #define LAT_SKIP_LARGE 10 31a90d8e38SSatish Balay #define LARGE_MESSAGE_SIZE 8192 32a90d8e38SSatish Balay 33a90d8e38SSatish Balay int main(int argc, char **argv) 34a90d8e38SSatish Balay { 35a90d8e38SSatish Balay PetscSF sf[64]; 36a90d8e38SSatish Balay PetscLogDouble t_start = 0, t_end = 0, time[64]; 37a90d8e38SSatish Balay PetscInt i, j, n, nroots, nleaves, niter = 100, nskip = 10; 38a90d8e38SSatish Balay PetscInt maxn = 512 * 1024; /* max 4M bytes messages */ 39a90d8e38SSatish Balay PetscSFNode *iremote; 40a90d8e38SSatish Balay PetscMPIInt rank, size; 41a90d8e38SSatish Balay PetscScalar *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf; 42a90d8e38SSatish Balay size_t msgsize; 43a90d8e38SSatish Balay PetscMemType mtype = PETSC_MEMTYPE_HOST; 44a90d8e38SSatish Balay char mstring[16] = {0}; 45a90d8e38SSatish Balay PetscBool set; 46a90d8e38SSatish Balay PetscInt skipSmall = -1, loopSmall = -1; 47a90d8e38SSatish Balay MPI_Op op = MPI_REPLACE; 48a90d8e38SSatish Balay 49a90d8e38SSatish Balay PetscFunctionBeginUser; 50a90d8e38SSatish Balay Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc 51a90d8e38SSatish Balay PetscCall(PetscInitialize(&argc, &argv, NULL, help)); 52a90d8e38SSatish Balay PetscCall(PetscKokkosInitializeCheck()); 53a90d8e38SSatish Balay 54a90d8e38SSatish Balay PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size)); 55a90d8e38SSatish Balay PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank)); 56a90d8e38SSatish Balay PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes"); 57a90d8e38SSatish Balay 58a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */ 59a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL)); 60a90d8e38SSatish Balay PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL)); 61a90d8e38SSatish Balay 62a90d8e38SSatish Balay PetscCall(PetscMalloc1(maxn, &iremote)); 63a90d8e38SSatish Balay PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set)); 64a90d8e38SSatish Balay if (set) { 65a90d8e38SSatish Balay PetscBool isHost, isKokkos; 66a90d8e38SSatish Balay PetscCall(PetscStrcasecmp(mstring, "host", &isHost)); 67a90d8e38SSatish Balay PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos)); 68a90d8e38SSatish Balay if (isHost) mtype = PETSC_MEMTYPE_HOST; 69a90d8e38SSatish Balay else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS; 70a90d8e38SSatish Balay else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring); 71a90d8e38SSatish Balay } 72a90d8e38SSatish Balay 73a90d8e38SSatish Balay if (mtype == PETSC_MEMTYPE_HOST) { 74a90d8e38SSatish Balay PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata)); 75a90d8e38SSatish Balay } else { 76a90d8e38SSatish Balay PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn)); 77a90d8e38SSatish Balay PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn)); 78a90d8e38SSatish Balay } 79a90d8e38SSatish Balay PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf)); 80a90d8e38SSatish Balay for (i = 0; i < maxn; i++) { 81a90d8e38SSatish Balay pbuf[i] = 123.0; 82a90d8e38SSatish Balay ebuf[i] = 456.0; 83a90d8e38SSatish Balay } 84a90d8e38SSatish Balay 85a90d8e38SSatish Balay for (n = 1, i = 0; n <= maxn; n *= 2, i++) { 86a90d8e38SSatish Balay PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i])); 87a90d8e38SSatish Balay PetscCall(PetscSFSetFromOptions(sf[i])); 88a90d8e38SSatish Balay if (rank == 0) { 89a90d8e38SSatish Balay nroots = n; 90a90d8e38SSatish Balay nleaves = 0; 91a90d8e38SSatish Balay } else { 92a90d8e38SSatish Balay nroots = 0; 93a90d8e38SSatish Balay nleaves = n; 94a90d8e38SSatish Balay for (j = 0; j < nleaves; j++) { 95a90d8e38SSatish Balay iremote[j].rank = 0; 96a90d8e38SSatish Balay iremote[j].index = j; 97a90d8e38SSatish Balay } 98a90d8e38SSatish Balay } 99a90d8e38SSatish Balay PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES)); 100a90d8e38SSatish Balay PetscCall(PetscSFSetUp(sf[i])); 101a90d8e38SSatish Balay } 102a90d8e38SSatish Balay 103a90d8e38SSatish Balay if (loopSmall > 0) { 104a90d8e38SSatish Balay nskip = skipSmall; 105a90d8e38SSatish Balay niter = loopSmall; 106a90d8e38SSatish Balay } else { 107a90d8e38SSatish Balay nskip = LAT_SKIP_SMALL; 108a90d8e38SSatish Balay niter = LAT_LOOP_SMALL; 109a90d8e38SSatish Balay } 110a90d8e38SSatish Balay 111a90d8e38SSatish Balay for (n = 1, j = 0; n <= maxn; n *= 2, j++) { 112a90d8e38SSatish Balay msgsize = sizeof(PetscScalar) * n; 113a90d8e38SSatish Balay if (mtype == PETSC_MEMTYPE_HOST) { 114a90d8e38SSatish Balay PetscCall(PetscArraycpy(rootdata, pbuf, n)); 115a90d8e38SSatish Balay PetscCall(PetscArraycpy(leafdata, ebuf, n)); 116a90d8e38SSatish Balay } else { 117a90d8e38SSatish Balay Kokkos::View<PetscScalar *> dst1((PetscScalar *)rootdata, n); 118a90d8e38SSatish Balay Kokkos::View<PetscScalar *> dst2((PetscScalar *)leafdata, n); 119a90d8e38SSatish Balay Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n); 120a90d8e38SSatish Balay Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n); 121a90d8e38SSatish Balay PetscCallCXX(Kokkos::deep_copy(dst1, src1)); 122a90d8e38SSatish Balay PetscCallCXX(Kokkos::deep_copy(dst2, src2)); 123a90d8e38SSatish Balay } 124a90d8e38SSatish Balay 125a90d8e38SSatish Balay if (msgsize > LARGE_MESSAGE_SIZE) { 126a90d8e38SSatish Balay nskip = LAT_SKIP_LARGE; 127a90d8e38SSatish Balay niter = LAT_LOOP_LARGE; 128a90d8e38SSatish Balay } 129a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 130a90d8e38SSatish Balay 131a90d8e38SSatish Balay for (i = 0; i < niter + nskip; i++) { 132a90d8e38SSatish Balay if (i == nskip) { 133a90d8e38SSatish Balay PetscCallCXX(Kokkos::fence()); 134a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 135a90d8e38SSatish Balay t_start = MPI_Wtime(); 136a90d8e38SSatish Balay } 137a90d8e38SSatish Balay PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op)); 138a90d8e38SSatish Balay PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op)); 139a90d8e38SSatish Balay PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op)); 140a90d8e38SSatish Balay PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op)); 141a90d8e38SSatish Balay } 142a90d8e38SSatish Balay PetscCallCXX(Kokkos::fence()); 143a90d8e38SSatish Balay PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 144a90d8e38SSatish Balay t_end = MPI_Wtime(); 145a90d8e38SSatish Balay time[j] = (t_end - t_start) * 1e6 / (niter * 2); 146a90d8e38SSatish Balay } 147a90d8e38SSatish Balay 148a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device")); 149a90d8e38SSatish Balay for (n = 1, j = 0; n <= maxn; n *= 2, j++) { 150a90d8e38SSatish Balay PetscCall(PetscSFDestroy(&sf[j])); 151a90d8e38SSatish Balay PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j])); 152a90d8e38SSatish Balay } 153a90d8e38SSatish Balay PetscCall(PetscFree2(pbuf, ebuf)); 154a90d8e38SSatish Balay if (mtype == PETSC_MEMTYPE_HOST) { 155a90d8e38SSatish Balay PetscCall(PetscFree2(rootdata, leafdata)); 156a90d8e38SSatish Balay } else { 157a90d8e38SSatish Balay PetscCallCXX(Kokkos::kokkos_free(rootdata)); 158a90d8e38SSatish Balay PetscCallCXX(Kokkos::kokkos_free(leafdata)); 159a90d8e38SSatish Balay } 160a90d8e38SSatish Balay PetscCall(PetscFree(iremote)); 161a90d8e38SSatish Balay PetscCall(PetscFinalize()); 162a90d8e38SSatish Balay Kokkos::finalize(); 163a90d8e38SSatish Balay return 0; 164a90d8e38SSatish Balay } 165a90d8e38SSatish Balay 166a90d8e38SSatish Balay /*TEST 167a90d8e38SSatish Balay testset: 168a90d8e38SSatish Balay requires: kokkos 169a90d8e38SSatish Balay # use small numbers to make the test cheap 170a90d8e38SSatish Balay args: -maxn 4 -skipSmall 1 -loopSmall 1 171a90d8e38SSatish Balay filter: grep "DOES_NOT_EXIST" 172a90d8e38SSatish Balay output_file: output/empty.out 173a90d8e38SSatish Balay nsize: 2 174a90d8e38SSatish Balay 175a90d8e38SSatish Balay test: 176a90d8e38SSatish Balay args: -mtype {{host kokkos}} 177a90d8e38SSatish Balay 178a90d8e38SSatish Balay test: 179a90d8e38SSatish Balay requires: cuda mpi_gpu_aware mpix_stream 180a90d8e38SSatish Balay suffix: mpix 181a90d8e38SSatish Balay # MPICH doesn't reserve VCI, and per MPICH developers only 1 VCI is needed for GPU 182a90d8e38SSatish Balay env: MPIR_CVAR_CH4_RESERVE_VCIS=1 183a90d8e38SSatish Balay args: -mtype kokkos -sf_use_stream_aware_mpi 1 184a90d8e38SSatish Balay 185a90d8e38SSatish Balay TEST*/ 186