171438e86SJunchao Zhang #include <petsc/private/cudavecimpl.h> 271438e86SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 371438e86SJunchao Zhang #include <mpi.h> 471438e86SJunchao Zhang #include <nvshmem.h> 571438e86SJunchao Zhang #include <nvshmemx.h> 671438e86SJunchao Zhang 79371c9d4SSatish Balay PetscErrorCode PetscNvshmemInitializeCheck(void) { 871438e86SJunchao Zhang PetscFunctionBegin; 971438e86SJunchao Zhang if (!PetscNvshmemInitialized) { /* Note NVSHMEM does not provide a routine to check whether it is initialized */ 1071438e86SJunchao Zhang nvshmemx_init_attr_t attr; 1171438e86SJunchao Zhang attr.mpi_comm = &PETSC_COMM_WORLD; 129566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 139566063dSJacob Faibussowitsch PetscCall(nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr)); 1471438e86SJunchao Zhang PetscNvshmemInitialized = PETSC_TRUE; 1571438e86SJunchao Zhang PetscBeganNvshmem = PETSC_TRUE; 1671438e86SJunchao Zhang } 1771438e86SJunchao Zhang PetscFunctionReturn(0); 1871438e86SJunchao Zhang } 1971438e86SJunchao Zhang 209371c9d4SSatish Balay PetscErrorCode PetscNvshmemMalloc(size_t size, void **ptr) { 2171438e86SJunchao Zhang PetscFunctionBegin; 229566063dSJacob Faibussowitsch PetscCall(PetscNvshmemInitializeCheck()); 2371438e86SJunchao Zhang *ptr = nvshmem_malloc(size); 2408401ef6SPierre Jolivet PetscCheck(*ptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "nvshmem_malloc() failed to allocate %zu bytes", size); 2571438e86SJunchao Zhang PetscFunctionReturn(0); 2671438e86SJunchao Zhang } 2771438e86SJunchao Zhang 289371c9d4SSatish Balay PetscErrorCode PetscNvshmemCalloc(size_t size, void **ptr) { 2971438e86SJunchao Zhang PetscFunctionBegin; 309566063dSJacob Faibussowitsch PetscCall(PetscNvshmemInitializeCheck()); 3171438e86SJunchao Zhang *ptr = nvshmem_calloc(size, 1); 3208401ef6SPierre Jolivet PetscCheck(*ptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "nvshmem_calloc() failed to allocate %zu bytes", size); 3371438e86SJunchao Zhang PetscFunctionReturn(0); 3471438e86SJunchao Zhang } 3571438e86SJunchao Zhang 369371c9d4SSatish Balay PetscErrorCode PetscNvshmemFree_Private(void *ptr) { 3771438e86SJunchao Zhang PetscFunctionBegin; 3871438e86SJunchao Zhang nvshmem_free(ptr); 3971438e86SJunchao Zhang PetscFunctionReturn(0); 4071438e86SJunchao Zhang } 4171438e86SJunchao Zhang 429371c9d4SSatish Balay PetscErrorCode PetscNvshmemFinalize(void) { 4371438e86SJunchao Zhang PetscFunctionBegin; 4471438e86SJunchao Zhang nvshmem_finalize(); 4571438e86SJunchao Zhang PetscFunctionReturn(0); 4671438e86SJunchao Zhang } 4771438e86SJunchao Zhang 4871438e86SJunchao Zhang /* Free nvshmem related fields in the SF */ 499371c9d4SSatish Balay PetscErrorCode PetscSFReset_Basic_NVSHMEM(PetscSF sf) { 5071438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 5171438e86SJunchao Zhang 5271438e86SJunchao Zhang PetscFunctionBegin; 539566063dSJacob Faibussowitsch PetscCall(PetscFree2(bas->leafsigdisp, bas->leafbufdisp)); 549566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_CUDA, bas->leafbufdisp_d)); 559566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_CUDA, bas->leafsigdisp_d)); 569566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_CUDA, bas->iranks_d)); 579566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_CUDA, bas->ioffset_d)); 5871438e86SJunchao Zhang 599566063dSJacob Faibussowitsch PetscCall(PetscFree2(sf->rootsigdisp, sf->rootbufdisp)); 609566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_CUDA, sf->rootbufdisp_d)); 619566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_CUDA, sf->rootsigdisp_d)); 629566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_CUDA, sf->ranks_d)); 639566063dSJacob Faibussowitsch PetscCall(PetscSFFree(sf, PETSC_MEMTYPE_CUDA, sf->roffset_d)); 6471438e86SJunchao Zhang PetscFunctionReturn(0); 6571438e86SJunchao Zhang } 6671438e86SJunchao Zhang 6771438e86SJunchao Zhang /* Set up NVSHMEM related fields for an SF of type SFBASIC (only after PetscSFSetup_Basic() already set up dependant fields */ 689371c9d4SSatish Balay static PetscErrorCode PetscSFSetUp_Basic_NVSHMEM(PetscSF sf) { 6971438e86SJunchao Zhang cudaError_t cerr; 7071438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 7171438e86SJunchao Zhang PetscInt i, nRemoteRootRanks, nRemoteLeafRanks; 7271438e86SJunchao Zhang PetscMPIInt tag; 7371438e86SJunchao Zhang MPI_Comm comm; 7471438e86SJunchao Zhang MPI_Request *rootreqs, *leafreqs; 7571438e86SJunchao Zhang PetscInt tmp, stmp[4], rtmp[4]; /* tmps for send/recv buffers */ 7671438e86SJunchao Zhang 7771438e86SJunchao Zhang PetscFunctionBegin; 789566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)sf, &comm)); 799566063dSJacob Faibussowitsch PetscCall(PetscObjectGetNewTag((PetscObject)sf, &tag)); 8071438e86SJunchao Zhang 8171438e86SJunchao Zhang nRemoteRootRanks = sf->nranks - sf->ndranks; 8271438e86SJunchao Zhang nRemoteLeafRanks = bas->niranks - bas->ndiranks; 8371438e86SJunchao Zhang sf->nRemoteRootRanks = nRemoteRootRanks; 8471438e86SJunchao Zhang bas->nRemoteLeafRanks = nRemoteLeafRanks; 8571438e86SJunchao Zhang 869566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nRemoteLeafRanks, &rootreqs, nRemoteRootRanks, &leafreqs)); 8771438e86SJunchao Zhang 8871438e86SJunchao Zhang stmp[0] = nRemoteRootRanks; 8971438e86SJunchao Zhang stmp[1] = sf->leafbuflen[PETSCSF_REMOTE]; 9071438e86SJunchao Zhang stmp[2] = nRemoteLeafRanks; 9171438e86SJunchao Zhang stmp[3] = bas->rootbuflen[PETSCSF_REMOTE]; 9271438e86SJunchao Zhang 931c2dc1cbSBarry Smith PetscCall(MPIU_Allreduce(stmp, rtmp, 4, MPIU_INT, MPI_MAX, comm)); 9471438e86SJunchao Zhang 9571438e86SJunchao Zhang sf->nRemoteRootRanksMax = rtmp[0]; 9671438e86SJunchao Zhang sf->leafbuflen_rmax = rtmp[1]; 9771438e86SJunchao Zhang bas->nRemoteLeafRanksMax = rtmp[2]; 9871438e86SJunchao Zhang bas->rootbuflen_rmax = rtmp[3]; 9971438e86SJunchao Zhang 10071438e86SJunchao Zhang /* Total four rounds of MPI communications to set up the nvshmem fields */ 10171438e86SJunchao Zhang 10271438e86SJunchao Zhang /* Root ranks to leaf ranks: send info about rootsigdisp[] and rootbufdisp[] */ 1039566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nRemoteRootRanks, &sf->rootsigdisp, nRemoteRootRanks, &sf->rootbufdisp)); 1049566063dSJacob Faibussowitsch for (i = 0; i < nRemoteRootRanks; i++) PetscCallMPI(MPI_Irecv(&sf->rootsigdisp[i], 1, MPIU_INT, sf->ranks[i + sf->ndranks], tag, comm, &leafreqs[i])); /* Leaves recv */ 1059566063dSJacob Faibussowitsch for (i = 0; i < nRemoteLeafRanks; i++) PetscCallMPI(MPI_Send(&i, 1, MPIU_INT, bas->iranks[i + bas->ndiranks], tag, comm)); /* Roots send. Note i changes, so we use MPI_Send. */ 1069566063dSJacob Faibussowitsch PetscCallMPI(MPI_Waitall(nRemoteRootRanks, leafreqs, MPI_STATUSES_IGNORE)); 10771438e86SJunchao Zhang 1089566063dSJacob Faibussowitsch for (i = 0; i < nRemoteRootRanks; i++) PetscCallMPI(MPI_Irecv(&sf->rootbufdisp[i], 1, MPIU_INT, sf->ranks[i + sf->ndranks], tag, comm, &leafreqs[i])); /* Leaves recv */ 10971438e86SJunchao Zhang for (i = 0; i < nRemoteLeafRanks; i++) { 11071438e86SJunchao Zhang tmp = bas->ioffset[i + bas->ndiranks] - bas->ioffset[bas->ndiranks]; 1119566063dSJacob Faibussowitsch PetscCallMPI(MPI_Send(&tmp, 1, MPIU_INT, bas->iranks[i + bas->ndiranks], tag, comm)); /* Roots send. Note tmp changes, so we use MPI_Send. */ 11271438e86SJunchao Zhang } 1139566063dSJacob Faibussowitsch PetscCallMPI(MPI_Waitall(nRemoteRootRanks, leafreqs, MPI_STATUSES_IGNORE)); 11471438e86SJunchao Zhang 1159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&sf->rootbufdisp_d, nRemoteRootRanks * sizeof(PetscInt))); 1169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&sf->rootsigdisp_d, nRemoteRootRanks * sizeof(PetscInt))); 1179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&sf->ranks_d, nRemoteRootRanks * sizeof(PetscMPIInt))); 1189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&sf->roffset_d, (nRemoteRootRanks + 1) * sizeof(PetscInt))); 11971438e86SJunchao Zhang 1209566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpyAsync(sf->rootbufdisp_d, sf->rootbufdisp, nRemoteRootRanks * sizeof(PetscInt), cudaMemcpyHostToDevice, PetscDefaultCudaStream)); 1219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpyAsync(sf->rootsigdisp_d, sf->rootsigdisp, nRemoteRootRanks * sizeof(PetscInt), cudaMemcpyHostToDevice, PetscDefaultCudaStream)); 1229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpyAsync(sf->ranks_d, sf->ranks + sf->ndranks, nRemoteRootRanks * sizeof(PetscMPIInt), cudaMemcpyHostToDevice, PetscDefaultCudaStream)); 1239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpyAsync(sf->roffset_d, sf->roffset + sf->ndranks, (nRemoteRootRanks + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice, PetscDefaultCudaStream)); 12471438e86SJunchao Zhang 12571438e86SJunchao Zhang /* Leaf ranks to root ranks: send info about leafsigdisp[] and leafbufdisp[] */ 1269566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nRemoteLeafRanks, &bas->leafsigdisp, nRemoteLeafRanks, &bas->leafbufdisp)); 1279566063dSJacob Faibussowitsch for (i = 0; i < nRemoteLeafRanks; i++) PetscCallMPI(MPI_Irecv(&bas->leafsigdisp[i], 1, MPIU_INT, bas->iranks[i + bas->ndiranks], tag, comm, &rootreqs[i])); 1289566063dSJacob Faibussowitsch for (i = 0; i < nRemoteRootRanks; i++) PetscCallMPI(MPI_Send(&i, 1, MPIU_INT, sf->ranks[i + sf->ndranks], tag, comm)); 1299566063dSJacob Faibussowitsch PetscCallMPI(MPI_Waitall(nRemoteLeafRanks, rootreqs, MPI_STATUSES_IGNORE)); 13071438e86SJunchao Zhang 1319566063dSJacob Faibussowitsch for (i = 0; i < nRemoteLeafRanks; i++) PetscCallMPI(MPI_Irecv(&bas->leafbufdisp[i], 1, MPIU_INT, bas->iranks[i + bas->ndiranks], tag, comm, &rootreqs[i])); 13271438e86SJunchao Zhang for (i = 0; i < nRemoteRootRanks; i++) { 13371438e86SJunchao Zhang tmp = sf->roffset[i + sf->ndranks] - sf->roffset[sf->ndranks]; 1349566063dSJacob Faibussowitsch PetscCallMPI(MPI_Send(&tmp, 1, MPIU_INT, sf->ranks[i + sf->ndranks], tag, comm)); 13571438e86SJunchao Zhang } 1369566063dSJacob Faibussowitsch PetscCallMPI(MPI_Waitall(nRemoteLeafRanks, rootreqs, MPI_STATUSES_IGNORE)); 13771438e86SJunchao Zhang 1389566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&bas->leafbufdisp_d, nRemoteLeafRanks * sizeof(PetscInt))); 1399566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&bas->leafsigdisp_d, nRemoteLeafRanks * sizeof(PetscInt))); 1409566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&bas->iranks_d, nRemoteLeafRanks * sizeof(PetscMPIInt))); 1419566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&bas->ioffset_d, (nRemoteLeafRanks + 1) * sizeof(PetscInt))); 14271438e86SJunchao Zhang 1439566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpyAsync(bas->leafbufdisp_d, bas->leafbufdisp, nRemoteLeafRanks * sizeof(PetscInt), cudaMemcpyHostToDevice, PetscDefaultCudaStream)); 1449566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpyAsync(bas->leafsigdisp_d, bas->leafsigdisp, nRemoteLeafRanks * sizeof(PetscInt), cudaMemcpyHostToDevice, PetscDefaultCudaStream)); 1459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpyAsync(bas->iranks_d, bas->iranks + bas->ndiranks, nRemoteLeafRanks * sizeof(PetscMPIInt), cudaMemcpyHostToDevice, PetscDefaultCudaStream)); 1469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpyAsync(bas->ioffset_d, bas->ioffset + bas->ndiranks, (nRemoteLeafRanks + 1) * sizeof(PetscInt), cudaMemcpyHostToDevice, PetscDefaultCudaStream)); 14771438e86SJunchao Zhang 1489566063dSJacob Faibussowitsch PetscCall(PetscFree2(rootreqs, leafreqs)); 14971438e86SJunchao Zhang PetscFunctionReturn(0); 15071438e86SJunchao Zhang } 15171438e86SJunchao Zhang 1529371c9d4SSatish Balay PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF sf, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, PetscBool *use_nvshmem) { 15371438e86SJunchao Zhang MPI_Comm comm; 15471438e86SJunchao Zhang PetscBool isBasic; 15571438e86SJunchao Zhang PetscMPIInt result = MPI_UNEQUAL; 15671438e86SJunchao Zhang 15771438e86SJunchao Zhang PetscFunctionBegin; 1589566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)sf, &comm)); 15971438e86SJunchao Zhang /* Check if the sf is eligible for NVSHMEM, if we have not checked yet. 16071438e86SJunchao Zhang Note the check result <use_nvshmem> must be the same over comm, since an SFLink must be collectively either NVSHMEM or MPI. 16171438e86SJunchao Zhang */ 16271438e86SJunchao Zhang sf->checked_nvshmem_eligibility = PETSC_TRUE; 16371438e86SJunchao Zhang if (sf->use_nvshmem && !sf->checked_nvshmem_eligibility) { 16471438e86SJunchao Zhang /* Only use NVSHMEM for SFBASIC on PETSC_COMM_WORLD */ 1659566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)sf, PETSCSFBASIC, &isBasic)); 1669566063dSJacob Faibussowitsch if (isBasic) PetscCallMPI(MPI_Comm_compare(PETSC_COMM_WORLD, comm, &result)); 16771438e86SJunchao Zhang if (!isBasic || (result != MPI_IDENT && result != MPI_CONGRUENT)) sf->use_nvshmem = PETSC_FALSE; /* If not eligible, clear the flag so that we don't try again */ 16871438e86SJunchao Zhang 16971438e86SJunchao Zhang /* Do further check: If on a rank, both rootdata and leafdata are NULL, we might think they are PETSC_MEMTYPE_CUDA (or HOST) 17071438e86SJunchao Zhang and then use NVSHMEM. But if root/leafmtypes on other ranks are PETSC_MEMTYPE_HOST (or DEVICE), this would lead to 17171438e86SJunchao Zhang inconsistency on the return value <use_nvshmem>. To be safe, we simply disable nvshmem on these rare SFs. 17271438e86SJunchao Zhang */ 17371438e86SJunchao Zhang if (sf->use_nvshmem) { 17471438e86SJunchao Zhang PetscInt hasNullRank = (!rootdata && !leafdata) ? 1 : 0; 1759566063dSJacob Faibussowitsch PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &hasNullRank, 1, MPIU_INT, MPI_LOR, comm)); 17671438e86SJunchao Zhang if (hasNullRank) sf->use_nvshmem = PETSC_FALSE; 17771438e86SJunchao Zhang } 17871438e86SJunchao Zhang sf->checked_nvshmem_eligibility = PETSC_TRUE; /* If eligible, don't do above check again */ 17971438e86SJunchao Zhang } 18071438e86SJunchao Zhang 18171438e86SJunchao Zhang /* Check if rootmtype and leafmtype collectively are PETSC_MEMTYPE_CUDA */ 18271438e86SJunchao Zhang if (sf->use_nvshmem) { 18371438e86SJunchao Zhang PetscInt oneCuda = (!rootdata || PetscMemTypeCUDA(rootmtype)) && (!leafdata || PetscMemTypeCUDA(leafmtype)) ? 1 : 0; /* Do I use cuda for both root&leafmtype? */ 18471438e86SJunchao Zhang PetscInt allCuda = oneCuda; /* Assume the same for all ranks. But if not, in opt mode, return value <use_nvshmem> won't be collective! */ 18571438e86SJunchao Zhang #if defined(PETSC_USE_DEBUG) /* Check in debug mode. Note MPI_Allreduce is expensive, so only in debug mode */ 1869566063dSJacob Faibussowitsch PetscCallMPI(MPI_Allreduce(&oneCuda, &allCuda, 1, MPIU_INT, MPI_LAND, comm)); 18708401ef6SPierre Jolivet PetscCheck(allCuda == oneCuda, comm, PETSC_ERR_SUP, "root/leaf mtypes are inconsistent among ranks, which may lead to SF nvshmem failure in opt mode. Add -use_nvshmem 0 to disable it."); 18871438e86SJunchao Zhang #endif 18971438e86SJunchao Zhang if (allCuda) { 1909566063dSJacob Faibussowitsch PetscCall(PetscNvshmemInitializeCheck()); 19171438e86SJunchao Zhang if (!sf->setup_nvshmem) { /* Set up nvshmem related fields on this SF on-demand */ 1929566063dSJacob Faibussowitsch PetscCall(PetscSFSetUp_Basic_NVSHMEM(sf)); 19371438e86SJunchao Zhang sf->setup_nvshmem = PETSC_TRUE; 19471438e86SJunchao Zhang } 19571438e86SJunchao Zhang *use_nvshmem = PETSC_TRUE; 19671438e86SJunchao Zhang } else { 19771438e86SJunchao Zhang *use_nvshmem = PETSC_FALSE; 19871438e86SJunchao Zhang } 19971438e86SJunchao Zhang } else { 20071438e86SJunchao Zhang *use_nvshmem = PETSC_FALSE; 20171438e86SJunchao Zhang } 20271438e86SJunchao Zhang PetscFunctionReturn(0); 20371438e86SJunchao Zhang } 20471438e86SJunchao Zhang 20571438e86SJunchao Zhang /* Build dependence between <stream> and <remoteCommStream> at the entry of NVSHMEM communication */ 2069371c9d4SSatish Balay static PetscErrorCode PetscSFLinkBuildDependenceBegin(PetscSF sf, PetscSFLink link, PetscSFDirection direction) { 20771438e86SJunchao Zhang cudaError_t cerr; 20871438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 20971438e86SJunchao Zhang PetscInt buflen = (direction == PETSCSF_ROOT2LEAF) ? bas->rootbuflen[PETSCSF_REMOTE] : sf->leafbuflen[PETSCSF_REMOTE]; 21071438e86SJunchao Zhang 21171438e86SJunchao Zhang PetscFunctionBegin; 21271438e86SJunchao Zhang if (buflen) { 2139566063dSJacob Faibussowitsch PetscCallCUDA(cudaEventRecord(link->dataReady, link->stream)); 2149566063dSJacob Faibussowitsch PetscCallCUDA(cudaStreamWaitEvent(link->remoteCommStream, link->dataReady, 0)); 21571438e86SJunchao Zhang } 21671438e86SJunchao Zhang PetscFunctionReturn(0); 21771438e86SJunchao Zhang } 21871438e86SJunchao Zhang 21971438e86SJunchao Zhang /* Build dependence between <stream> and <remoteCommStream> at the exit of NVSHMEM communication */ 2209371c9d4SSatish Balay static PetscErrorCode PetscSFLinkBuildDependenceEnd(PetscSF sf, PetscSFLink link, PetscSFDirection direction) { 22171438e86SJunchao Zhang cudaError_t cerr; 22271438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 22371438e86SJunchao Zhang PetscInt buflen = (direction == PETSCSF_ROOT2LEAF) ? sf->leafbuflen[PETSCSF_REMOTE] : bas->rootbuflen[PETSCSF_REMOTE]; 22471438e86SJunchao Zhang 22571438e86SJunchao Zhang PetscFunctionBegin; 22671438e86SJunchao Zhang /* If unpack to non-null device buffer, build the endRemoteComm dependance */ 22771438e86SJunchao Zhang if (buflen) { 2289566063dSJacob Faibussowitsch PetscCallCUDA(cudaEventRecord(link->endRemoteComm, link->remoteCommStream)); 2299566063dSJacob Faibussowitsch PetscCallCUDA(cudaStreamWaitEvent(link->stream, link->endRemoteComm, 0)); 23071438e86SJunchao Zhang } 23171438e86SJunchao Zhang PetscFunctionReturn(0); 23271438e86SJunchao Zhang } 23371438e86SJunchao Zhang 23471438e86SJunchao Zhang /* Send/Put signals to remote ranks 23571438e86SJunchao Zhang 23671438e86SJunchao Zhang Input parameters: 23771438e86SJunchao Zhang + n - Number of remote ranks 23871438e86SJunchao Zhang . sig - Signal address in symmetric heap 23971438e86SJunchao Zhang . sigdisp - To i-th rank, use its signal at offset sigdisp[i] 24071438e86SJunchao Zhang . ranks - remote ranks 24171438e86SJunchao Zhang - newval - Set signals to this value 24271438e86SJunchao Zhang */ 2439371c9d4SSatish Balay __global__ static void NvshmemSendSignals(PetscInt n, uint64_t *sig, PetscInt *sigdisp, PetscMPIInt *ranks, uint64_t newval) { 24471438e86SJunchao Zhang int i = blockIdx.x * blockDim.x + threadIdx.x; 24571438e86SJunchao Zhang 24671438e86SJunchao Zhang /* Each thread puts one remote signal */ 24771438e86SJunchao Zhang if (i < n) nvshmemx_uint64_signal(sig + sigdisp[i], newval, ranks[i]); 24871438e86SJunchao Zhang } 24971438e86SJunchao Zhang 25071438e86SJunchao Zhang /* Wait until local signals equal to the expected value and then set them to a new value 25171438e86SJunchao Zhang 25271438e86SJunchao Zhang Input parameters: 25371438e86SJunchao Zhang + n - Number of signals 25471438e86SJunchao Zhang . sig - Local signal address 25571438e86SJunchao Zhang . expval - expected value 25671438e86SJunchao Zhang - newval - Set signals to this new value 25771438e86SJunchao Zhang */ 2589371c9d4SSatish Balay __global__ static void NvshmemWaitSignals(PetscInt n, uint64_t *sig, uint64_t expval, uint64_t newval) { 25971438e86SJunchao Zhang #if 0 26071438e86SJunchao Zhang /* Akhil Langer@NVIDIA said using 1 thread and nvshmem_uint64_wait_until_all is better */ 26171438e86SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 26271438e86SJunchao Zhang if (i < n) { 26371438e86SJunchao Zhang nvshmem_signal_wait_until(sig+i,NVSHMEM_CMP_EQ,expval); 26471438e86SJunchao Zhang sig[i] = newval; 26571438e86SJunchao Zhang } 26671438e86SJunchao Zhang #else 26771438e86SJunchao Zhang nvshmem_uint64_wait_until_all(sig, n, NULL /*no mask*/, NVSHMEM_CMP_EQ, expval); 26871438e86SJunchao Zhang for (int i = 0; i < n; i++) sig[i] = newval; 26971438e86SJunchao Zhang #endif 27071438e86SJunchao Zhang } 27171438e86SJunchao Zhang 27271438e86SJunchao Zhang /* =========================================================================================================== 27371438e86SJunchao Zhang 27471438e86SJunchao Zhang A set of routines to support receiver initiated communication using the get method 27571438e86SJunchao Zhang 27671438e86SJunchao Zhang The getting protocol is: 27771438e86SJunchao Zhang 27871438e86SJunchao Zhang Sender has a send buf (sbuf) and a signal variable (ssig); Receiver has a recv buf (rbuf) and a signal variable (rsig); 27971438e86SJunchao Zhang All signal variables have an initial value 0. 28071438e86SJunchao Zhang 28171438e86SJunchao Zhang Sender: | Receiver: 28271438e86SJunchao Zhang 1. Wait ssig be 0, then set it to 1 28371438e86SJunchao Zhang 2. Pack data into stand alone sbuf | 28471438e86SJunchao Zhang 3. Put 1 to receiver's rsig | 1. Wait rsig to be 1, then set it 0 28571438e86SJunchao Zhang | 2. Get data from remote sbuf to local rbuf 28671438e86SJunchao Zhang | 3. Put 1 to sender's ssig 28771438e86SJunchao Zhang | 4. Unpack data from local rbuf 28871438e86SJunchao Zhang ===========================================================================================================*/ 28971438e86SJunchao Zhang /* PrePack operation -- since sender will overwrite the send buffer which the receiver might be getting data from. 29071438e86SJunchao Zhang Sender waits for signals (from receivers) indicating receivers have finished getting data 29171438e86SJunchao Zhang */ 2929371c9d4SSatish Balay PetscErrorCode PetscSFLinkWaitSignalsOfCompletionOfGettingData_NVSHMEM(PetscSF sf, PetscSFLink link, PetscSFDirection direction) { 29371438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 29471438e86SJunchao Zhang uint64_t *sig; 29571438e86SJunchao Zhang PetscInt n; 29671438e86SJunchao Zhang 29771438e86SJunchao Zhang PetscFunctionBegin; 29871438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { /* leaf ranks are getting data */ 29971438e86SJunchao Zhang sig = link->rootSendSig; /* leaf ranks set my rootSendsig */ 30071438e86SJunchao Zhang n = bas->nRemoteLeafRanks; 30171438e86SJunchao Zhang } else { /* LEAF2ROOT */ 30271438e86SJunchao Zhang sig = link->leafSendSig; 30371438e86SJunchao Zhang n = sf->nRemoteRootRanks; 30471438e86SJunchao Zhang } 30571438e86SJunchao Zhang 30671438e86SJunchao Zhang if (n) { 30771438e86SJunchao Zhang NvshmemWaitSignals<<<1, 1, 0, link->remoteCommStream>>>(n, sig, 0, 1); /* wait the signals to be 0, then set them to 1 */ 3089566063dSJacob Faibussowitsch PetscCallCUDA(cudaGetLastError()); 30971438e86SJunchao Zhang } 31071438e86SJunchao Zhang PetscFunctionReturn(0); 31171438e86SJunchao Zhang } 31271438e86SJunchao Zhang 31371438e86SJunchao Zhang /* n thread blocks. Each takes in charge one remote rank */ 3149371c9d4SSatish Balay __global__ static void GetDataFromRemotelyAccessible(PetscInt nsrcranks, PetscMPIInt *srcranks, const char *src, PetscInt *srcdisp, char *dst, PetscInt *dstdisp, PetscInt unitbytes) { 31571438e86SJunchao Zhang int bid = blockIdx.x; 31671438e86SJunchao Zhang PetscMPIInt pe = srcranks[bid]; 31771438e86SJunchao Zhang 31871438e86SJunchao Zhang if (!nvshmem_ptr(src, pe)) { 31971438e86SJunchao Zhang PetscInt nelems = (dstdisp[bid + 1] - dstdisp[bid]) * unitbytes; 32071438e86SJunchao Zhang nvshmem_getmem_nbi(dst + (dstdisp[bid] - dstdisp[0]) * unitbytes, src + srcdisp[bid] * unitbytes, nelems, pe); 32171438e86SJunchao Zhang } 32271438e86SJunchao Zhang } 32371438e86SJunchao Zhang 32471438e86SJunchao Zhang /* Start communication -- Get data in the given direction */ 3259371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetDataBegin_NVSHMEM(PetscSF sf, PetscSFLink link, PetscSFDirection direction) { 32671438e86SJunchao Zhang cudaError_t cerr; 32771438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 32871438e86SJunchao Zhang 32971438e86SJunchao Zhang PetscInt nsrcranks, ndstranks, nLocallyAccessible = 0; 33071438e86SJunchao Zhang 33171438e86SJunchao Zhang char *src, *dst; 33271438e86SJunchao Zhang PetscInt *srcdisp_h, *dstdisp_h; 33371438e86SJunchao Zhang PetscInt *srcdisp_d, *dstdisp_d; 33471438e86SJunchao Zhang PetscMPIInt *srcranks_h; 33571438e86SJunchao Zhang PetscMPIInt *srcranks_d, *dstranks_d; 33671438e86SJunchao Zhang uint64_t *dstsig; 33771438e86SJunchao Zhang PetscInt *dstsigdisp_d; 33871438e86SJunchao Zhang 33971438e86SJunchao Zhang PetscFunctionBegin; 3409566063dSJacob Faibussowitsch PetscCall(PetscSFLinkBuildDependenceBegin(sf, link, direction)); 34171438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { /* src is root, dst is leaf; we will move data from src to dst */ 34271438e86SJunchao Zhang nsrcranks = sf->nRemoteRootRanks; 34371438e86SJunchao Zhang src = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* root buf is the send buf; it is in symmetric heap */ 34471438e86SJunchao Zhang 34571438e86SJunchao Zhang srcdisp_h = sf->rootbufdisp; /* for my i-th remote root rank, I will access its buf at offset rootbufdisp[i] */ 34671438e86SJunchao Zhang srcdisp_d = sf->rootbufdisp_d; 34771438e86SJunchao Zhang srcranks_h = sf->ranks + sf->ndranks; /* my (remote) root ranks */ 34871438e86SJunchao Zhang srcranks_d = sf->ranks_d; 34971438e86SJunchao Zhang 35071438e86SJunchao Zhang ndstranks = bas->nRemoteLeafRanks; 35171438e86SJunchao Zhang dst = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* recv buf is the local leaf buf, also in symmetric heap */ 35271438e86SJunchao Zhang 35371438e86SJunchao Zhang dstdisp_h = sf->roffset + sf->ndranks; /* offsets of the local leaf buf. Note dstdisp[0] is not necessarily 0 */ 35471438e86SJunchao Zhang dstdisp_d = sf->roffset_d; 35571438e86SJunchao Zhang dstranks_d = bas->iranks_d; /* my (remote) leaf ranks */ 35671438e86SJunchao Zhang 35771438e86SJunchao Zhang dstsig = link->leafRecvSig; 35871438e86SJunchao Zhang dstsigdisp_d = bas->leafsigdisp_d; 35971438e86SJunchao Zhang } else { /* src is leaf, dst is root; we will move data from src to dst */ 36071438e86SJunchao Zhang nsrcranks = bas->nRemoteLeafRanks; 36171438e86SJunchao Zhang src = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* leaf buf is the send buf */ 36271438e86SJunchao Zhang 36371438e86SJunchao Zhang srcdisp_h = bas->leafbufdisp; /* for my i-th remote root rank, I will access its buf at offset rootbufdisp[i] */ 36471438e86SJunchao Zhang srcdisp_d = bas->leafbufdisp_d; 36571438e86SJunchao Zhang srcranks_h = bas->iranks + bas->ndiranks; /* my (remote) root ranks */ 36671438e86SJunchao Zhang srcranks_d = bas->iranks_d; 36771438e86SJunchao Zhang 36871438e86SJunchao Zhang ndstranks = sf->nRemoteRootRanks; 36971438e86SJunchao Zhang dst = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* the local root buf is the recv buf */ 37071438e86SJunchao Zhang 37171438e86SJunchao Zhang dstdisp_h = bas->ioffset + bas->ndiranks; /* offsets of the local root buf. Note dstdisp[0] is not necessarily 0 */ 37271438e86SJunchao Zhang dstdisp_d = bas->ioffset_d; 37371438e86SJunchao Zhang dstranks_d = sf->ranks_d; /* my (remote) root ranks */ 37471438e86SJunchao Zhang 37571438e86SJunchao Zhang dstsig = link->rootRecvSig; 37671438e86SJunchao Zhang dstsigdisp_d = sf->rootsigdisp_d; 37771438e86SJunchao Zhang } 37871438e86SJunchao Zhang 37971438e86SJunchao Zhang /* After Pack operation -- src tells dst ranks that they are allowed to get data */ 38071438e86SJunchao Zhang if (ndstranks) { 38171438e86SJunchao Zhang NvshmemSendSignals<<<(ndstranks + 255) / 256, 256, 0, link->remoteCommStream>>>(ndstranks, dstsig, dstsigdisp_d, dstranks_d, 1); /* set signals to 1 */ 3829566063dSJacob Faibussowitsch PetscCallCUDA(cudaGetLastError()); 38371438e86SJunchao Zhang } 38471438e86SJunchao Zhang 38571438e86SJunchao Zhang /* dst waits for signals (permissions) from src ranks to start getting data */ 38671438e86SJunchao Zhang if (nsrcranks) { 38771438e86SJunchao Zhang NvshmemWaitSignals<<<1, 1, 0, link->remoteCommStream>>>(nsrcranks, dstsig, 1, 0); /* wait the signals to be 1, then set them to 0 */ 3889566063dSJacob Faibussowitsch PetscCallCUDA(cudaGetLastError()); 38971438e86SJunchao Zhang } 39071438e86SJunchao Zhang 39171438e86SJunchao Zhang /* dst gets data from src ranks using non-blocking nvshmem_gets, which are finished in PetscSFLinkGetDataEnd_NVSHMEM() */ 39271438e86SJunchao Zhang 39371438e86SJunchao Zhang /* Count number of locally accessible src ranks, which should be a small number */ 3949371c9d4SSatish Balay for (int i = 0; i < nsrcranks; i++) { 3959371c9d4SSatish Balay if (nvshmem_ptr(src, srcranks_h[i])) nLocallyAccessible++; 3969371c9d4SSatish Balay } 39771438e86SJunchao Zhang 39871438e86SJunchao Zhang /* Get data from remotely accessible PEs */ 39971438e86SJunchao Zhang if (nLocallyAccessible < nsrcranks) { 40071438e86SJunchao Zhang GetDataFromRemotelyAccessible<<<nsrcranks, 1, 0, link->remoteCommStream>>>(nsrcranks, srcranks_d, src, srcdisp_d, dst, dstdisp_d, link->unitbytes); 4019566063dSJacob Faibussowitsch PetscCallCUDA(cudaGetLastError()); 40271438e86SJunchao Zhang } 40371438e86SJunchao Zhang 40471438e86SJunchao Zhang /* Get data from locally accessible PEs */ 40571438e86SJunchao Zhang if (nLocallyAccessible) { 40671438e86SJunchao Zhang for (int i = 0; i < nsrcranks; i++) { 40771438e86SJunchao Zhang int pe = srcranks_h[i]; 40871438e86SJunchao Zhang if (nvshmem_ptr(src, pe)) { 40971438e86SJunchao Zhang size_t nelems = (dstdisp_h[i + 1] - dstdisp_h[i]) * link->unitbytes; 41071438e86SJunchao Zhang nvshmemx_getmem_nbi_on_stream(dst + (dstdisp_h[i] - dstdisp_h[0]) * link->unitbytes, src + srcdisp_h[i] * link->unitbytes, nelems, pe, link->remoteCommStream); 41171438e86SJunchao Zhang } 41271438e86SJunchao Zhang } 41371438e86SJunchao Zhang } 41471438e86SJunchao Zhang PetscFunctionReturn(0); 41571438e86SJunchao Zhang } 41671438e86SJunchao Zhang 41771438e86SJunchao Zhang /* Finish the communication (can be done before Unpack) 41871438e86SJunchao Zhang Receiver tells its senders that they are allowed to reuse their send buffer (since receiver has got data from their send buffer) 41971438e86SJunchao Zhang */ 4209371c9d4SSatish Balay PetscErrorCode PetscSFLinkGetDataEnd_NVSHMEM(PetscSF sf, PetscSFLink link, PetscSFDirection direction) { 42171438e86SJunchao Zhang cudaError_t cerr; 42271438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 42371438e86SJunchao Zhang uint64_t *srcsig; 42471438e86SJunchao Zhang PetscInt nsrcranks, *srcsigdisp; 42571438e86SJunchao Zhang PetscMPIInt *srcranks; 42671438e86SJunchao Zhang 42771438e86SJunchao Zhang PetscFunctionBegin; 42871438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { /* leaf ranks are getting data */ 42971438e86SJunchao Zhang nsrcranks = sf->nRemoteRootRanks; 43071438e86SJunchao Zhang srcsig = link->rootSendSig; /* I want to set their root signal */ 43171438e86SJunchao Zhang srcsigdisp = sf->rootsigdisp_d; /* offset of each root signal */ 43271438e86SJunchao Zhang srcranks = sf->ranks_d; /* ranks of the n root ranks */ 43371438e86SJunchao Zhang } else { /* LEAF2ROOT, root ranks are getting data */ 43471438e86SJunchao Zhang nsrcranks = bas->nRemoteLeafRanks; 43571438e86SJunchao Zhang srcsig = link->leafSendSig; 43671438e86SJunchao Zhang srcsigdisp = bas->leafsigdisp_d; 43771438e86SJunchao Zhang srcranks = bas->iranks_d; 43871438e86SJunchao Zhang } 43971438e86SJunchao Zhang 44071438e86SJunchao Zhang if (nsrcranks) { 44171438e86SJunchao Zhang nvshmemx_quiet_on_stream(link->remoteCommStream); /* Finish the nonblocking get, so that we can unpack afterwards */ 4429566063dSJacob Faibussowitsch PetscCallCUDA(cudaGetLastError()); 44371438e86SJunchao Zhang NvshmemSendSignals<<<(nsrcranks + 511) / 512, 512, 0, link->remoteCommStream>>>(nsrcranks, srcsig, srcsigdisp, srcranks, 0); /* set signals to 0 */ 4449566063dSJacob Faibussowitsch PetscCallCUDA(cudaGetLastError()); 44571438e86SJunchao Zhang } 4469566063dSJacob Faibussowitsch PetscCall(PetscSFLinkBuildDependenceEnd(sf, link, direction)); 44771438e86SJunchao Zhang PetscFunctionReturn(0); 44871438e86SJunchao Zhang } 44971438e86SJunchao Zhang 45071438e86SJunchao Zhang /* =========================================================================================================== 45171438e86SJunchao Zhang 45271438e86SJunchao Zhang A set of routines to support sender initiated communication using the put-based method (the default) 45371438e86SJunchao Zhang 45471438e86SJunchao Zhang The putting protocol is: 45571438e86SJunchao Zhang 45671438e86SJunchao Zhang Sender has a send buf (sbuf) and a send signal var (ssig); Receiver has a stand-alone recv buf (rbuf) 45771438e86SJunchao Zhang and a recv signal var (rsig); All signal variables have an initial value 0. rbuf is allocated by SF and 45871438e86SJunchao Zhang is in nvshmem space. 45971438e86SJunchao Zhang 46071438e86SJunchao Zhang Sender: | Receiver: 46171438e86SJunchao Zhang | 46271438e86SJunchao Zhang 1. Pack data into sbuf | 46371438e86SJunchao Zhang 2. Wait ssig be 0, then set it to 1 | 46471438e86SJunchao Zhang 3. Put data to remote stand-alone rbuf | 46571438e86SJunchao Zhang 4. Fence // make sure 5 happens after 3 | 46671438e86SJunchao Zhang 5. Put 1 to receiver's rsig | 1. Wait rsig to be 1, then set it 0 46771438e86SJunchao Zhang | 2. Unpack data from local rbuf 46871438e86SJunchao Zhang | 3. Put 0 to sender's ssig 46971438e86SJunchao Zhang ===========================================================================================================*/ 47071438e86SJunchao Zhang 47171438e86SJunchao Zhang /* n thread blocks. Each takes in charge one remote rank */ 4729371c9d4SSatish Balay __global__ static void WaitAndPutDataToRemotelyAccessible(PetscInt ndstranks, PetscMPIInt *dstranks, char *dst, PetscInt *dstdisp, const char *src, PetscInt *srcdisp, uint64_t *srcsig, PetscInt unitbytes) { 47371438e86SJunchao Zhang int bid = blockIdx.x; 47471438e86SJunchao Zhang PetscMPIInt pe = dstranks[bid]; 47571438e86SJunchao Zhang 47671438e86SJunchao Zhang if (!nvshmem_ptr(dst, pe)) { 47771438e86SJunchao Zhang PetscInt nelems = (srcdisp[bid + 1] - srcdisp[bid]) * unitbytes; 47871438e86SJunchao Zhang nvshmem_uint64_wait_until(srcsig + bid, NVSHMEM_CMP_EQ, 0); /* Wait until the sig = 0 */ 47971438e86SJunchao Zhang srcsig[bid] = 1; 48071438e86SJunchao Zhang nvshmem_putmem_nbi(dst + dstdisp[bid] * unitbytes, src + (srcdisp[bid] - srcdisp[0]) * unitbytes, nelems, pe); 48171438e86SJunchao Zhang } 48271438e86SJunchao Zhang } 48371438e86SJunchao Zhang 48471438e86SJunchao Zhang /* one-thread kernel, which takes in charge all locally accesible */ 4859371c9d4SSatish Balay __global__ static void WaitSignalsFromLocallyAccessible(PetscInt ndstranks, PetscMPIInt *dstranks, uint64_t *srcsig, const char *dst) { 48671438e86SJunchao Zhang for (int i = 0; i < ndstranks; i++) { 48771438e86SJunchao Zhang int pe = dstranks[i]; 48871438e86SJunchao Zhang if (nvshmem_ptr(dst, pe)) { 48971438e86SJunchao Zhang nvshmem_uint64_wait_until(srcsig + i, NVSHMEM_CMP_EQ, 0); /* Wait until the sig = 0 */ 49071438e86SJunchao Zhang srcsig[i] = 1; 49171438e86SJunchao Zhang } 49271438e86SJunchao Zhang } 49371438e86SJunchao Zhang } 49471438e86SJunchao Zhang 49571438e86SJunchao Zhang /* Put data in the given direction */ 4969371c9d4SSatish Balay PetscErrorCode PetscSFLinkPutDataBegin_NVSHMEM(PetscSF sf, PetscSFLink link, PetscSFDirection direction) { 49771438e86SJunchao Zhang cudaError_t cerr; 49871438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 49971438e86SJunchao Zhang PetscInt ndstranks, nLocallyAccessible = 0; 50071438e86SJunchao Zhang char *src, *dst; 50171438e86SJunchao Zhang PetscInt *srcdisp_h, *dstdisp_h; 50271438e86SJunchao Zhang PetscInt *srcdisp_d, *dstdisp_d; 50371438e86SJunchao Zhang PetscMPIInt *dstranks_h; 50471438e86SJunchao Zhang PetscMPIInt *dstranks_d; 50571438e86SJunchao Zhang uint64_t *srcsig; 50671438e86SJunchao Zhang 50771438e86SJunchao Zhang PetscFunctionBegin; 5089566063dSJacob Faibussowitsch PetscCall(PetscSFLinkBuildDependenceBegin(sf, link, direction)); 50971438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { /* put data in rootbuf to leafbuf */ 51071438e86SJunchao Zhang ndstranks = bas->nRemoteLeafRanks; /* number of (remote) leaf ranks */ 51171438e86SJunchao Zhang src = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; /* Both src & dst must be symmetric */ 51271438e86SJunchao Zhang dst = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 51371438e86SJunchao Zhang 51471438e86SJunchao Zhang srcdisp_h = bas->ioffset + bas->ndiranks; /* offsets of rootbuf. srcdisp[0] is not necessarily zero */ 51571438e86SJunchao Zhang srcdisp_d = bas->ioffset_d; 51671438e86SJunchao Zhang srcsig = link->rootSendSig; 51771438e86SJunchao Zhang 51871438e86SJunchao Zhang dstdisp_h = bas->leafbufdisp; /* for my i-th remote leaf rank, I will access its leaf buf at offset leafbufdisp[i] */ 51971438e86SJunchao Zhang dstdisp_d = bas->leafbufdisp_d; 52071438e86SJunchao Zhang dstranks_h = bas->iranks + bas->ndiranks; /* remote leaf ranks */ 52171438e86SJunchao Zhang dstranks_d = bas->iranks_d; 52271438e86SJunchao Zhang } else { /* put data in leafbuf to rootbuf */ 52371438e86SJunchao Zhang ndstranks = sf->nRemoteRootRanks; 52471438e86SJunchao Zhang src = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 52571438e86SJunchao Zhang dst = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 52671438e86SJunchao Zhang 52771438e86SJunchao Zhang srcdisp_h = sf->roffset + sf->ndranks; /* offsets of leafbuf */ 52871438e86SJunchao Zhang srcdisp_d = sf->roffset_d; 52971438e86SJunchao Zhang srcsig = link->leafSendSig; 53071438e86SJunchao Zhang 53171438e86SJunchao Zhang dstdisp_h = sf->rootbufdisp; /* for my i-th remote root rank, I will access its root buf at offset rootbufdisp[i] */ 53271438e86SJunchao Zhang dstdisp_d = sf->rootbufdisp_d; 53371438e86SJunchao Zhang dstranks_h = sf->ranks + sf->ndranks; /* remote root ranks */ 53471438e86SJunchao Zhang dstranks_d = sf->ranks_d; 53571438e86SJunchao Zhang } 53671438e86SJunchao Zhang 53771438e86SJunchao Zhang /* Wait for signals and then put data to dst ranks using non-blocking nvshmem_put, which are finished in PetscSFLinkPutDataEnd_NVSHMEM */ 53871438e86SJunchao Zhang 53971438e86SJunchao Zhang /* Count number of locally accessible neighbors, which should be a small number */ 5409371c9d4SSatish Balay for (int i = 0; i < ndstranks; i++) { 5419371c9d4SSatish Balay if (nvshmem_ptr(dst, dstranks_h[i])) nLocallyAccessible++; 5429371c9d4SSatish Balay } 54371438e86SJunchao Zhang 54471438e86SJunchao Zhang /* For remotely accessible PEs, send data to them in one kernel call */ 54571438e86SJunchao Zhang if (nLocallyAccessible < ndstranks) { 54671438e86SJunchao Zhang WaitAndPutDataToRemotelyAccessible<<<ndstranks, 1, 0, link->remoteCommStream>>>(ndstranks, dstranks_d, dst, dstdisp_d, src, srcdisp_d, srcsig, link->unitbytes); 5479566063dSJacob Faibussowitsch PetscCallCUDA(cudaGetLastError()); 54871438e86SJunchao Zhang } 54971438e86SJunchao Zhang 55071438e86SJunchao Zhang /* For locally accessible PEs, use host API, which uses CUDA copy-engines and is much faster than device API */ 55171438e86SJunchao Zhang if (nLocallyAccessible) { 55271438e86SJunchao Zhang WaitSignalsFromLocallyAccessible<<<1, 1, 0, link->remoteCommStream>>>(ndstranks, dstranks_d, srcsig, dst); 55371438e86SJunchao Zhang for (int i = 0; i < ndstranks; i++) { 55471438e86SJunchao Zhang int pe = dstranks_h[i]; 55571438e86SJunchao Zhang if (nvshmem_ptr(dst, pe)) { /* If return a non-null pointer, then <pe> is locally accessible */ 55671438e86SJunchao Zhang size_t nelems = (srcdisp_h[i + 1] - srcdisp_h[i]) * link->unitbytes; 55771438e86SJunchao Zhang /* Initiate the nonblocking communication */ 55871438e86SJunchao Zhang nvshmemx_putmem_nbi_on_stream(dst + dstdisp_h[i] * link->unitbytes, src + (srcdisp_h[i] - srcdisp_h[0]) * link->unitbytes, nelems, pe, link->remoteCommStream); 55971438e86SJunchao Zhang } 56071438e86SJunchao Zhang } 56171438e86SJunchao Zhang } 56271438e86SJunchao Zhang 5639371c9d4SSatish Balay if (nLocallyAccessible) { nvshmemx_quiet_on_stream(link->remoteCommStream); /* Calling nvshmem_fence/quiet() does not fence the above nvshmemx_putmem_nbi_on_stream! */ } 56471438e86SJunchao Zhang PetscFunctionReturn(0); 56571438e86SJunchao Zhang } 56671438e86SJunchao Zhang 56771438e86SJunchao Zhang /* A one-thread kernel. The thread takes in charge all remote PEs */ 5689371c9d4SSatish Balay __global__ static void PutDataEnd(PetscInt nsrcranks, PetscInt ndstranks, PetscMPIInt *dstranks, uint64_t *dstsig, PetscInt *dstsigdisp) { 56971438e86SJunchao Zhang /* TODO: Shall we finished the non-blocking remote puts? */ 57071438e86SJunchao Zhang 57171438e86SJunchao Zhang /* 1. Send a signal to each dst rank */ 57271438e86SJunchao Zhang 57371438e86SJunchao Zhang /* According to Akhil@NVIDIA, IB is orderred, so no fence is needed for remote PEs. 57471438e86SJunchao Zhang For local PEs, we already called nvshmemx_quiet_on_stream(). Therefore, we are good to send signals to all dst ranks now. 57571438e86SJunchao Zhang */ 57671438e86SJunchao Zhang for (int i = 0; i < ndstranks; i++) { nvshmemx_uint64_signal(dstsig + dstsigdisp[i], 1, dstranks[i]); } /* set sig to 1 */ 57771438e86SJunchao Zhang 57871438e86SJunchao Zhang /* 2. Wait for signals from src ranks (if any) */ 57971438e86SJunchao Zhang if (nsrcranks) { 58071438e86SJunchao Zhang nvshmem_uint64_wait_until_all(dstsig, nsrcranks, NULL /*no mask*/, NVSHMEM_CMP_EQ, 1); /* wait sigs to be 1, then set them to 0 */ 58171438e86SJunchao Zhang for (int i = 0; i < nsrcranks; i++) dstsig[i] = 0; 58271438e86SJunchao Zhang } 58371438e86SJunchao Zhang } 58471438e86SJunchao Zhang 58571438e86SJunchao Zhang /* Finish the communication -- A receiver waits until it can access its receive buffer */ 5869371c9d4SSatish Balay PetscErrorCode PetscSFLinkPutDataEnd_NVSHMEM(PetscSF sf, PetscSFLink link, PetscSFDirection direction) { 58771438e86SJunchao Zhang cudaError_t cerr; 58871438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 58971438e86SJunchao Zhang PetscMPIInt *dstranks; 59071438e86SJunchao Zhang uint64_t *dstsig; 59171438e86SJunchao Zhang PetscInt nsrcranks, ndstranks, *dstsigdisp; 59271438e86SJunchao Zhang 59371438e86SJunchao Zhang PetscFunctionBegin; 59471438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { /* put root data to leaf */ 59571438e86SJunchao Zhang nsrcranks = sf->nRemoteRootRanks; 59671438e86SJunchao Zhang 59771438e86SJunchao Zhang ndstranks = bas->nRemoteLeafRanks; 59871438e86SJunchao Zhang dstranks = bas->iranks_d; /* leaf ranks */ 59971438e86SJunchao Zhang dstsig = link->leafRecvSig; /* I will set my leaf ranks's RecvSig */ 60071438e86SJunchao Zhang dstsigdisp = bas->leafsigdisp_d; /* for my i-th remote leaf rank, I will access its signal at offset leafsigdisp[i] */ 60171438e86SJunchao Zhang } else { /* LEAF2ROOT */ 60271438e86SJunchao Zhang nsrcranks = bas->nRemoteLeafRanks; 60371438e86SJunchao Zhang 60471438e86SJunchao Zhang ndstranks = sf->nRemoteRootRanks; 60571438e86SJunchao Zhang dstranks = sf->ranks_d; 60671438e86SJunchao Zhang dstsig = link->rootRecvSig; 60771438e86SJunchao Zhang dstsigdisp = sf->rootsigdisp_d; 60871438e86SJunchao Zhang } 60971438e86SJunchao Zhang 61071438e86SJunchao Zhang if (nsrcranks || ndstranks) { 61171438e86SJunchao Zhang PutDataEnd<<<1, 1, 0, link->remoteCommStream>>>(nsrcranks, ndstranks, dstranks, dstsig, dstsigdisp); 6129566063dSJacob Faibussowitsch PetscCallCUDA(cudaGetLastError()); 61371438e86SJunchao Zhang } 6149566063dSJacob Faibussowitsch PetscCall(PetscSFLinkBuildDependenceEnd(sf, link, direction)); 61571438e86SJunchao Zhang PetscFunctionReturn(0); 61671438e86SJunchao Zhang } 61771438e86SJunchao Zhang 61871438e86SJunchao Zhang /* PostUnpack operation -- A receiver tells its senders that they are allowed to put data to here (it implies recv buf is free to take new data) */ 6199371c9d4SSatish Balay PetscErrorCode PetscSFLinkSendSignalsToAllowPuttingData_NVSHMEM(PetscSF sf, PetscSFLink link, PetscSFDirection direction) { 62071438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 62171438e86SJunchao Zhang uint64_t *srcsig; 62271438e86SJunchao Zhang PetscInt nsrcranks, *srcsigdisp_d; 62371438e86SJunchao Zhang PetscMPIInt *srcranks_d; 62471438e86SJunchao Zhang 62571438e86SJunchao Zhang PetscFunctionBegin; 62671438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { /* I allow my root ranks to put data to me */ 62771438e86SJunchao Zhang nsrcranks = sf->nRemoteRootRanks; 62871438e86SJunchao Zhang srcsig = link->rootSendSig; /* I want to set their send signals */ 62971438e86SJunchao Zhang srcsigdisp_d = sf->rootsigdisp_d; /* offset of each root signal */ 63071438e86SJunchao Zhang srcranks_d = sf->ranks_d; /* ranks of the n root ranks */ 63171438e86SJunchao Zhang } else { /* LEAF2ROOT */ 63271438e86SJunchao Zhang nsrcranks = bas->nRemoteLeafRanks; 63371438e86SJunchao Zhang srcsig = link->leafSendSig; 63471438e86SJunchao Zhang srcsigdisp_d = bas->leafsigdisp_d; 63571438e86SJunchao Zhang srcranks_d = bas->iranks_d; 63671438e86SJunchao Zhang } 63771438e86SJunchao Zhang 63871438e86SJunchao Zhang if (nsrcranks) { 63971438e86SJunchao Zhang NvshmemSendSignals<<<(nsrcranks + 255) / 256, 256, 0, link->remoteCommStream>>>(nsrcranks, srcsig, srcsigdisp_d, srcranks_d, 0); /* Set remote signals to 0 */ 6409566063dSJacob Faibussowitsch PetscCallCUDA(cudaGetLastError()); 64171438e86SJunchao Zhang } 64271438e86SJunchao Zhang PetscFunctionReturn(0); 64371438e86SJunchao Zhang } 64471438e86SJunchao Zhang 64571438e86SJunchao Zhang /* Destructor when the link uses nvshmem for communication */ 6469371c9d4SSatish Balay static PetscErrorCode PetscSFLinkDestroy_NVSHMEM(PetscSF sf, PetscSFLink link) { 64771438e86SJunchao Zhang cudaError_t cerr; 64871438e86SJunchao Zhang 64971438e86SJunchao Zhang PetscFunctionBegin; 6509566063dSJacob Faibussowitsch PetscCallCUDA(cudaEventDestroy(link->dataReady)); 6519566063dSJacob Faibussowitsch PetscCallCUDA(cudaEventDestroy(link->endRemoteComm)); 6529566063dSJacob Faibussowitsch PetscCallCUDA(cudaStreamDestroy(link->remoteCommStream)); 65371438e86SJunchao Zhang 65471438e86SJunchao Zhang /* nvshmem does not need buffers on host, which should be NULL */ 6559566063dSJacob Faibussowitsch PetscCall(PetscNvshmemFree(link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE])); 6569566063dSJacob Faibussowitsch PetscCall(PetscNvshmemFree(link->leafSendSig)); 6579566063dSJacob Faibussowitsch PetscCall(PetscNvshmemFree(link->leafRecvSig)); 6589566063dSJacob Faibussowitsch PetscCall(PetscNvshmemFree(link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE])); 6599566063dSJacob Faibussowitsch PetscCall(PetscNvshmemFree(link->rootSendSig)); 6609566063dSJacob Faibussowitsch PetscCall(PetscNvshmemFree(link->rootRecvSig)); 66171438e86SJunchao Zhang PetscFunctionReturn(0); 66271438e86SJunchao Zhang } 66371438e86SJunchao Zhang 6649371c9d4SSatish Balay PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) { 66571438e86SJunchao Zhang cudaError_t cerr; 66671438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 66771438e86SJunchao Zhang PetscSFLink *p, link; 66871438e86SJunchao Zhang PetscBool match, rootdirect[2], leafdirect[2]; 66971438e86SJunchao Zhang int greatestPriority; 67071438e86SJunchao Zhang 67171438e86SJunchao Zhang PetscFunctionBegin; 67271438e86SJunchao Zhang /* Check to see if we can directly send/recv root/leafdata with the given sf, sfop and op. 67371438e86SJunchao Zhang We only care root/leafdirect[PETSCSF_REMOTE], since we never need intermeidate buffers in local communication with NVSHMEM. 67471438e86SJunchao Zhang */ 67571438e86SJunchao Zhang if (sfop == PETSCSF_BCAST) { /* Move data from rootbuf to leafbuf */ 67671438e86SJunchao Zhang if (sf->use_nvshmem_get) { 67771438e86SJunchao Zhang rootdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* send buffer has to be stand-alone (can't be rootdata) */ 67871438e86SJunchao Zhang leafdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(leafmtype) && sf->leafcontig[PETSCSF_REMOTE] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; 67971438e86SJunchao Zhang } else { 68071438e86SJunchao Zhang rootdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(rootmtype) && bas->rootcontig[PETSCSF_REMOTE]) ? PETSC_TRUE : PETSC_FALSE; 68171438e86SJunchao Zhang leafdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* Our put-protocol always needs a nvshmem alloc'ed recv buffer */ 68271438e86SJunchao Zhang } 68371438e86SJunchao Zhang } else if (sfop == PETSCSF_REDUCE) { /* Move data from leafbuf to rootbuf */ 68471438e86SJunchao Zhang if (sf->use_nvshmem_get) { 68571438e86SJunchao Zhang rootdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(rootmtype) && bas->rootcontig[PETSCSF_REMOTE] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; 68671438e86SJunchao Zhang leafdirect[PETSCSF_REMOTE] = PETSC_FALSE; 68771438e86SJunchao Zhang } else { 68871438e86SJunchao Zhang rootdirect[PETSCSF_REMOTE] = PETSC_FALSE; 68971438e86SJunchao Zhang leafdirect[PETSCSF_REMOTE] = (PetscMemTypeNVSHMEM(leafmtype) && sf->leafcontig[PETSCSF_REMOTE]) ? PETSC_TRUE : PETSC_FALSE; 69071438e86SJunchao Zhang } 69171438e86SJunchao Zhang } else { /* PETSCSF_FETCH */ 69271438e86SJunchao Zhang rootdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* FETCH always need a separate rootbuf */ 69371438e86SJunchao Zhang leafdirect[PETSCSF_REMOTE] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */ 69471438e86SJunchao Zhang } 69571438e86SJunchao Zhang 69671438e86SJunchao Zhang /* Look for free nvshmem links in cache */ 69771438e86SJunchao Zhang for (p = &bas->avail; (link = *p); p = &link->next) { 69871438e86SJunchao Zhang if (link->use_nvshmem) { 6999566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 70071438e86SJunchao Zhang if (match) { 70171438e86SJunchao Zhang *p = link->next; /* Remove from available list */ 70271438e86SJunchao Zhang goto found; 70371438e86SJunchao Zhang } 70471438e86SJunchao Zhang } 70571438e86SJunchao Zhang } 7069566063dSJacob Faibussowitsch PetscCall(PetscNew(&link)); 7079566063dSJacob Faibussowitsch PetscCall(PetscSFLinkSetUp_Host(sf, link, unit)); /* Compute link->unitbytes, dup link->unit etc. */ 7089566063dSJacob Faibussowitsch if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup pack routines, streams etc */ 70971438e86SJunchao Zhang #if defined(PETSC_HAVE_KOKKOS) 7109566063dSJacob Faibussowitsch else if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit)); 71171438e86SJunchao Zhang #endif 71271438e86SJunchao Zhang 71371438e86SJunchao Zhang link->rootdirect[PETSCSF_LOCAL] = PETSC_TRUE; /* For the local part we directly use root/leafdata */ 71471438e86SJunchao Zhang link->leafdirect[PETSCSF_LOCAL] = PETSC_TRUE; 71571438e86SJunchao Zhang 71671438e86SJunchao Zhang /* Init signals to zero */ 7179566063dSJacob Faibussowitsch if (!link->rootSendSig) PetscCall(PetscNvshmemCalloc(bas->nRemoteLeafRanksMax * sizeof(uint64_t), (void **)&link->rootSendSig)); 7189566063dSJacob Faibussowitsch if (!link->rootRecvSig) PetscCall(PetscNvshmemCalloc(bas->nRemoteLeafRanksMax * sizeof(uint64_t), (void **)&link->rootRecvSig)); 7199566063dSJacob Faibussowitsch if (!link->leafSendSig) PetscCall(PetscNvshmemCalloc(sf->nRemoteRootRanksMax * sizeof(uint64_t), (void **)&link->leafSendSig)); 7209566063dSJacob Faibussowitsch if (!link->leafRecvSig) PetscCall(PetscNvshmemCalloc(sf->nRemoteRootRanksMax * sizeof(uint64_t), (void **)&link->leafRecvSig)); 72171438e86SJunchao Zhang 72271438e86SJunchao Zhang link->use_nvshmem = PETSC_TRUE; 72371438e86SJunchao Zhang link->rootmtype = PETSC_MEMTYPE_DEVICE; /* Only need 0/1-based mtype from now on */ 72471438e86SJunchao Zhang link->leafmtype = PETSC_MEMTYPE_DEVICE; 72571438e86SJunchao Zhang /* Overwrite some function pointers set by PetscSFLinkSetUp_CUDA */ 72671438e86SJunchao Zhang link->Destroy = PetscSFLinkDestroy_NVSHMEM; 72771438e86SJunchao Zhang if (sf->use_nvshmem_get) { /* get-based protocol */ 72871438e86SJunchao Zhang link->PrePack = PetscSFLinkWaitSignalsOfCompletionOfGettingData_NVSHMEM; 72971438e86SJunchao Zhang link->StartCommunication = PetscSFLinkGetDataBegin_NVSHMEM; 73071438e86SJunchao Zhang link->FinishCommunication = PetscSFLinkGetDataEnd_NVSHMEM; 73171438e86SJunchao Zhang } else { /* put-based protocol */ 73271438e86SJunchao Zhang link->StartCommunication = PetscSFLinkPutDataBegin_NVSHMEM; 73371438e86SJunchao Zhang link->FinishCommunication = PetscSFLinkPutDataEnd_NVSHMEM; 73471438e86SJunchao Zhang link->PostUnpack = PetscSFLinkSendSignalsToAllowPuttingData_NVSHMEM; 73571438e86SJunchao Zhang } 73671438e86SJunchao Zhang 7379566063dSJacob Faibussowitsch PetscCallCUDA(cudaDeviceGetStreamPriorityRange(NULL, &greatestPriority)); 7389566063dSJacob Faibussowitsch PetscCallCUDA(cudaStreamCreateWithPriority(&link->remoteCommStream, cudaStreamNonBlocking, greatestPriority)); 73971438e86SJunchao Zhang 7409566063dSJacob Faibussowitsch PetscCallCUDA(cudaEventCreateWithFlags(&link->dataReady, cudaEventDisableTiming)); 7419566063dSJacob Faibussowitsch PetscCallCUDA(cudaEventCreateWithFlags(&link->endRemoteComm, cudaEventDisableTiming)); 74271438e86SJunchao Zhang 74371438e86SJunchao Zhang found: 74471438e86SJunchao Zhang if (rootdirect[PETSCSF_REMOTE]) { 74571438e86SJunchao Zhang link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = (char *)rootdata + bas->rootstart[PETSCSF_REMOTE] * link->unitbytes; 74671438e86SJunchao Zhang } else { 747*48a46eb9SPierre Jolivet if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]) PetscCall(PetscNvshmemMalloc(bas->rootbuflen_rmax * link->unitbytes, (void **)&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE])); 74871438e86SJunchao Zhang link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 74971438e86SJunchao Zhang } 75071438e86SJunchao Zhang 75171438e86SJunchao Zhang if (leafdirect[PETSCSF_REMOTE]) { 75271438e86SJunchao Zhang link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = (char *)leafdata + sf->leafstart[PETSCSF_REMOTE] * link->unitbytes; 75371438e86SJunchao Zhang } else { 754*48a46eb9SPierre Jolivet if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]) PetscCall(PetscNvshmemMalloc(sf->leafbuflen_rmax * link->unitbytes, (void **)&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE])); 75571438e86SJunchao Zhang link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 75671438e86SJunchao Zhang } 75771438e86SJunchao Zhang 75871438e86SJunchao Zhang link->rootdirect[PETSCSF_REMOTE] = rootdirect[PETSCSF_REMOTE]; 75971438e86SJunchao Zhang link->leafdirect[PETSCSF_REMOTE] = leafdirect[PETSCSF_REMOTE]; 76071438e86SJunchao Zhang link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */ 76171438e86SJunchao Zhang link->leafdata = leafdata; 76271438e86SJunchao Zhang link->next = bas->inuse; 76371438e86SJunchao Zhang bas->inuse = link; 76471438e86SJunchao Zhang *mylink = link; 76571438e86SJunchao Zhang PetscFunctionReturn(0); 76671438e86SJunchao Zhang } 76771438e86SJunchao Zhang 76871438e86SJunchao Zhang #if defined(PETSC_USE_REAL_SINGLE) 7699371c9d4SSatish Balay PetscErrorCode PetscNvshmemSum(PetscInt count, float *dst, const float *src) { 77071438e86SJunchao Zhang PetscMPIInt num; /* Assume nvshmem's int is MPI's int */ 77171438e86SJunchao Zhang 77271438e86SJunchao Zhang PetscFunctionBegin; 7739566063dSJacob Faibussowitsch PetscCall(PetscMPIIntCast(count, &num)); 77471438e86SJunchao Zhang nvshmemx_float_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD, dst, src, num, PetscDefaultCudaStream); 77571438e86SJunchao Zhang PetscFunctionReturn(0); 77671438e86SJunchao Zhang } 77771438e86SJunchao Zhang 7789371c9d4SSatish Balay PetscErrorCode PetscNvshmemMax(PetscInt count, float *dst, const float *src) { 77971438e86SJunchao Zhang PetscMPIInt num; 78071438e86SJunchao Zhang 78171438e86SJunchao Zhang PetscFunctionBegin; 7829566063dSJacob Faibussowitsch PetscCall(PetscMPIIntCast(count, &num)); 78371438e86SJunchao Zhang nvshmemx_float_max_reduce_on_stream(NVSHMEM_TEAM_WORLD, dst, src, num, PetscDefaultCudaStream); 78471438e86SJunchao Zhang PetscFunctionReturn(0); 78571438e86SJunchao Zhang } 78671438e86SJunchao Zhang #elif defined(PETSC_USE_REAL_DOUBLE) 7879371c9d4SSatish Balay PetscErrorCode PetscNvshmemSum(PetscInt count, double *dst, const double *src) { 78871438e86SJunchao Zhang PetscMPIInt num; 78971438e86SJunchao Zhang 79071438e86SJunchao Zhang PetscFunctionBegin; 7919566063dSJacob Faibussowitsch PetscCall(PetscMPIIntCast(count, &num)); 79271438e86SJunchao Zhang nvshmemx_double_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD, dst, src, num, PetscDefaultCudaStream); 79371438e86SJunchao Zhang PetscFunctionReturn(0); 79471438e86SJunchao Zhang } 79571438e86SJunchao Zhang 7969371c9d4SSatish Balay PetscErrorCode PetscNvshmemMax(PetscInt count, double *dst, const double *src) { 79771438e86SJunchao Zhang PetscMPIInt num; 79871438e86SJunchao Zhang 79971438e86SJunchao Zhang PetscFunctionBegin; 8009566063dSJacob Faibussowitsch PetscCall(PetscMPIIntCast(count, &num)); 80171438e86SJunchao Zhang nvshmemx_double_max_reduce_on_stream(NVSHMEM_TEAM_WORLD, dst, src, num, PetscDefaultCudaStream); 80271438e86SJunchao Zhang PetscFunctionReturn(0); 80371438e86SJunchao Zhang } 80471438e86SJunchao Zhang #endif 805