171438e86SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h> 271438e86SJunchao Zhang 3*d8b4a066SPierre Jolivet // Though there is no default mechanism to start a communication, we have a 4f5d27ee7SJunchao Zhang // default to finish communication, which is just waiting on the requests. 5f5d27ee7SJunchao Zhang // It should work for both non-blocking or persistent send/recvs or collectivwes. 6f5d27ee7SJunchao Zhang static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 7d71ae5a4SJacob Faibussowitsch { 871438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 971438e86SJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; 1071438e86SJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 1171438e86SJunchao Zhang 1271438e86SJunchao Zhang PetscFunctionBegin; 13f5d27ee7SJunchao Zhang if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE)); 14f5d27ee7SJunchao Zhang if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE)); 1571438e86SJunchao Zhang if (direction == PETSCSF_ROOT2LEAF) { 169566063dSJacob Faibussowitsch PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */)); 1771438e86SJunchao Zhang } else { 189566063dSJacob Faibussowitsch PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE)); 1971438e86SJunchao Zhang } 203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2171438e86SJunchao Zhang } 2271438e86SJunchao Zhang 2371438e86SJunchao Zhang /* 2471438e86SJunchao Zhang The routine Creates a communication link for the given operation. It first looks up its link cache. If 2571438e86SJunchao Zhang there is a free & suitable one, it uses it. Otherwise it creates a new one. 2671438e86SJunchao Zhang 2771438e86SJunchao Zhang A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack 2871438e86SJunchao Zhang root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata 2971438e86SJunchao Zhang can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate 3071438e86SJunchao Zhang those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation. 3171438e86SJunchao Zhang 3271438e86SJunchao Zhang The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU. 3371438e86SJunchao Zhang 3471438e86SJunchao Zhang In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link. 3571438e86SJunchao Zhang 3671438e86SJunchao Zhang The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and 3771438e86SJunchao Zhang need pack/unpack data. 3871438e86SJunchao Zhang */ 39d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink) 40d71ae5a4SJacob Faibussowitsch { 4171438e86SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 4271438e86SJunchao Zhang PetscInt i, j, k, nrootreqs, nleafreqs, nreqs; 4371438e86SJunchao Zhang PetscSFLink *p, link; 4471438e86SJunchao Zhang PetscSFDirection direction; 4571438e86SJunchao Zhang MPI_Request *reqs = NULL; 4671438e86SJunchao Zhang PetscBool match, rootdirect[2], leafdirect[2]; 4771438e86SJunchao Zhang PetscMemType rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */ 4871438e86SJunchao Zhang PetscMemType leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; 4971438e86SJunchao Zhang PetscMemType rootmtype_mpi, leafmtype_mpi; /* mtypes seen by MPI */ 5071438e86SJunchao Zhang PetscInt rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI*/ 5171438e86SJunchao Zhang 5271438e86SJunchao Zhang PetscFunctionBegin; 5371438e86SJunchao Zhang /* Can we directly use root/leafdirect with the given sf, sfop and op? */ 5471438e86SJunchao Zhang for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 5571438e86SJunchao Zhang if (sfop == PETSCSF_BCAST) { 5671438e86SJunchao Zhang rootdirect[i] = bas->rootcontig[i]; /* Pack roots */ 5771438e86SJunchao Zhang leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */ 5871438e86SJunchao Zhang } else if (sfop == PETSCSF_REDUCE) { 5971438e86SJunchao Zhang leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */ 6071438e86SJunchao Zhang rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */ 6171438e86SJunchao Zhang } else { /* PETSCSF_FETCH */ 6271438e86SJunchao Zhang rootdirect[i] = PETSC_FALSE; /* FETCH always need a separate rootbuf */ 6371438e86SJunchao Zhang leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */ 6471438e86SJunchao Zhang } 6571438e86SJunchao Zhang } 6671438e86SJunchao Zhang 676677b1c1SJunchao Zhang // NEVER use root/leafdirect[] for persistent collectives. Otherwise, suppose for the first time, all ranks build 686677b1c1SJunchao Zhang // a persistent MPI request in a collective call. Then in a second call to PetscSFBcast, one rank uses root/leafdirect 696677b1c1SJunchao Zhang // but with new rootdata/leafdata pointers. Other ranks keep using the same rootdata/leafdata pointers as last time. 706677b1c1SJunchao Zhang // Only that rank will try to rebuild the request with a collective call, resulting in hanging. We could to call 716677b1c1SJunchao Zhang // MPI_Allreduce() every time to detect changes in root/leafdata, but that is too expensive for sparse communication. 726677b1c1SJunchao Zhang // So we always set root/leafdirect[] to false and allocate additional root/leaf buffers for persistent collectives. 736677b1c1SJunchao Zhang if (sf->persistent && sf->collective) { 746677b1c1SJunchao Zhang rootdirect[PETSCSF_REMOTE] = PETSC_FALSE; 756677b1c1SJunchao Zhang leafdirect[PETSCSF_REMOTE] = PETSC_FALSE; 766677b1c1SJunchao Zhang } 776677b1c1SJunchao Zhang 7871438e86SJunchao Zhang if (sf->use_gpu_aware_mpi) { 7971438e86SJunchao Zhang rootmtype_mpi = rootmtype; 8071438e86SJunchao Zhang leafmtype_mpi = leafmtype; 8171438e86SJunchao Zhang } else { 8271438e86SJunchao Zhang rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST; 8371438e86SJunchao Zhang } 84da81f932SPierre Jolivet /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */ 8571438e86SJunchao Zhang rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0; 8671438e86SJunchao Zhang leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0; 8771438e86SJunchao Zhang 8871438e86SJunchao Zhang direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT; 8971438e86SJunchao Zhang nrootreqs = bas->nrootreqs; 9071438e86SJunchao Zhang nleafreqs = sf->nleafreqs; 9171438e86SJunchao Zhang 9271438e86SJunchao Zhang /* Look for free links in cache */ 9371438e86SJunchao Zhang for (p = &bas->avail; (link = *p); p = &link->next) { 9471438e86SJunchao Zhang if (!link->use_nvshmem) { /* Only check with MPI links */ 959566063dSJacob Faibussowitsch PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match)); 9671438e86SJunchao Zhang if (match) { 9771438e86SJunchao Zhang /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with the current. 986677b1c1SJunchao Zhang If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests() with the same tag. 9971438e86SJunchao Zhang */ 10071438e86SJunchao Zhang if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) { 10171438e86SJunchao Zhang reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */ 1029371c9d4SSatish Balay for (i = 0; i < nrootreqs; i++) { 1039371c9d4SSatish Balay if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i])); 1049371c9d4SSatish Balay } 10571438e86SJunchao Zhang link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE; 10671438e86SJunchao Zhang } 10771438e86SJunchao Zhang if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) { 10871438e86SJunchao Zhang reqs = link->leafreqs[direction][leafmtype][1]; 1099371c9d4SSatish Balay for (i = 0; i < nleafreqs; i++) { 1109371c9d4SSatish Balay if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i])); 1119371c9d4SSatish Balay } 11271438e86SJunchao Zhang link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE; 11371438e86SJunchao Zhang } 11471438e86SJunchao Zhang *p = link->next; /* Remove from available list */ 11571438e86SJunchao Zhang goto found; 11671438e86SJunchao Zhang } 11771438e86SJunchao Zhang } 11871438e86SJunchao Zhang } 11971438e86SJunchao Zhang 1209566063dSJacob Faibussowitsch PetscCall(PetscNew(&link)); 1219566063dSJacob Faibussowitsch PetscCall(PetscSFLinkSetUp_Host(sf, link, unit)); 1229566063dSJacob Faibussowitsch PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */ 12371438e86SJunchao Zhang 12471438e86SJunchao Zhang nreqs = (nrootreqs + nleafreqs) * 8; 1259566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nreqs, &link->reqs)); 12671438e86SJunchao Zhang for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */ 12771438e86SJunchao Zhang 1285c0db29aSPierre Jolivet if (nreqs) 12971438e86SJunchao Zhang for (i = 0; i < 2; i++) { /* Two communication directions */ 13071438e86SJunchao Zhang for (j = 0; j < 2; j++) { /* Two memory types */ 13171438e86SJunchao Zhang for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */ 13271438e86SJunchao Zhang link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k); 13371438e86SJunchao Zhang link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k); 13471438e86SJunchao Zhang } 13571438e86SJunchao Zhang } 13671438e86SJunchao Zhang } 137f5d27ee7SJunchao Zhang 138f5d27ee7SJunchao Zhang link->FinishCommunication = PetscSFLinkFinishCommunication_Default; 139f5d27ee7SJunchao Zhang // each SF type could customize their communication by setting function pointers in the link. 140f5d27ee7SJunchao Zhang // Currently only BASIC and NEIGHBOR use this abstraction. 141f5d27ee7SJunchao Zhang PetscTryTypeMethod(sf, SetCommunicationOps, link); 14271438e86SJunchao Zhang 14371438e86SJunchao Zhang found: 14471438e86SJunchao Zhang 14571438e86SJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 14671438e86SJunchao Zhang if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) { 14771438e86SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 1489566063dSJacob Faibussowitsch if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */ 14971438e86SJunchao Zhang #endif 15071438e86SJunchao Zhang #if defined(PETSC_HAVE_HIP) 1519566063dSJacob Faibussowitsch if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */ 15271438e86SJunchao Zhang #endif 15371438e86SJunchao Zhang #if defined(PETSC_HAVE_KOKKOS) 1549566063dSJacob Faibussowitsch if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit)); 15571438e86SJunchao Zhang #endif 15671438e86SJunchao Zhang } 15771438e86SJunchao Zhang #endif 15871438e86SJunchao Zhang 15971438e86SJunchao Zhang /* Allocate buffers along root/leafdata */ 16071438e86SJunchao Zhang for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 16171438e86SJunchao Zhang /* For local communication, buffers are only needed when roots and leaves have different mtypes */ 16271438e86SJunchao Zhang if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue; 16371438e86SJunchao Zhang if (bas->rootbuflen[i]) { 16471438e86SJunchao Zhang if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */ 16571438e86SJunchao Zhang link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes; 16671438e86SJunchao Zhang } else { /* Have to have a separate rootbuf */ 16748a46eb9SPierre Jolivet if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype])); 16871438e86SJunchao Zhang link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype]; 16971438e86SJunchao Zhang } 17071438e86SJunchao Zhang } 17171438e86SJunchao Zhang 17271438e86SJunchao Zhang if (sf->leafbuflen[i]) { 17371438e86SJunchao Zhang if (leafdirect[i]) { 17471438e86SJunchao Zhang link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes; 17571438e86SJunchao Zhang } else { 17648a46eb9SPierre Jolivet if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype])); 17771438e86SJunchao Zhang link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype]; 17871438e86SJunchao Zhang } 17971438e86SJunchao Zhang } 18071438e86SJunchao Zhang } 18171438e86SJunchao Zhang 18271438e86SJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 18371438e86SJunchao Zhang /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */ 18471438e86SJunchao Zhang if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) { 18548a46eb9SPierre Jolivet if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST])); 18671438e86SJunchao Zhang link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 18771438e86SJunchao Zhang } 18871438e86SJunchao Zhang if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) { 18948a46eb9SPierre Jolivet if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST])); 19071438e86SJunchao Zhang link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 19171438e86SJunchao Zhang } 19271438e86SJunchao Zhang #endif 19371438e86SJunchao Zhang 19471438e86SJunchao Zhang /* Set `current` state of the link. They may change between different SF invocations with the same link */ 19571438e86SJunchao Zhang if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */ 19671438e86SJunchao Zhang if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata; 19771438e86SJunchao Zhang if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata; 19871438e86SJunchao Zhang } 19971438e86SJunchao Zhang 20071438e86SJunchao Zhang link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */ 20171438e86SJunchao Zhang link->leafdata = leafdata; 20271438e86SJunchao Zhang for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) { 20371438e86SJunchao Zhang link->rootdirect[i] = rootdirect[i]; 20471438e86SJunchao Zhang link->leafdirect[i] = leafdirect[i]; 20571438e86SJunchao Zhang } 20671438e86SJunchao Zhang link->rootdirect_mpi = rootdirect_mpi; 20771438e86SJunchao Zhang link->leafdirect_mpi = leafdirect_mpi; 20871438e86SJunchao Zhang link->rootmtype = rootmtype; 20971438e86SJunchao Zhang link->leafmtype = leafmtype; 21071438e86SJunchao Zhang link->rootmtype_mpi = rootmtype_mpi; 21171438e86SJunchao Zhang link->leafmtype_mpi = leafmtype_mpi; 21271438e86SJunchao Zhang 21371438e86SJunchao Zhang link->next = bas->inuse; 21471438e86SJunchao Zhang bas->inuse = link; 21571438e86SJunchao Zhang *mylink = link; 2163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 21771438e86SJunchao Zhang } 218