#include <../src/vec/is/sf/impls/basic/sfpack.h>

// Though there is no default mechanism to start a communication, we have a
// default mechanism to finish a communication, which just waits on the requests.
// It works for non-blocking and persistent send/recvs as well as collectives.
static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas            = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi  = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
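/*
   Illustrative sketch (not code from this file): the default finisher above assumes a type-specific
   start routine has already posted the requests it waits on. With persistent requests, such a start
   phase would essentially be (rootreqs/leafreqs being the link's request arrays for the direction)

     if (bas->nrootreqs) PetscCallMPI(MPI_Startall(bas->nrootreqs, rootreqs));
     if (sf->nleafreqs) PetscCallMPI(MPI_Startall(sf->nleafreqs, leafreqs));

   after which the MPI_Waitall() calls above complete both sides of the communication.
*/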
/*
   The routine creates a communication link for the given operation. It first looks up its link cache. If
   there is a free & suitable one, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
   those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use gpu-aware MPI but data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor since both deal with sparse graphs and need to
   pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it as a subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                       /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }
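  /*
     For example, a PetscSFBcast with op = MPI_REPLACE on an SF whose remote leaves are contiguous gets
     leafdirect[PETSCSF_REMOTE] = PETSC_TRUE: received data can land directly in leafdata with no unpack.
     With op = MPI_SUM instead, a separate leaf buffer is required, since incoming values must be combined
     with existing leafdata rather than overwrite it.
  */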
  // NEVER use root/leafdirect[] for persistent collectives. Otherwise, suppose on the first call all ranks
  // build a persistent MPI request in a collective call. Then in a second call to PetscSFBcast, one rank uses
  // root/leafdirect but with new rootdata/leafdata pointers, while the other ranks keep the same pointers as
  // last time. Only that rank will try to rebuild the request with a collective call, resulting in a hang.
  // We could call MPI_Allreduce() every time to detect changes in root/leafdata, but that is too expensive
  // for sparse communication. So we always set root/leafdirect[] to false and allocate additional root/leaf
  // buffers for persistent collectives.
  if (sf->persistent && sf->collective) {
    rootdirect[PETSCSF_REMOTE] = PETSC_FALSE;
    leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
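  /*
     For example, with rootdata on a CUDA device and no gpu-aware MPI: rootmtype = PETSC_MEMTYPE_DEVICE
     but rootmtype_mpi = PETSC_MEMTYPE_HOST, so rootdirect_mpi = 0 and the device data is staged through
     a host buffer (allocated further below) before MPI sees it.
  */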
  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests
           matches the current data. If not, free the old requests. New requests will be lazily initialized when one
           calls PetscSFLinkGetMPIBuffersAndRequests().
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }

  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  if (nreqs)
    for (i = 0; i < 2; i++) {     /* Two communication directions */
      for (j = 0; j < 2; j++) {   /* Two memory types */
        for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
          link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
          link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
        }
      }
    }
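  /*
     Layout of link->reqs established above: root requests occupy the first 8*nrootreqs slots, leaf
     requests the next 8*nleafreqs, with each (direction i, memtype j, direct k) sub-block at offset
     4*i + 2*j + k within its half. For example, the leaf requests for i = 1, j = 1, k = 0 start at
     link->reqs + 8*nrootreqs + 6*nleafreqs.
  */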
  link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
  // Each SF type can customize its communication by setting function pointers in the link.
  // Currently only BASIC and NEIGHBOR use this abstraction.
  PetscTryTypeMethod(sf, SetCommunicationOps, link);

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Set up streams etc. */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Set up streams etc. */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers alongside root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* We need a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }
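  /*
     A concrete case of the "disguise" above: with unit = MPI_DOUBLE (unitbytes = 8) and, say,
     bas->rootstart[PETSCSF_REMOTE] = 100, the remote rootbuf simply aliases rootdata at byte
     offset 800, so MPI sends from / receives into the user's memory without an extra copy.
  */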
#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on the host for staging device data when not using gpu-aware MPI */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the `current` state of the link. It may change between SF invocations that reuse the link */
  if (sf->persistent) { /* If data is directly passed to MPI and initializes MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}
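/*
   Illustrative lifecycle note (not code from this file): a matching PetscSFBcastEnd()/PetscSFReduceEnd()
   later locates this link by the (rootdata, leafdata) keys recorded above, completes communication via
   link->FinishCommunication(), unpacks into root/leafdata if a separate buffer was used, and returns
   the link to bas->avail for reuse.
*/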