xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision f5d27ee7dde11c933a8c6f6ef7ab9e5456705271)
171438e86SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
271438e86SJunchao Zhang 
3*f5d27ee7SJunchao Zhang // Though there is no default machanism to start a communication, we have a
4*f5d27ee7SJunchao Zhang // default to finish communication, which is just waiting on the requests.
5*f5d27ee7SJunchao Zhang // It should work for both non-blocking or persistent send/recvs or collectivwes.
6*f5d27ee7SJunchao Zhang static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
7d71ae5a4SJacob Faibussowitsch {
871438e86SJunchao Zhang   PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
971438e86SJunchao Zhang   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
1071438e86SJunchao Zhang   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
1171438e86SJunchao Zhang 
1271438e86SJunchao Zhang   PetscFunctionBegin;
13*f5d27ee7SJunchao Zhang   if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
14*f5d27ee7SJunchao Zhang   if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
1571438e86SJunchao Zhang   if (direction == PETSCSF_ROOT2LEAF) {
169566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after recving */));
1771438e86SJunchao Zhang   } else {
189566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
1971438e86SJunchao Zhang   }
203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2171438e86SJunchao Zhang }
2271438e86SJunchao Zhang 
2371438e86SJunchao Zhang /*
2471438e86SJunchao Zhang    The routine Creates a communication link for the given operation. It first looks up its link cache. If
2571438e86SJunchao Zhang    there is a free & suitable one, it uses it. Otherwise it creates a new one.
2671438e86SJunchao Zhang 
2771438e86SJunchao Zhang    A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
2871438e86SJunchao Zhang    root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafata
2971438e86SJunchao Zhang    can be directly passed to MPI, we won't allocate them. Even we allocate buffers, we only allocate
3071438e86SJunchao Zhang    those that are needed by the given `sfop` and `op`, in other words, we do lazy memory-allocation.
3171438e86SJunchao Zhang 
3271438e86SJunchao Zhang    The routine also allocates buffers on CPU when one does not use gpu-aware MPI but data is on GPU.
3371438e86SJunchao Zhang 
3471438e86SJunchao Zhang    In SFBasic, MPI requests are persistent. They are init'ed until we try to get requests from a link.
3571438e86SJunchao Zhang 
3671438e86SJunchao Zhang    The routine is shared by SFBasic and SFNeighbor based on the fact they all deal with sparse graphs and
3771438e86SJunchao Zhang    need pack/unpack data.
3871438e86SJunchao Zhang */
39d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
40d71ae5a4SJacob Faibussowitsch {
4171438e86SJunchao Zhang   PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
4271438e86SJunchao Zhang   PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
4371438e86SJunchao Zhang   PetscSFLink     *p, link;
4471438e86SJunchao Zhang   PetscSFDirection direction;
4571438e86SJunchao Zhang   MPI_Request     *reqs = NULL;
4671438e86SJunchao Zhang   PetscBool        match, rootdirect[2], leafdirect[2];
4771438e86SJunchao Zhang   PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it in subscript */
4871438e86SJunchao Zhang   PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
4971438e86SJunchao Zhang   PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
5071438e86SJunchao Zhang   PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI*/
5171438e86SJunchao Zhang 
5271438e86SJunchao Zhang   PetscFunctionBegin;
5371438e86SJunchao Zhang 
5471438e86SJunchao Zhang   /* Can we directly use root/leafdirect with the given sf, sfop and op? */
5571438e86SJunchao Zhang   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
5671438e86SJunchao Zhang     if (sfop == PETSCSF_BCAST) {
5771438e86SJunchao Zhang       rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
5871438e86SJunchao Zhang       leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
5971438e86SJunchao Zhang     } else if (sfop == PETSCSF_REDUCE) {
6071438e86SJunchao Zhang       leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
6171438e86SJunchao Zhang       rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
6271438e86SJunchao Zhang     } else {                                                                                /* PETSCSF_FETCH */
6371438e86SJunchao Zhang       rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always need a separate rootbuf */
6471438e86SJunchao Zhang       leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share mpi requests */
6571438e86SJunchao Zhang     }
6671438e86SJunchao Zhang   }
6771438e86SJunchao Zhang 
6871438e86SJunchao Zhang   if (sf->use_gpu_aware_mpi) {
6971438e86SJunchao Zhang     rootmtype_mpi = rootmtype;
7071438e86SJunchao Zhang     leafmtype_mpi = leafmtype;
7171438e86SJunchao Zhang   } else {
7271438e86SJunchao Zhang     rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
7371438e86SJunchao Zhang   }
74da81f932SPierre Jolivet   /* Will root/leafdata be directly accessed by MPI?  Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
7571438e86SJunchao Zhang   rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
7671438e86SJunchao Zhang   leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
7771438e86SJunchao Zhang 
7871438e86SJunchao Zhang   direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
7971438e86SJunchao Zhang   nrootreqs = bas->nrootreqs;
8071438e86SJunchao Zhang   nleafreqs = sf->nleafreqs;
8171438e86SJunchao Zhang 
8271438e86SJunchao Zhang   /* Look for free links in cache */
8371438e86SJunchao Zhang   for (p = &bas->avail; (link = *p); p = &link->next) {
8471438e86SJunchao Zhang     if (!link->use_nvshmem) { /* Only check with MPI links */
859566063dSJacob Faibussowitsch       PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
8671438e86SJunchao Zhang       if (match) {
8771438e86SJunchao Zhang         /* If root/leafdata will be directly passed to MPI, test if the data used to initialized the MPI requests matches with the current.
8871438e86SJunchao Zhang            If not, free old requests. New requests will be lazily init'ed until one calls PetscSFLinkGetMPIBuffersAndRequests().
8971438e86SJunchao Zhang         */
9071438e86SJunchao Zhang         if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
9171438e86SJunchao Zhang           reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
929371c9d4SSatish Balay           for (i = 0; i < nrootreqs; i++) {
939371c9d4SSatish Balay             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
949371c9d4SSatish Balay           }
9571438e86SJunchao Zhang           link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
9671438e86SJunchao Zhang         }
9771438e86SJunchao Zhang         if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
9871438e86SJunchao Zhang           reqs = link->leafreqs[direction][leafmtype][1];
999371c9d4SSatish Balay           for (i = 0; i < nleafreqs; i++) {
1009371c9d4SSatish Balay             if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
1019371c9d4SSatish Balay           }
10271438e86SJunchao Zhang           link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
10371438e86SJunchao Zhang         }
10471438e86SJunchao Zhang         *p = link->next; /* Remove from available list */
10571438e86SJunchao Zhang         goto found;
10671438e86SJunchao Zhang       }
10771438e86SJunchao Zhang     }
10871438e86SJunchao Zhang   }
10971438e86SJunchao Zhang 
1109566063dSJacob Faibussowitsch   PetscCall(PetscNew(&link));
1119566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
1129566063dSJacob Faibussowitsch   PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */
11371438e86SJunchao Zhang 
11471438e86SJunchao Zhang   nreqs = (nrootreqs + nleafreqs) * 8;
1159566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nreqs, &link->reqs));
11671438e86SJunchao Zhang   for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
11771438e86SJunchao Zhang 
1185c0db29aSPierre Jolivet   if (nreqs)
11971438e86SJunchao Zhang     for (i = 0; i < 2; i++) {     /* Two communication directions */
12071438e86SJunchao Zhang       for (j = 0; j < 2; j++) {   /* Two memory types */
12171438e86SJunchao Zhang         for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
12271438e86SJunchao Zhang           link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
12371438e86SJunchao Zhang           link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
12471438e86SJunchao Zhang         }
12571438e86SJunchao Zhang       }
12671438e86SJunchao Zhang     }
127*f5d27ee7SJunchao Zhang 
128*f5d27ee7SJunchao Zhang   link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
129*f5d27ee7SJunchao Zhang   // each SF type could customize their communication by setting function pointers in the link.
130*f5d27ee7SJunchao Zhang   // Currently only BASIC and NEIGHBOR use this abstraction.
131*f5d27ee7SJunchao Zhang   PetscTryTypeMethod(sf, SetCommunicationOps, link);
13271438e86SJunchao Zhang 
13371438e86SJunchao Zhang found:
13471438e86SJunchao Zhang 
13571438e86SJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
13671438e86SJunchao Zhang   if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
13771438e86SJunchao Zhang   #if defined(PETSC_HAVE_CUDA)
1389566063dSJacob Faibussowitsch     if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
13971438e86SJunchao Zhang   #endif
14071438e86SJunchao Zhang   #if defined(PETSC_HAVE_HIP)
1419566063dSJacob Faibussowitsch     if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
14271438e86SJunchao Zhang   #endif
14371438e86SJunchao Zhang   #if defined(PETSC_HAVE_KOKKOS)
1449566063dSJacob Faibussowitsch     if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
14571438e86SJunchao Zhang   #endif
14671438e86SJunchao Zhang   }
14771438e86SJunchao Zhang #endif
14871438e86SJunchao Zhang 
14971438e86SJunchao Zhang   /* Allocate buffers along root/leafdata */
15071438e86SJunchao Zhang   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
15171438e86SJunchao Zhang     /* For local communication, buffers are only needed when roots and leaves have different mtypes */
15271438e86SJunchao Zhang     if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
15371438e86SJunchao Zhang     if (bas->rootbuflen[i]) {
15471438e86SJunchao Zhang       if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
15571438e86SJunchao Zhang         link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
15671438e86SJunchao Zhang       } else { /* Have to have a separate rootbuf */
15748a46eb9SPierre Jolivet         if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
15871438e86SJunchao Zhang         link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
15971438e86SJunchao Zhang       }
16071438e86SJunchao Zhang     }
16171438e86SJunchao Zhang 
16271438e86SJunchao Zhang     if (sf->leafbuflen[i]) {
16371438e86SJunchao Zhang       if (leafdirect[i]) {
16471438e86SJunchao Zhang         link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
16571438e86SJunchao Zhang       } else {
16648a46eb9SPierre Jolivet         if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
16771438e86SJunchao Zhang         link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
16871438e86SJunchao Zhang       }
16971438e86SJunchao Zhang     }
17071438e86SJunchao Zhang   }
17171438e86SJunchao Zhang 
17271438e86SJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
17371438e86SJunchao Zhang   /* Allocate buffers on host for buffering data on device in cast not use_gpu_aware_mpi */
17471438e86SJunchao Zhang   if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
17548a46eb9SPierre Jolivet     if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
17671438e86SJunchao Zhang     link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
17771438e86SJunchao Zhang   }
17871438e86SJunchao Zhang   if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
17948a46eb9SPierre Jolivet     if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
18071438e86SJunchao Zhang     link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
18171438e86SJunchao Zhang   }
18271438e86SJunchao Zhang #endif
18371438e86SJunchao Zhang 
18471438e86SJunchao Zhang   /* Set `current` state of the link. They may change between different SF invocations with the same link */
18571438e86SJunchao Zhang   if (sf->persistent) { /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
18671438e86SJunchao Zhang     if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
18771438e86SJunchao Zhang     if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
18871438e86SJunchao Zhang   }
18971438e86SJunchao Zhang 
19071438e86SJunchao Zhang   link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
19171438e86SJunchao Zhang   link->leafdata = leafdata;
19271438e86SJunchao Zhang   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
19371438e86SJunchao Zhang     link->rootdirect[i] = rootdirect[i];
19471438e86SJunchao Zhang     link->leafdirect[i] = leafdirect[i];
19571438e86SJunchao Zhang   }
19671438e86SJunchao Zhang   link->rootdirect_mpi = rootdirect_mpi;
19771438e86SJunchao Zhang   link->leafdirect_mpi = leafdirect_mpi;
19871438e86SJunchao Zhang   link->rootmtype      = rootmtype;
19971438e86SJunchao Zhang   link->leafmtype      = leafmtype;
20071438e86SJunchao Zhang   link->rootmtype_mpi  = rootmtype_mpi;
20171438e86SJunchao Zhang   link->leafmtype_mpi  = leafmtype_mpi;
20271438e86SJunchao Zhang 
20371438e86SJunchao Zhang   link->next = bas->inuse;
20471438e86SJunchao Zhang   bas->inuse = link;
20571438e86SJunchao Zhang   *mylink    = link;
2063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
20771438e86SJunchao Zhang }
208