xref: /petsc/src/vec/is/sf/impls/basic/sfmpi.c (revision 6677b1c16ddd84d9490cb5064381f6d7f4cdad93)
#include <../src/vec/is/sf/impls/basic/sfpack.h>

// Though there is no default mechanism to start a communication, we have a
// default mechanism to finish a communication, which is simply waiting on the requests.
// It works for both non-blocking and persistent send/recvs as well as collectives.
static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Basic     *bas           = (PetscSF_Basic *)sf->data;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
  if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after receiving */));
  } else {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

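/* An SF type can replace this default by setting link->FinishCommunication in its SetCommunicationOps
   method (invoked via PetscTryTypeMethod() below). A minimal sketch, assuming a hypothetical SF type
   with its own completion logic:

     static PetscErrorCode PetscSFLinkFinishCommunication_MyType(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
     {
       PetscFunctionBegin;
       // wait on whatever type-specific resource tracks the outstanding transfers, instead of MPI requests
       PetscFunctionReturn(PETSC_SUCCESS);
     }

     // ... and in the type's SetCommunicationOps implementation:
     // link->FinishCommunication = PetscSFLinkFinishCommunication_MyType;
*/
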
/*
   This routine creates a communication link for the given operation. It first looks up the link cache. If
   there is a free & suitable link, it uses it. Otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
   can be directly passed to MPI, we do not allocate them. Even when we allocate buffers, we only allocate
   those that are needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.

   The routine also allocates buffers on the CPU when one does not use GPU-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to
   pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
{
  PetscSF_Basic   *bas = (PetscSF_Basic *)sf->data;
  PetscInt         i, j, k, nrootreqs, nleafreqs, nreqs;
  PetscSFLink     *p, link;
  PetscSFDirection direction;
  MPI_Request     *reqs = NULL;
  PetscBool        match, rootdirect[2], leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it as a subscript */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi, leafmtype_mpi;   /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i];                                                  /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i];                                                    /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else {                                                                                /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE;                                                          /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE;                                                          /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }
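  /* In summary, for each scope i (LOCAL, REMOTE):
       sfop     rootdirect[i]                         leafdirect[i]
       BCAST    rootcontig[i]                         leafcontig[i] && op == MPI_REPLACE
       REDUCE   rootcontig[i] && op == MPI_REPLACE    leafcontig[i]
       FETCH    false                                 false
  */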

  // NEVER use root/leafdirect[] for persistent collectives. Otherwise, suppose that the first time around, all ranks build
  // a persistent MPI request in a collective call. Then in a second call to PetscSFBcast, one rank uses root/leafdirect
  // but with new rootdata/leafdata pointers, while the other ranks keep using the same rootdata/leafdata pointers as last time.
  // Only that rank will try to rebuild the request with a collective call, resulting in a hang. We could call
  // MPI_Allreduce() every time to detect changes in root/leafdata, but that is too expensive for sparse communication.
  // So we always set root/leafdirect[] to false and allocate additional root/leaf buffers for persistent collectives.
  if (sf->persistent && sf->collective) {
    rootdirect[PETSCSF_REMOTE] = PETSC_FALSE;
    leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
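  /* At a glance (the leaf case is analogous):
       rootdirect[PETSCSF_REMOTE]   rootmtype_mpi == rootmtype   rootdirect_mpi
       true                         true                         1 (MPI reads/writes rootdata in place)
       true                         false                        0 (data staged through a host buffer)
       false                        any                          0 (a separate rootbuf is used)
  */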

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in the cache */
  for (p = &bas->avail; (link = *p); p = &link->next) {
    if (!link->use_nvshmem) { /* Only check MPI links */
      PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
      if (match) {
        /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
           If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests() with the same tag.
        */
        if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
          reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
          for (i = 0; i < nrootreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
        }
        if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
          reqs = link->leafreqs[direction][leafmtype][1];
          for (i = 0; i < nleafreqs; i++) {
            if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
          }
          link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
        }
        *p = link->next; /* Remove from the available list */
        goto found;
      }
    }
  }

  PetscCall(PetscNew(&link));
  PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
  PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */

  nreqs = (nrootreqs + nleafreqs) * 8;
  PetscCall(PetscMalloc1(nreqs, &link->reqs));
  for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to MPI_REQUEST_NULL so that we know which requests need to be freed in Destroy */

  if (nreqs)
    for (i = 0; i < 2; i++) {     /* Two communication directions */
      for (j = 0; j < 2; j++) {   /* Two memory types */
        for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
          link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
          link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
        }
      }
    }
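  /* Layout of the flat link->reqs array (length (nrootreqs + nleafreqs) * 8): the first nrootreqs * 8 entries
     hold root requests and the rest hold leaf requests. Within each half, the block for a given
     (direction i, memtype j, direct k) triple starts at stride * (4 * i + 2 * j + k), so every combination
     owns its own contiguous run of requests and can be waited on independently. */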

  link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
  // Each SF type can customize its communication by setting function pointers in the link.
  // Currently only BASIC and NEIGHBOR use this abstraction.
  PetscTryTypeMethod(sf, SetCommunicationOps, link);

found:

#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
  #if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Set up streams etc */
  #endif
  #if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Set up streams etc */
  #endif
  #if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
  #endif
  }
#endif

  /* Allocate buffers for root/leafdata */
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }

    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on the host for staging device data when not using gpu-aware MPI */
  if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif
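  /* With these host buffers, a root-to-leaf transfer without gpu-aware MPI is staged roughly as: pack device
     rootdata into the device rootbuf, copy it device-to-host, hand the host rootbuf to MPI, receive into the
     host leafbuf, copy it host-to-device (see PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI in the default
     finish routine above), and unpack into device leafdata. */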

  /* Set the `current` state of the link. It may change between different SF invocations with the same link */
  if (sf->persistent) { /* If data is directly passed to MPI and used to init MPI requests, record it for comparison in future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }

  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(PETSC_SUCCESS);
}
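
/* A minimal usage sketch (the packing and request-starting helpers are elided; only names visible in this
   file are used, with MPIU_SCALAR assumed as the unit type):

     PetscSFLink link;
     PetscCall(PetscSFLinkCreate_MPI(sf, MPIU_SCALAR, rootmtype, rootdata, leafmtype, leafdata, MPI_REPLACE, PETSCSF_BCAST, &link));
     // ... pack rootdata into link->rootbuf and start the sends/recvs ...
     PetscCall((*link->FinishCommunication)(sf, link, PETSCSF_ROOT2LEAF)); // wait on the outstanding requests
*/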