#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>
#include <petscpkg_version.h>

/* Convenience local types */
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
typedef MPI_Count PetscSFCount;
typedef MPI_Aint  PetscSFAint;
#else
typedef PetscMPIInt PetscSFCount;
typedef PetscMPIInt PetscSFAint;
#endif
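
/* With MPI large-count support and 64-bit PetscInt, counts and displacements are carried in the
   MPI-4 large-count types, which the MPIU_* wrappers used below presumably route to the "_c"
   variants of the neighborhood collectives, e.g. (a sketch, not the wrappers' actual definition):

     MPI_Count cnts[2];
     MPI_Aint  displs[2];
     MPI_Ineighbor_alltoallv_c(sbuf, cnts, displs, type, rbuf, cnts, displs, type, comm, &req);

   Otherwise both types fall back to plain PetscMPIInt. */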

typedef struct {
  SFBASICHEADER;
  MPI_Comm      comms[2];                /* Communicators with distributed topology in both directions */
  PetscBool     initialized[2];          /* Are the two communicators initialized? */
  PetscSFCount *rootcounts, *leafcounts; /* counts for non-distinguished ranks */
  PetscSFAint  *rootdispls, *leafdispls; /* displs for non-distinguished ranks */
  PetscMPIInt  *rootweights, *leafweights;
  PetscInt      rootdegree, leafdegree;
} PetscSF_Neighbor;
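
/* How the packed buffers line up with counts/displs (a hypothetical example): with one
   distinguished root neighbor (ndrootranks = 1), two non-distinguished ones, and
   rootoffset = {0, 4, 9, 13}, PetscSFSetUp_Neighbor below yields

     rootdispls = {0, 5}   // offsets into the packed root buffer, in units of link->unit
     rootcounts = {5, 4}   // number of units exchanged with each graph neighbor

   i.e. the data for graph neighbor j starts rootdispls[j] units into the buffer and is
   rootcounts[j] units long. */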

/*===================================================================================*/
/*              Internal utility routines                                            */
/*===================================================================================*/

static inline PetscErrorCode PetscLogMPIMessages(PetscInt nsend, PetscSFCount *sendcnts, MPI_Datatype sendtype, PetscInt nrecv, PetscSFCount *recvcnts, MPI_Datatype recvtype)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_LOG)) {
    petsc_isend_ct += (PetscLogDouble)nsend;
    petsc_irecv_ct += (PetscLogDouble)nrecv;

    if (sendtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(sendtype, &typesize));
      for (i = 0; i < nsend; i++) petsc_isend_len += (PetscLogDouble)(sendcnts[i] * typesize);
    }

    if (recvtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(recvtype, &typesize));
      for (i = 0; i < nrecv; i++) petsc_irecv_len += (PetscLogDouble)(recvcnts[i] * typesize);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get the communicator with distributed graph topology; it is expensive to build, so we create it on demand (instead of at PetscSFSetUp time) */
static PetscErrorCode PetscSFGetDistComm_Neighbor(PetscSF sf, PetscSFDirection direction, MPI_Comm *distcomm)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  if (!dat->initialized[direction]) {
    PetscInt           nrootranks, ndrootranks, nleafranks, ndleafranks;
    PetscMPIInt        indegree, outdegree;
    const PetscMPIInt *rootranks, *leafranks, *sources, *destinations;
    MPI_Comm           comm, *mycomm = &dat->comms[direction];

    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, NULL, NULL));       /* Which ranks will access my roots (I am a destination) */
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, &leafranks, NULL, NULL, NULL)); /* Which ranks own the roots my leaves will access (I am a source) */
    indegree     = nrootranks - ndrootranks;
    outdegree    = nleafranks - ndleafranks;
    sources      = PetscSafePointerPlusOffset(rootranks, ndrootranks);
    destinations = PetscSafePointerPlusOffset(leafranks, ndleafranks);
    PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
    if (direction == PETSCSF_LEAF2ROOT) {
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, indegree, sources, dat->rootweights, outdegree, destinations, dat->leafweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    } else { /* PETSCSF_ROOT2LEAF, reverse src & dest */
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, outdegree, destinations, dat->leafweights, indegree, sources, dat->rootweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    }
    dat->initialized[direction] = PETSC_TRUE;
  }
  *distcomm = dat->comms[direction];
  PetscFunctionReturn(PETSC_SUCCESS);
}
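
/* A hypothetical example: suppose rank 5 has leaves referencing our roots (so rootranks contains 5)
   and our leaves reference roots owned by ranks 3 and 7 (leafranks contains 3 and 7). For
   PETSCSF_LEAF2ROOT the call above is then effectively

     MPI_Dist_graph_create_adjacent(comm, 1, {5}, w, 2, {3, 7}, w', MPI_INFO_NULL, 1, &newcomm);

   i.e. we receive leaf contributions from rank 5 and send our leaf data to ranks 3 and 7;
   PETSCSF_ROOT2LEAF simply swaps the source and destination lists. */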

// Start MPI_Ineighbor_alltoallv (only used for inter-process communication)
static PetscErrorCode PetscSFLinkStartCommunication_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat      = (PetscSF_Neighbor *)sf->data;
  MPI_Comm          distcomm = MPI_COMM_NULL;
  void             *rootbuf = NULL, *leafbuf = NULL;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, &rootbuf, &leafbuf, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));

  if (dat->rootdegree || dat->leafdegree) { // Open MPI 3.0 errored when rootdegree = leafdegree = 0, so we skip the call in that case
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
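
/* MPI_Ineighbor_alltoallv semantics: segment j of the send buffer goes to the j-th out-neighbor
   of the graph communicator, and segment j of the receive buffer is filled from the j-th
   in-neighbor. Only the request is started here; completion (presumably an MPI_Waitall on req in
   the generic SF link code) must happen before the received buffer is unpacked. */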

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor  *dat           = (PetscSF_Neighbor *)sf->data;
  MPI_Comm           distcomm      = MPI_COMM_NULL;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi;
  MPI_Request       *req            = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  void              *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi], *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  MPI_Info           info;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  if (dat->rootdegree || dat->leafdegree) {
    if (!link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCallMPI(MPI_Info_create(&info)); // currently, we don't use info
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, info, req));
      } else {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, info, req));
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
      PetscCallMPI(MPI_Info_free(&info));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Start MPI requests. If using non-GPU-aware MPI, we might need to copy data from the device buffer to the host buffer first
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link));
  if (dat->rootdegree || dat->leafdegree) {
    PetscCallMPI(MPI_Start(req));
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
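
/* A minimal sketch of the persistent pattern used above (assuming MPI-4 support): the request is
   created once and reused across iterations, with buffers and arguments held fixed.

     MPI_Request r;
     MPI_Neighbor_alltoallv_init(sbuf, scnts, sdispls, t, rbuf, rcnts, rdispls, t, comm, info, &r);
     for (int it = 0; it < nit; it++) {
       MPI_Start(&r);
       MPI_Wait(&r, MPI_STATUS_IGNORE);
     }
     MPI_Request_free(&r); */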

static PetscErrorCode PetscSFSetCommunicationOps_Neighbor(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  if (sf->persistent) {
    link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Neighbor;
    link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Neighbor;
  } else
#endif
  {
    link->StartCommunication = PetscSFLinkStartCommunication_Neighbor;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/*              Implementations of SF public APIs                                    */
/*===================================================================================*/
static PetscErrorCode PetscSFSetUp_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  PetscInt          i, j, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt   *rootoffset, *leafoffset;
  PetscMPIInt       m, n, m2, n2;

  PetscFunctionBegin;
  /* SFNeighbor inherits from Basic */
  PetscCall(PetscSFSetUp_Basic(sf));
  /* SFNeighbor specific */
  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
  dat->rootdegree = m = (PetscMPIInt)(nrootranks - ndrootranks);
  dat->leafdegree = n = (PetscMPIInt)(nleafranks - ndleafranks);
  sf->nleafreqs       = 0;
  dat->nrootreqs      = 1; // a neighborhood collective needs only one MPI_Request; we store it in rootreqs[]

  m2 = m;
  n2 = n;
#if defined(PETSC_HAVE_OPENMPI) // workaround for an OpenMPI 5.0.x bug, https://github.com/open-mpi/ompi/pull/12614
  #if PETSC_PKG_OPENMPI_VERSION_LE(5, 0, 3)
  m2 = m ? m : 1;
  n2 = n ? n : 1;
  #endif
#endif
  // Only set up MPI displs/counts for non-distinguished ranks; distinguished ranks use shared memory
  PetscCall(PetscMalloc6(m2, &dat->rootdispls, m2, &dat->rootcounts, m2, &dat->rootweights, n2, &dat->leafdispls, n2, &dat->leafcounts, n2, &dat->leafweights));

#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    dat->rootdispls[j]  = rootoffset[i] - rootoffset[ndrootranks];
    dat->rootcounts[j]  = rootoffset[i + 1] - rootoffset[i];
    dat->rootweights[j] = (PetscMPIInt)((PetscReal)dat->rootcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647); /* Scale to range of PetscMPIInt */
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    dat->leafdispls[j]  = leafoffset[i] - leafoffset[ndleafranks];
    dat->leafcounts[j]  = leafoffset[i + 1] - leafoffset[i];
    dat->leafweights[j] = (PetscMPIInt)((PetscReal)dat->leafcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647);
  }
#else
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    PetscCall(PetscMPIIntCast(rootoffset[i] - rootoffset[ndrootranks], &m));
    dat->rootdispls[j] = m;
    PetscCall(PetscMPIIntCast(rootoffset[i + 1] - rootoffset[i], &n));
    dat->rootcounts[j]  = n;
    dat->rootweights[j] = n;
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    PetscCall(PetscMPIIntCast(leafoffset[i] - leafoffset[ndleafranks], &m));
    dat->leafdispls[j] = m;
    PetscCall(PetscMPIIntCast(leafoffset[i + 1] - leafoffset[i], &n));
    dat->leafcounts[j]  = n;
    dat->leafweights[j] = n;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
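
/* Note on the weights: with 64-bit indices a per-neighbor count may exceed the int range MPI
   expects for graph edge weights, so counts are scaled proportionally into [0, 2147483647]; e.g.
   a (hypothetical) count of PETSC_MAX_INT/2 maps to a weight of about 1073741823. The weights are
   presumably only hints to the MPI implementation (e.g. for rank reordering, since reorder = 1 is
   passed above), so the loss of precision is harmless. */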

static PetscErrorCode PetscSFReset_Neighbor(PetscSF sf)
{
  PetscInt          i;
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  PetscCheck(!dat->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree6(dat->rootdispls, dat->rootcounts, dat->rootweights, dat->leafdispls, dat->leafcounts, dat->leafweights));
  for (i = 0; i < 2; i++) {
    if (dat->initialized[i]) {
      PetscCallMPI(MPI_Comm_free(&dat->comms[i]));
      dat->initialized[i] = PETSC_FALSE;
    }
  }
  PetscCall(PetscSFReset_Basic(sf)); /* Common part */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFDestroy_Neighbor(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Neighbor(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode PetscSFCreate_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat;

  PetscFunctionBegin;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->View                 = PetscSFView_Basic;

  sf->ops->SetUp               = PetscSFSetUp_Neighbor;
  sf->ops->Reset               = PetscSFReset_Neighbor;
  sf->ops->Destroy             = PetscSFDestroy_Neighbor;
  sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Neighbor;

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  PetscObjectOptionsBegin((PetscObject)sf);
  PetscCall(PetscOptionsBool("-sf_neighbor_persistent", "Use MPI-4 persistent neighborhood collectives; used along with -sf_type neighbor", "PetscSFCreate", sf->persistent, &sf->persistent, NULL));
  PetscOptionsEnd();
#endif
  sf->collective = PETSC_TRUE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
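
/* Usage sketch (an illustration, assuming an application that calls PetscSFSetFromOptions()):

     mpiexec -n 4 ./app -sf_type neighbor -sf_neighbor_persistent

   or programmatically with PetscSFSetType(sf, PETSCSFNEIGHBOR). */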