#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <../src/vec/is/sf/impls/basic/sfbasic.h>

/* Convenience local types */
#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
typedef MPI_Count PetscSFCount;
typedef MPI_Aint  PetscSFAint;
#else
typedef PetscMPIInt PetscSFCount;
typedef PetscMPIInt PetscSFAint;
#endif

typedef struct {
  SFBASICHEADER;
  MPI_Comm      comms[2];                /* Communicators with distributed topology in both directions */
  PetscBool     initialized[2];          /* Are the two communicators initialized? */
  PetscSFCount *rootcounts, *leafcounts; /* counts for non-distinguished ranks */
  PetscSFAint  *rootdispls, *leafdispls; /* displs for non-distinguished ranks */
  PetscMPIInt  *rootweights, *leafweights;
  PetscInt      rootdegree, leafdegree;
} PetscSF_Neighbor;
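/* Note: rootcounts/rootdispls and leafcounts/leafdispls are the per-neighbor count and
   displacement arrays handed to the (i)neighbor_alltoallv calls on the graph communicators in
   comms[]. With MPI large-count support and 64-bit indices they are typed MPI_Count/MPI_Aint,
   presumably so the MPIU_ wrappers can dispatch to the large-count variants; otherwise they are
   plain PetscMPIInt. The weights arrays are only passed to MPI_Dist_graph_create_adjacent() as
   edge-weight hints. */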

/*===================================================================================*/
/*              Internal utility routines                                            */
/*===================================================================================*/

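/* Credit a neighborhood collective to PETSc's MPI message logging as if it consisted of nsend
   individual sends and nrecv individual receives of the given datatypes */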
static inline PetscErrorCode PetscLogMPIMessages(PetscInt nsend, PetscSFCount *sendcnts, MPI_Datatype sendtype, PetscInt nrecv, PetscSFCount *recvcnts, MPI_Datatype recvtype)
{
  PetscFunctionBegin;
  if (PetscDefined(USE_LOG)) {
    petsc_isend_ct += (PetscLogDouble)nsend;
    petsc_irecv_ct += (PetscLogDouble)nrecv;

    if (sendtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(sendtype, &typesize));
      for (i = 0; i < nsend; i++) petsc_isend_len += (PetscLogDouble)(sendcnts[i] * typesize);
    }

    if (recvtype != MPI_DATATYPE_NULL) {
      PetscMPIInt i, typesize;
      PetscCallMPI(MPI_Type_size(recvtype, &typesize));
      for (i = 0; i < nrecv; i++) petsc_irecv_len += (PetscLogDouble)(recvcnts[i] * typesize);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Get the communicator with distributed graph topology, which is not cheap to build so we do it on demand (instead of at PetscSFSetUp time) */
static PetscErrorCode PetscSFGetDistComm_Neighbor(PetscSF sf, PetscSFDirection direction, MPI_Comm *distcomm)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  if (!dat->initialized[direction]) {
    PetscInt           nrootranks, ndrootranks, nleafranks, ndleafranks;
    PetscMPIInt        indegree, outdegree;
    const PetscMPIInt *rootranks, *leafranks, *sources, *destinations;
    MPI_Comm           comm, *mycomm = &dat->comms[direction];

    PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, &rootranks, NULL, NULL));       /* Which ranks will access my roots (I am a destination) */
    PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, &leafranks, NULL, NULL, NULL)); /* Which ranks' roots my leaves will access (I am a source) */
    indegree     = nrootranks - ndrootranks;
    outdegree    = nleafranks - ndleafranks;
    sources      = rootranks + ndrootranks;
    destinations = leafranks + ndleafranks;
    PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
    if (direction == PETSCSF_LEAF2ROOT) {
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, indegree, sources, dat->rootweights, outdegree, destinations, dat->leafweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    } else { /* PETSCSF_ROOT2LEAF, reverse src & dest */
      PetscCallMPI(MPI_Dist_graph_create_adjacent(comm, outdegree, destinations, dat->leafweights, indegree, sources, dat->rootweights, MPI_INFO_NULL, 1 /*reorder*/, mycomm));
    }
    dat->initialized[direction] = PETSC_TRUE;
  }
  *distcomm = dat->comms[direction];
  PetscFunctionReturn(PETSC_SUCCESS);
}
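/* Illustrative example (an assumption for exposition, not from the original source): suppose
   this rank's roots are accessed by leaves on ranks 1 and 3, while its own leaves reference
   roots owned by rank 2. Then the LEAF2ROOT communicator gives this rank in-neighbors {1,3}
   (it receives leaf contributions into its roots) and out-neighbor {2} (it sends its leaf
   data), and the ROOT2LEAF communicator simply swaps the two neighbor lists. reorder = 1
   permits MPI to renumber ranks in the new communicator. */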

// Start MPI_Ineighbor_alltoallv (only used for inter-process communication)
static PetscErrorCode PetscSFLinkStartCommunication_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat      = (PetscSF_Neighbor *)sf->data;
  MPI_Comm          distcomm = MPI_COMM_NULL;
  void             *rootbuf = NULL, *leafbuf = NULL;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, &rootbuf, &leafbuf, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));

  if (dat->rootdegree || dat->leafdegree) { // Open MPI 3.0 errored when rootdegree = leafdegree = 0, so we skip the call in that case
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCallMPI(MPIU_Ineighbor_alltoallv(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, req));
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
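/* Note: on a distributed-graph communicator the send counts/displs are indexed by out-neighbor
   and the receive counts/displs by in-neighbor, so for ROOT2LEAF the packed root buffer is
   scattered to the ranks whose leaves reference my roots while the leaf buffer receives one
   contiguous block per remote root owner (and vice versa for LEAF2ROOT). The single request
   returned here is presumably completed later by the generic SF link wait logic. */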

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
static PetscErrorCode PetscSFLinkInitMPIRequests_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor  *dat           = (PetscSF_Neighbor *)sf->data;
  MPI_Comm           distcomm      = MPI_COMM_NULL;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi;
  MPI_Request       *req            = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  void              *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi], *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  MPI_Info           info;

  PetscFunctionBegin;
  PetscCall(PetscSFGetDistComm_Neighbor(sf, direction, &distcomm));
  if (dat->rootdegree || dat->leafdegree) {
    if (!link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      PetscCallMPI(MPI_Info_create(&info)); // currently, we don't use info
      if (direction == PETSCSF_ROOT2LEAF) {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(rootbuf, dat->rootcounts, dat->rootdispls, link->unit, leafbuf, dat->leafcounts, dat->leafdispls, link->unit, distcomm, info, req));
      } else {
        PetscCallMPI(MPIU_Neighbor_alltoallv_init(leafbuf, dat->leafcounts, dat->leafdispls, link->unit, rootbuf, dat->rootcounts, dat->rootdispls, link->unit, distcomm, info, req));
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
      PetscCallMPI(MPI_Info_free(&info));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Start MPI requests. If not using GPU-aware MPI, we might need to copy data from the device buffer to the host buffer
static PetscErrorCode PetscSFLinkStartCommunication_Persistent_Neighbor(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  MPI_Request      *req = NULL;

  PetscFunctionBegin;
  if (direction == PETSCSF_ROOT2LEAF) {
    PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host before sending */));
  } else {
    PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_TRUE /* device2host */));
  }

  PetscCall(PetscSFLinkGetMPIBuffersAndRequests(sf, link, direction, NULL, NULL, &req, NULL));
  PetscCall(PetscSFLinkSyncStreamBeforeCallMPI(sf, link, direction));
  if (dat->rootdegree || dat->leafdegree) {
    PetscCallMPI(MPI_Start(req));
    if (direction == PETSCSF_ROOT2LEAF) {
      PetscCall(PetscLogMPIMessages(dat->rootdegree, dat->rootcounts, link->unit, dat->leafdegree, dat->leafcounts, link->unit));
    } else {
      PetscCall(PetscLogMPIMessages(dat->leafdegree, dat->leafcounts, link->unit, dat->rootdegree, dat->rootcounts, link->unit));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif
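/* Sketch of the MPI-4 persistent pattern the two routines above rely on (illustrative only;
   the buffer/count arguments are bound once and must stay valid across restarts):

     MPI_Neighbor_alltoallv_init(sbuf, scnts, sdispls, unit, rbuf, rcnts, rdispls, unit, graphcomm, info, &req);
     for (i = 0; i < nexchanges; i++) {
       MPI_Start(&req);                   // restart the prebuilt collective
       // ... overlap computation here ...
       MPI_Wait(&req, MPI_STATUS_IGNORE); // request becomes inactive, ready for the next Start
     }
     MPI_Request_free(&req);

   This is why InitMPIRequests builds the request only once per link/direction and
   StartCommunication merely calls MPI_Start(). */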

static PetscErrorCode PetscSFSetCommunicationOps_Neighbor(PetscSF sf, PetscSFLink link)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  if (sf->persistent) {
    link->InitMPIRequests    = PetscSFLinkInitMPIRequests_Persistent_Neighbor;
    link->StartCommunication = PetscSFLinkStartCommunication_Persistent_Neighbor;
  } else
#endif
  {
    link->StartCommunication = PetscSFLinkStartCommunication_Neighbor;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*===================================================================================*/
/*              Implementations of SF public APIs                                    */
/*===================================================================================*/
static PetscErrorCode PetscSFSetUp_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;
  PetscInt          i, j, nrootranks, ndrootranks, nleafranks, ndleafranks;
  const PetscInt   *rootoffset, *leafoffset;
  PetscMPIInt       m, n;

  PetscFunctionBegin;
  /* SFNeighbor inherits from Basic */
  PetscCall(PetscSFSetUp_Basic(sf));
  /* SFNeighbor specific */
  PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
  PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
  dat->rootdegree = m = (PetscMPIInt)(nrootranks - ndrootranks);
  dat->leafdegree = n = (PetscMPIInt)(nleafranks - ndleafranks);
  sf->nleafreqs       = 0;
  dat->nrootreqs      = 1; // collectives only need one MPI_Request; we just put it in rootreqs[]

  /* Only set up MPI displs/counts for non-distinguished ranks. Distinguished ranks use shared memory */
  PetscCall(PetscMalloc6(m, &dat->rootdispls, m, &dat->rootcounts, m, &dat->rootweights, n, &dat->leafdispls, n, &dat->leafcounts, n, &dat->leafweights));

#if defined(PETSC_HAVE_MPI_LARGE_COUNT) && defined(PETSC_USE_64BIT_INDICES)
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    dat->rootdispls[j]  = rootoffset[i] - rootoffset[ndrootranks];
    dat->rootcounts[j]  = rootoffset[i + 1] - rootoffset[i];
    dat->rootweights[j] = (PetscMPIInt)((PetscReal)dat->rootcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647); /* Scale to the range of PetscMPIInt */
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    dat->leafdispls[j]  = leafoffset[i] - leafoffset[ndleafranks];
    dat->leafcounts[j]  = leafoffset[i + 1] - leafoffset[i];
    dat->leafweights[j] = (PetscMPIInt)((PetscReal)dat->leafcounts[j] / (PetscReal)PETSC_MAX_INT * 2147483647);
  }
#else
  for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
    PetscCall(PetscMPIIntCast(rootoffset[i] - rootoffset[ndrootranks], &m));
    dat->rootdispls[j] = m;
    PetscCall(PetscMPIIntCast(rootoffset[i + 1] - rootoffset[i], &n));
    dat->rootcounts[j]  = n;
    dat->rootweights[j] = n;
  }

  for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
    PetscCall(PetscMPIIntCast(leafoffset[i] - leafoffset[ndleafranks], &m));
    dat->leafdispls[j] = m;
    PetscCall(PetscMPIIntCast(leafoffset[i + 1] - leafoffset[i], &n));
    dat->leafcounts[j]  = n;
    dat->leafweights[j] = n;
  }
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
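/* Worked example (hypothetical numbers, for exposition only): displacements are offsets into
   the packed remote buffer measured from the start of its non-distinguished portion. With
   rootoffset = {0, 4, 9, 15} and ndrootranks = 1 (the distinguished rank's 4 entries go through
   shared memory), the two remaining neighbors get rootdispls = {0, 5} and rootcounts = {5, 6}.
   In the large-count branch the weights are just the counts rescaled into PetscMPIInt range,
   since MPI_Dist_graph_create_adjacent() only accepts int weights. */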

static PetscErrorCode PetscSFReset_Neighbor(PetscSF sf)
{
  PetscInt          i;
  PetscSF_Neighbor *dat = (PetscSF_Neighbor *)sf->data;

  PetscFunctionBegin;
  PetscCheck(!dat->inuse, PetscObjectComm((PetscObject)sf), PETSC_ERR_ARG_WRONGSTATE, "Outstanding operation has not been completed");
  PetscCall(PetscFree6(dat->rootdispls, dat->rootcounts, dat->rootweights, dat->leafdispls, dat->leafcounts, dat->leafweights));
  for (i = 0; i < 2; i++) {
    if (dat->initialized[i]) {
      PetscCallMPI(MPI_Comm_free(&dat->comms[i]));
      dat->initialized[i] = PETSC_FALSE;
    }
  }
  PetscCall(PetscSFReset_Basic(sf)); /* Common part */
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode PetscSFDestroy_Neighbor(PetscSF sf)
{
  PetscFunctionBegin;
  PetscCall(PetscSFReset_Neighbor(sf));
  PetscCall(PetscFree(sf->data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

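/* Usage note: this type is selected at runtime with -sf_type neighbor; with an MPI-4 library
   the -sf_neighbor_persistent option registered below additionally switches the link to
   persistent neighborhood collectives. */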
PETSC_INTERN PetscErrorCode PetscSFCreate_Neighbor(PetscSF sf)
{
  PetscSF_Neighbor *dat;

  PetscFunctionBegin;
  sf->ops->CreateEmbeddedRootSF = PetscSFCreateEmbeddedRootSF_Basic;
  sf->ops->BcastBegin           = PetscSFBcastBegin_Basic;
  sf->ops->BcastEnd             = PetscSFBcastEnd_Basic;
  sf->ops->ReduceBegin          = PetscSFReduceBegin_Basic;
  sf->ops->ReduceEnd            = PetscSFReduceEnd_Basic;
  sf->ops->FetchAndOpBegin      = PetscSFFetchAndOpBegin_Basic;
  sf->ops->FetchAndOpEnd        = PetscSFFetchAndOpEnd_Basic;
  sf->ops->GetLeafRanks         = PetscSFGetLeafRanks_Basic;
  sf->ops->View                 = PetscSFView_Basic;

  sf->ops->SetUp               = PetscSFSetUp_Neighbor;
  sf->ops->Reset               = PetscSFReset_Neighbor;
  sf->ops->Destroy             = PetscSFDestroy_Neighbor;
  sf->ops->SetCommunicationOps = PetscSFSetCommunicationOps_Neighbor;

#if defined(PETSC_HAVE_MPI_PERSISTENT_NEIGHBORHOOD_COLLECTIVES)
  PetscObjectOptionsBegin((PetscObject)sf);
  PetscCall(PetscOptionsBool("-sf_neighbor_persistent", "Use MPI-4 persistent neighborhood collectives; used along with -sf_type neighbor", "PetscSFCreate", sf->persistent, &sf->persistent, NULL));
  PetscOptionsEnd();
#endif
  sf->collective = PETSC_TRUE;

  PetscCall(PetscNew(&dat));
  sf->data = (void *)dat;
  PetscFunctionReturn(PETSC_SUCCESS);
}