xref: /petsc/src/vec/is/sf/interface/sf.c (revision 51ccb202724f7096f70b1178150053ad1fe5b38d)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petscctable.h>
4 
5 #if defined(PETSC_USE_DEBUG)
6 #  define PetscSFCheckGraphSet(sf,arg) do {                          \
7     if (PetscUnlikely(!(sf)->graphset))                              \
8       SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %D \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
9   } while (0)
10 #else
11 #  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
12 #endif
13 
14 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",0};
15 
16 /*@
17    PetscSFCreate - create a star forest communication context
18 
19    Collective
20 
21    Input Arguments:
22 .  comm - communicator on which the star forest will operate
23 
24    Output Arguments:
25 .  sf - new star forest context
26 
27    Options Database Keys:
28 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
29 .  -sf_type window    -Use MPI-3 one-sided window for communication
30 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
31 
32    Level: intermediate
33 
34    Notes:
35    When one knows the communication graph is one of the predefined graphs, such as MPI_Alltoall, MPI_Allgatherv, or
36    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
37    SFs are optimized and have better performance than general SFs.
38 
39 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
40 @*/
41 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
42 {
43   PetscErrorCode ierr;
44   PetscSF        b;
45 
46   PetscFunctionBegin;
47   PetscValidPointer(sf,2);
48   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
49 
50   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
51 
52   b->nroots    = -1;
53   b->nleaves   = -1;
54   b->minleaf   = PETSC_MAX_INT;
55   b->maxleaf   = PETSC_MIN_INT;
56   b->nranks    = -1;
57   b->rankorder = PETSC_TRUE;
58   b->ingroup   = MPI_GROUP_NULL;
59   b->outgroup  = MPI_GROUP_NULL;
60   b->graphset  = PETSC_FALSE;
61 
62   *sf = b;
63   PetscFunctionReturn(0);
64 }
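
/*
   Editor's note: a minimal usage sketch, not part of the original source, showing the typical
   PetscSF lifecycle. The one-leaf-per-rank graph below is invented for illustration, and the
   usual PETSc error handling (ierr/CHKERRQ) is assumed to be available.

     PetscSF     sf;
     PetscSFNode remote;
     PetscMPIInt rank,size;
     PetscInt    rootdata,leafdata;

     ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
     ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRQ(ierr);
     remote.rank  = (rank+1)%size;   // this rank's single leaf references root 0 of the next rank
     remote.index = 0;
     ierr = PetscSFCreate(PETSC_COMM_WORLD,&sf);CHKERRQ(ierr);
     ierr = PetscSFSetFromOptions(sf);CHKERRQ(ierr);
     ierr = PetscSFSetGraph(sf,1,1,NULL,PETSC_COPY_VALUES,&remote,PETSC_COPY_VALUES);CHKERRQ(ierr);
     ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
     rootdata = rank;
     ierr = PetscSFBcastBegin(sf,MPIU_INT,&rootdata,&leafdata);CHKERRQ(ierr);
     ierr = PetscSFBcastEnd(sf,MPIU_INT,&rootdata,&leafdata);CHKERRQ(ierr);   // leafdata now equals (rank+1)%size
     ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
*/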
65 
66 /*@
67    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
68 
69    Collective
70 
71    Input Arguments:
72 .  sf - star forest
73 
74    Level: advanced
75 
76 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
77 @*/
78 PetscErrorCode PetscSFReset(PetscSF sf)
79 {
80   PetscErrorCode ierr;
81 
82   PetscFunctionBegin;
83   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
84   if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
85   sf->nroots   = -1;
86   sf->nleaves  = -1;
87   sf->minleaf  = PETSC_MAX_INT;
88   sf->maxleaf  = PETSC_MIN_INT;
89   sf->mine     = NULL;
90   sf->remote   = NULL;
91   sf->graphset = PETSC_FALSE;
92   ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
93   ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
94   sf->nranks = -1;
95   ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
96 #if defined(PETSC_HAVE_CUDA)
97   if (sf->rmine_d) {cudaError_t err = cudaFree(sf->rmine_d);CHKERRCUDA(err);sf->rmine_d=NULL;}
98 #endif
99   sf->degreeknown = PETSC_FALSE;
100   ierr = PetscFree(sf->degree);CHKERRQ(ierr);
101   if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRQ(ierr);}
102   if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRQ(ierr);}
103   ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
104   ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
105   sf->setupcalled = PETSC_FALSE;
106   PetscFunctionReturn(0);
107 }
108 
109 /*@C
110    PetscSFSetType - Set the PetscSF communication implementation
111 
112    Collective on PetscSF
113 
114    Input Parameters:
115 +  sf - the PetscSF context
116 -  type - a known method
117 
118    Options Database Key:
119 .  -sf_type <type> - Sets the method; use -help for a list
120    of available methods (for instance, window, basic, neighbor)
121 
122    Notes:
123    See "include/petscsf.h" for available methods (for instance)
124 +    PETSCSFWINDOW - MPI-2/3 one-sided
125 -    PETSCSFBASIC - basic implementation using MPI-1 two-sided
126 
127   Level: intermediate
128 
129 .seealso: PetscSFType, PetscSFCreate()
130 @*/
131 PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
132 {
133   PetscErrorCode ierr,(*r)(PetscSF);
134   PetscBool      match;
135 
136   PetscFunctionBegin;
137   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
138   PetscValidCharPointer(type,2);
139 
140   ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
141   if (match) PetscFunctionReturn(0);
142 
143   ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
144   if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
145   /* Destroy the previous PetscSF implementation context */
146   if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
147   ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
148   ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
149   ierr = (*r)(sf);CHKERRQ(ierr);
150   PetscFunctionReturn(0);
151 }
152 
153 /*@C
154   PetscSFGetType - Get the PetscSF communication implementation
155 
156   Not Collective
157 
158   Input Parameter:
159 . sf  - the PetscSF context
160 
161   Output Parameter:
162 . type - the PetscSF type name
163 
164   Level: intermediate
165 
166 .seealso: PetscSFSetType(), PetscSFCreate()
167 @*/
168 PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
169 {
170   PetscFunctionBegin;
171   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
172   PetscValidPointer(type,2);
173   *type = ((PetscObject)sf)->type_name;
174   PetscFunctionReturn(0);
175 }
176 
177 /*@
178    PetscSFDestroy - destroy star forest
179 
180    Collective
181 
182    Input Arguments:
183 .  sf - address of star forest
184 
185    Level: intermediate
186 
187 .seealso: PetscSFCreate(), PetscSFReset()
188 @*/
189 PetscErrorCode PetscSFDestroy(PetscSF *sf)
190 {
191   PetscErrorCode ierr;
192 
193   PetscFunctionBegin;
194   if (!*sf) PetscFunctionReturn(0);
195   PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
196   if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
197   ierr = PetscSFReset(*sf);CHKERRQ(ierr);
198   if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
199   ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
200   PetscFunctionReturn(0);
201 }
202 
203 static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
204 {
205 #if defined(PETSC_USE_DEBUG)
206   PetscInt           i, nleaves;
207   PetscMPIInt        size;
208   const PetscInt    *ilocal;
209   const PetscSFNode *iremote;
210   PetscErrorCode     ierr;
211 
212   PetscFunctionBegin;
213   if (!sf->graphset) PetscFunctionReturn(0);
214   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
215   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
216   for (i = 0; i < nleaves; i++) {
217     const PetscInt rank = iremote[i].rank;
218     const PetscInt remote = iremote[i].index;
219     const PetscInt leaf = ilocal ? ilocal[i] : i;
220     if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%D) for remote %D is invalid, should be in [0, %d)",rank,i,size);
221     if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%D) for remote %D is invalid, should be >= 0",remote,i);
222     if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%D) for leaf %D is invalid, should be >= 0",leaf,i);
223   }
224   PetscFunctionReturn(0);
225 #else
226   PetscFunctionBegin;
227   PetscFunctionReturn(0);
228 #endif
229 }
230 
231 /*@
232    PetscSFSetUp - set up communication structures
233 
234    Collective
235 
236    Input Arguments:
237 .  sf - star forest communication object
238 
239    Level: beginner
240 
241 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
242 @*/
243 PetscErrorCode PetscSFSetUp(PetscSF sf)
244 {
245   PetscErrorCode ierr;
246 
247   PetscFunctionBegin;
248   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
249   PetscSFCheckGraphSet(sf,1);
250   if (sf->setupcalled) PetscFunctionReturn(0);
251   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
252   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);}
253   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
254   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
255   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
256   sf->setupcalled = PETSC_TRUE;
257   PetscFunctionReturn(0);
258 }
259 
260 /*@
261    PetscSFSetFromOptions - set PetscSF options using the options database
262 
263    Logically Collective
264 
265    Input Arguments:
266 .  sf - star forest
267 
268    Options Database Keys:
269 +  -sf_type              - implementation type, see PetscSFSetType()
270 .  -sf_rank_order        - sort composite points for gathers and scatters in rank order; otherwise gathers are non-deterministic
271 -  -sf_use_pinned_buffer - use pinned (non-pageable) memory for send/recv buffers on the host when communicating GPU data without GPU-aware MPI.
272                            Only available for the basic and neighbor SF types.
273 
274    Level: intermediate
275 
276 .seealso: PetscSFWindowSetSyncType()
277 @*/
278 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
279 {
280   PetscSFType    deft;
281   char           type[256];
282   PetscErrorCode ierr;
283   PetscBool      flg;
284 
285   PetscFunctionBegin;
286   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
287   ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
288   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
289   ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
290   ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
291   ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
292   if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
293   ierr = PetscOptionsEnd();CHKERRQ(ierr);
294   PetscFunctionReturn(0);
295 }
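
/*
   Editor's note: illustrative only, not part of the original source. Once PetscSFSetFromOptions()
   has been called on the object, the implementation and behavior can be chosen at run time using
   the keys documented above, for example (myapp is a hypothetical application name):

     ./myapp -sf_type neighbor -sf_rank_order 0
*/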
296 
297 /*@
298    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
299 
300    Logically Collective
301 
302    Input Arguments:
303 +  sf - star forest
304 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
305 
306    Level: advanced
307 
308 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
309 @*/
310 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
311 {
312 
313   PetscFunctionBegin;
314   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
315   PetscValidLogicalCollectiveBool(sf,flg,2);
316   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
317   sf->rankorder = flg;
318   PetscFunctionReturn(0);
319 }
320 
321 /*@
322    PetscSFSetGraph - Set a parallel star forest
323 
324    Collective
325 
326    Input Arguments:
327 +  sf - star forest
328 .  nroots - number of root vertices on the current process (these are possible targets for other processes to attach leaves)
329 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
330 .  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
331 during setup in debug mode)
332 .  localmode - copy mode for ilocal
333 .  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
334 during setup in debug mode)
335 -  remotemode - copy mode for iremote
336 
337    Level: intermediate
338 
339    Notes:
340     In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode
341 
342    Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
343    encode contiguous storage. In that case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
344    needed.
345 
346    Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
347    indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).
348 
349 .seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
350 @*/
351 PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
352 {
353   PetscErrorCode ierr;
354 
355   PetscFunctionBegin;
356   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
357   if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
358   if (nleaves > 0) PetscValidPointer(iremote,6);
359   if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %D, cannot be negative",nroots);
360   if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %D, cannot be negative",nleaves);
361 
362   ierr = PetscSFReset(sf);CHKERRQ(ierr);
363   ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
364 
365   sf->nroots  = nroots;
366   sf->nleaves = nleaves;
367 
368   if (nleaves && ilocal) {
369     PetscInt i;
370     PetscInt minleaf = PETSC_MAX_INT;
371     PetscInt maxleaf = PETSC_MIN_INT;
372     int      contiguous = 1;
373     for (i=0; i<nleaves; i++) {
374       minleaf = PetscMin(minleaf,ilocal[i]);
375       maxleaf = PetscMax(maxleaf,ilocal[i]);
376       contiguous &= (ilocal[i] == i);
377     }
378     sf->minleaf = minleaf;
379     sf->maxleaf = maxleaf;
380     if (contiguous) {
381       if (localmode == PETSC_OWN_POINTER) {
382         ierr = PetscFree(ilocal);CHKERRQ(ierr);
383       }
384       ilocal = NULL;
385     }
386   } else {
387     sf->minleaf = 0;
388     sf->maxleaf = nleaves - 1;
389   }
390 
391   if (ilocal) {
392     switch (localmode) {
393     case PETSC_COPY_VALUES:
394       ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
395       ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
396       sf->mine = sf->mine_alloc;
397       break;
398     case PETSC_OWN_POINTER:
399       sf->mine_alloc = (PetscInt*)ilocal;
400       sf->mine       = sf->mine_alloc;
401       break;
402     case PETSC_USE_POINTER:
403       sf->mine_alloc = NULL;
404       sf->mine       = (PetscInt*)ilocal;
405       break;
406     default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
407     }
408   }
409 
410   switch (remotemode) {
411   case PETSC_COPY_VALUES:
412     ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
413     ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
414     sf->remote = sf->remote_alloc;
415     break;
416   case PETSC_OWN_POINTER:
417     sf->remote_alloc = (PetscSFNode*)iremote;
418     sf->remote       = sf->remote_alloc;
419     break;
420   case PETSC_USE_POINTER:
421     sf->remote_alloc = NULL;
422     sf->remote       = (PetscSFNode*)iremote;
423     break;
424   default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
425   }
426 
427   ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
428   sf->graphset = PETSC_TRUE;
429   PetscFunctionReturn(0);
430 }
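
/*
   Editor's note: a small illustrative sketch, not part of the original source, of assembling
   ilocal/iremote for PetscSFSetGraph(). Every process attaches two leaves to roots 0 and 1 owned
   by rank 0; the sizes and indices are made up for illustration only.

     PetscSF     sf;
     PetscInt    nroots,ilocal[2] = {0,1};
     PetscSFNode iremote[2];
     PetscMPIInt rank;

     ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
     nroots = rank ? 0 : 2;                         // only rank 0 owns roots
     iremote[0].rank = 0; iremote[0].index = 0;     // leaf 0 references root 0 on rank 0
     iremote[1].rank = 0; iremote[1].index = 1;     // leaf 1 references root 1 on rank 0
     ierr = PetscSFCreate(PETSC_COMM_WORLD,&sf);CHKERRQ(ierr);
     ierr = PetscSFSetGraph(sf,nroots,2,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
     ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
     // ... Bcast/Reduce as needed ...
     ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
*/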
431 
432 /*@
433   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
434 
435   Collective
436 
437   Input Parameters:
438 + sf      - The PetscSF
439 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
440 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
441 
442   Notes:
443   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
444   n and N are local and global sizes of x respectively.
445 
446   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph such that a Bcast on it copies x to
447   sequential vectors y on all ranks.
448 
449   With PETSCSF_PATTERN_GATHER, the routine creates a graph such that a Bcast on it copies x to a
450   sequential vector y on rank 0.
451 
452   In the above cases, entries of x are roots and entries of y are leaves.
453 
454   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is the size of sf's communicator. The routine
455   creates a graph in which every rank has NP leaves and NP roots. On rank i, leaf j is connected to root i
456   of rank j, where 0 <= i,j < NP. It is a kind of MPI_Alltoall with sendcount/recvcount equal to 1. This does
457   not mean one cannot send multiple items. One just needs to create a new MPI datatype for the multiple data
458   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
459 
460   In this case, roots and leaves are symmetric.
461 
462   Level: intermediate
463  @*/
464 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
465 {
466   MPI_Comm       comm;
467   PetscInt       n,N,res[2];
468   PetscMPIInt    rank,size;
469   PetscSFType    type;
470   PetscErrorCode ierr;
471 
472   PetscFunctionBegin;
473   ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
474   if (pattern < PETSCSF_PATTERN_ALLGATHER || pattern > PETSCSF_PATTERN_ALLTOALL) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %D\n",pattern);
475   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
476   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
477 
478   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
479     type = PETSCSFALLTOALL;
480     ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
481     ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
482     ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
483     ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
484   } else {
485     ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
486     ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
487     res[0] = n;
488     res[1] = -n;
489     /* Check if n is the same over all ranks so that we can optimize it */
490     ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
491     if (res[0] == -res[1]) { /* same n */
492       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
493     } else {
494       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
495     }
496     ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
497   }
498   ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);
499 
500   sf->pattern = pattern;
501   sf->mine    = NULL; /* Contiguous */
502 
503   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
504      Also set other easy stuff.
505    */
506   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
507     sf->nleaves      = N;
508     sf->nroots       = n;
509     sf->nranks       = size;
510     sf->minleaf      = 0;
511     sf->maxleaf      = N - 1;
512   } else if (pattern == PETSCSF_PATTERN_GATHER) {
513     sf->nleaves      = rank ? 0 : N;
514     sf->nroots       = n;
515     sf->nranks       = rank ? 0 : size;
516     sf->minleaf      = 0;
517     sf->maxleaf      = rank ? -1 : N - 1;
518   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
519     sf->nleaves      = size;
520     sf->nroots       = size;
521     sf->nranks       = size;
522     sf->minleaf      = 0;
523     sf->maxleaf      = size - 1;
524   }
525   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
526   sf->graphset = PETSC_TRUE;
527   PetscFunctionReturn(0);
528 }
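
/*
   Editor's note: a minimal sketch, not part of the original source, of the ALLGATHER pattern.
   A layout with 2 local roots per process is invented for illustration; after the Bcast every
   process's leaf buffer holds all N root values.

     PetscSF     sf;
     PetscLayout map;
     PetscInt    N,rootdata[2],*leafdata;

     ierr = PetscLayoutCreate(PETSC_COMM_WORLD,&map);CHKERRQ(ierr);
     ierr = PetscLayoutSetLocalSize(map,2);CHKERRQ(ierr);
     ierr = PetscLayoutSetUp(map);CHKERRQ(ierr);
     ierr = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
     ierr = PetscSFCreate(PETSC_COMM_WORLD,&sf);CHKERRQ(ierr);
     ierr = PetscSFSetGraphWithPattern(sf,map,PETSCSF_PATTERN_ALLGATHER);CHKERRQ(ierr);
     ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
     ierr = PetscMalloc1(N,&leafdata);CHKERRQ(ierr);
     // fill rootdata[0..1] with this process's values, then gather everything everywhere
     ierr = PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
     ierr = PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
     ierr = PetscFree(leafdata);CHKERRQ(ierr);
     ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
     ierr = PetscLayoutDestroy(&map);CHKERRQ(ierr);
*/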
529 
530 /*@
531    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
532 
533    Collective
534 
535    Input Arguments:
536 .  sf - star forest to invert
537 
538    Output Arguments:
539 .  isf - inverse of sf
540 
541    Level: advanced
542 
543    Notes:
544    All roots must have degree 1.
545 
546    The local space may be a permutation, but cannot be sparse.
547 
548 .seealso: PetscSFSetGraph()
549 @*/
550 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
551 {
552   PetscErrorCode ierr;
553   PetscMPIInt    rank;
554   PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
555   const PetscInt *ilocal;
556   PetscSFNode    *roots,*leaves;
557 
558   PetscFunctionBegin;
559   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
560   PetscSFCheckGraphSet(sf,1);
561   PetscValidPointer(isf,2);
562 
563   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
564   maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
565 
566   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
567   ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
568   for (i=0; i<maxlocal; i++) {
569     leaves[i].rank  = rank;
570     leaves[i].index = i;
571   }
572   for (i=0; i <nroots; i++) {
573     roots[i].rank  = -1;
574     roots[i].index = -1;
575   }
576   ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
577   ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPIU_REPLACE);CHKERRQ(ierr);
578 
579   /* Check whether our leaves are sparse */
580   for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
581   if (count == nroots) newilocal = NULL;
582   else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
583     ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
584     for (i=0,count=0; i<nroots; i++) {
585       if (roots[i].rank >= 0) {
586         newilocal[count]   = i;
587         roots[count].rank  = roots[i].rank;
588         roots[count].index = roots[i].index;
589         count++;
590       }
591     }
592   }
593 
594   ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
595   ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
596   ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
597   PetscFunctionReturn(0);
598 }
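
/*
   Editor's note: a brief usage sketch, not part of the original source. For an sf whose roots
   all have degree 1 (for example, a parallel permutation), the inverse SF communicates in the
   opposite direction; sf is assumed to have its graph set already.

     PetscSF isf;

     ierr = PetscSFCreateInverseSF(sf,&isf);CHKERRQ(ierr);
     // a Bcast on isf moves data the way a Reduce on sf would, and vice versa
     ierr = PetscSFDestroy(&isf);CHKERRQ(ierr);
*/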
599 
600 /*@
601    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
602 
603    Collective
604 
605    Input Arguments:
606 +  sf - communication object to duplicate
607 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
608 
609    Output Arguments:
610 .  newsf - new communication object
611 
612    Level: beginner
613 
614 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
615 @*/
616 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
617 {
618   PetscSFType    type;
619   PetscErrorCode ierr;
620 
621   PetscFunctionBegin;
622   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
623   PetscValidLogicalCollectiveEnum(sf,opt,2);
624   PetscValidPointer(newsf,3);
625   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
626   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
627   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
628   if (opt == PETSCSF_DUPLICATE_GRAPH) {
629     PetscSFCheckGraphSet(sf,1);
630     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
631       PetscInt          nroots,nleaves;
632       const PetscInt    *ilocal;
633       const PetscSFNode *iremote;
634       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
635       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
636     } else {
637       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
638     }
639   }
640   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
641   PetscFunctionReturn(0);
642 }
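
/*
   Editor's note: a one-line usage sketch, not part of the original source; sf is assumed to have
   its graph set.

     PetscSF sfdup;

     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_GRAPH,&sfdup);CHKERRQ(ierr);
     // sfdup has the same type, ranks and graph as sf, and can be set up and used independently
     ierr = PetscSFDestroy(&sfdup);CHKERRQ(ierr);
*/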
643 
644 /*@C
645    PetscSFGetGraph - Get the graph specifying a parallel star forest
646 
647    Not Collective
648 
649    Input Arguments:
650 .  sf - star forest
651 
652    Output Arguments:
653 +  nroots - number of root vertices on the current process (these are possible targets for other processes to attach leaves)
654 .  nleaves - number of leaf vertices on the current process, each of these references a root on any process
655 .  ilocal - locations of leaves in leafdata buffers
656 -  iremote - remote locations of root vertices for each leaf on the current process
657 
658    Notes:
659    We do not currently require that the graph be set, thus nroots=-1 is returned if it has not been set yet
660 
661    Level: intermediate
662 
663 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
664 @*/
665 PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
666 {
667   PetscErrorCode ierr;
668 
669   PetscFunctionBegin;
670   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
671   if (sf->ops->GetGraph) {
672     ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
673   } else {
674     if (nroots) *nroots = sf->nroots;
675     if (nleaves) *nleaves = sf->nleaves;
676     if (ilocal) *ilocal = sf->mine;
677     if (iremote) *iremote = sf->remote;
678   }
679   PetscFunctionReturn(0);
680 }
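
/*
   Editor's note: a short sketch, not part of the original source, of traversing a graph obtained
   with PetscSFGetGraph(); sf is assumed to have its graph set already.

     PetscInt          nroots,nleaves,i,leaf;
     const PetscInt    *ilocal;
     const PetscSFNode *iremote;

     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
     for (i=0; i<nleaves; i++) {
       leaf = ilocal ? ilocal[i] : i;   // NULL ilocal means contiguous leaf storage
       ierr = PetscPrintf(PETSC_COMM_SELF,"leaf %D <- (%D,%D)\n",leaf,iremote[i].rank,iremote[i].index);CHKERRQ(ierr);
     }
*/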
681 
682 /*@
683    PetscSFGetLeafRange - Get the active leaf ranges
684 
685    Not Collective
686 
687    Input Arguments:
688 .  sf - star forest
689 
690    Output Arguments:
691 +  minleaf - minimum active leaf on this process. Returns 0 if there are no leaves.
692 -  maxleaf - maximum active leaf on this process. Returns -1 if there are no leaves.
693 
694    Level: developer
695 
696 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
697 @*/
698 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
699 {
700 
701   PetscFunctionBegin;
702   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
703   PetscSFCheckGraphSet(sf,1);
704   if (minleaf) *minleaf = sf->minleaf;
705   if (maxleaf) *maxleaf = sf->maxleaf;
706   PetscFunctionReturn(0);
707 }
708 
709 /*@C
710    PetscSFView - view a star forest
711 
712    Collective
713 
714    Input Arguments:
715 +  sf - star forest
716 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
717 
718    Level: beginner
719 
720 .seealso: PetscSFCreate(), PetscSFSetGraph()
721 @*/
722 PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
723 {
724   PetscErrorCode    ierr;
725   PetscBool         iascii;
726   PetscViewerFormat format;
727 
728   PetscFunctionBegin;
729   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
730   if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
731   PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
732   PetscCheckSameComm(sf,1,viewer,2);
733   if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
734   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
735   if (iascii) {
736     PetscMPIInt rank;
737     PetscInt    ii,i,j;
738 
739     ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
740     ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
741     if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
742     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
743       if (!sf->graphset) {
744         ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
745         ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
746         PetscFunctionReturn(0);
747       }
748       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
749       ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
750       ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%D, leaves=%D, remote ranks=%D\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
751       for (i=0; i<sf->nleaves; i++) {
752         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D <- (%D,%D)\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
753       }
754       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
755       ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
756       if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
757         PetscMPIInt *tmpranks,*perm;
758         ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
759         ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
760         for (i=0; i<sf->nranks; i++) perm[i] = i;
761         ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
762         ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
763         for (ii=0; ii<sf->nranks; ii++) {
764           i = perm[ii];
765           ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %D edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
766           for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
767             ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %D <- %D\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
768           }
769         }
770         ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
771       }
772       ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
773       ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
774     }
775     ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
776   }
777   PetscFunctionReturn(0);
778 }
779 
780 /*@C
781    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
782 
783    Not Collective
784 
785    Input Arguments:
786 .  sf - star forest
787 
788    Output Arguments:
789 +  nranks - number of ranks referenced by local part
790 .  ranks - array of ranks
791 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
792 .  rmine - concatenated array holding local indices referencing each remote rank
793 -  rremote - concatenated array holding remote indices referenced for each remote rank
794 
795    Level: developer
796 
797 .seealso: PetscSFGetLeafRanks()
798 @*/
799 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
800 {
801   PetscErrorCode ierr;
802 
803   PetscFunctionBegin;
804   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
805   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
806   if (sf->ops->GetRootRanks) {
807     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
808   } else {
809     /* The generic implementation */
810     if (nranks)  *nranks  = sf->nranks;
811     if (ranks)   *ranks   = sf->ranks;
812     if (roffset) *roffset = sf->roffset;
813     if (rmine)   *rmine   = sf->rmine;
814     if (rremote) *rremote = sf->rremote;
815   }
816   PetscFunctionReturn(0);
817 }
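
/*
   Editor's note: an illustrative sketch, not part of the original source, of walking the per-rank
   arrays returned by PetscSFGetRootRanks(); sf is assumed to have been set up with PetscSFSetUp().

     PetscInt          nranks,i,j;
     const PetscMPIInt *ranks;
     const PetscInt    *roffset,*rmine,*rremote;

     ierr = PetscSFGetRootRanks(sf,&nranks,&ranks,&roffset,&rmine,&rremote);CHKERRQ(ierr);
     for (i=0; i<nranks; i++) {
       for (j=roffset[i]; j<roffset[i+1]; j++) {
         // local leaf index rmine[j] references root index rremote[j] on process ranks[i]
         ierr = PetscPrintf(PETSC_COMM_SELF,"%D <- (%d,%D)\n",rmine[j],ranks[i],rremote[j]);CHKERRQ(ierr);
       }
     }
*/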
818 
819 /*@C
820    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
821 
822    Not Collective
823 
824    Input Arguments:
825 .  sf - star forest
826 
827    Output Arguments:
828 +  niranks - number of leaf ranks referencing roots on this process
829 .  iranks - array of ranks
830 .  ioffset - offset in irootloc for each rank (length niranks+1)
831 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
832 
833    Level: developer
834 
835 .seealso: PetscSFGetRootRanks()
836 @*/
837 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
838 {
839   PetscErrorCode ierr;
840 
841   PetscFunctionBegin;
842   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
843   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
844   if (sf->ops->GetLeafRanks) {
845     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
846   } else {
847     PetscSFType type;
848     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
849     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
850   }
851   PetscFunctionReturn(0);
852 }
853 
854 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
855   PetscInt i;
856   for (i=0; i<n; i++) {
857     if (needle == list[i]) return PETSC_TRUE;
858   }
859   return PETSC_FALSE;
860 }
861 
862 /*@C
863    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.
864 
865    Collective
866 
867    Input Arguments:
868 +  sf - PetscSF to set up; PetscSFSetGraph() must have been called
869 -  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)
870 
871    Level: developer
872 
873 .seealso: PetscSFGetRootRanks()
874 @*/
875 PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
876 {
877   PetscErrorCode     ierr;
878   PetscTable         table;
879   PetscTablePosition pos;
880   PetscMPIInt        size,groupsize,*groupranks;
881   PetscInt           *rcount,*ranks;
882   PetscInt           i, irank = -1,orank = -1;
883 
884   PetscFunctionBegin;
885   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
886   PetscSFCheckGraphSet(sf,1);
887   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRQ(ierr);
888   ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
889   for (i=0; i<sf->nleaves; i++) {
890     /* Log 1-based rank */
891     ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
892   }
893   ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
894   ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
895   ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
896   ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
897   for (i=0; i<sf->nranks; i++) {
898     ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
899     ranks[i]--;             /* Convert back to 0-based */
900   }
901   ierr = PetscTableDestroy(&table);CHKERRQ(ierr);
902 
903   /* We expect that dgroup is reliably "small" while nranks could be large */
904   {
905     MPI_Group group = MPI_GROUP_NULL;
906     PetscMPIInt *dgroupranks;
907     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
908     ierr = MPI_Group_size(dgroup,&groupsize);CHKERRQ(ierr);
909     ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
910     ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
911     for (i=0; i<groupsize; i++) dgroupranks[i] = i;
912     if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRQ(ierr);}
913     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
914     ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
915   }
916 
917   /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
918   for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i; ) {
919     for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
920       if (InList(ranks[i],groupsize,groupranks)) break;
921     }
922     for ( ; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
923       if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
924     }
925     if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
926       PetscInt    tmprank,tmpcount;
927 
928       tmprank             = ranks[i];
929       tmpcount            = rcount[i];
930       ranks[i]            = ranks[sf->ndranks];
931       rcount[i]           = rcount[sf->ndranks];
932       ranks[sf->ndranks]  = tmprank;
933       rcount[sf->ndranks] = tmpcount;
934       sf->ndranks++;
935     }
936   }
937   ierr = PetscFree(groupranks);CHKERRQ(ierr);
938   ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
939   ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
940   sf->roffset[0] = 0;
941   for (i=0; i<sf->nranks; i++) {
942     ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
943     sf->roffset[i+1] = sf->roffset[i] + rcount[i];
944     rcount[i]        = 0;
945   }
946   for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
947     /* short circuit */
948     if (orank != sf->remote[i].rank) {
949       /* Search for index of iremote[i].rank in sf->ranks */
950       ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
951       if (irank < 0) {
952         ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
953         if (irank >= 0) irank += sf->ndranks;
954       }
955       orank = sf->remote[i].rank;
956     }
957     if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %D in array",sf->remote[i].rank);
958     sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
959     sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
960     rcount[irank]++;
961   }
962   ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
963   PetscFunctionReturn(0);
964 }
965 
966 /*@C
967    PetscSFGetGroups - gets incoming and outgoing process groups
968 
969    Collective
970 
971    Input Argument:
972 .  sf - star forest
973 
974    Output Arguments:
975 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
976 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
977 
978    Level: developer
979 
980 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
981 @*/
982 PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
983 {
984   PetscErrorCode ierr;
985   MPI_Group      group = MPI_GROUP_NULL;
986 
987   PetscFunctionBegin;
988   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining groups");
989   if (sf->ingroup == MPI_GROUP_NULL) {
990     PetscInt       i;
991     const PetscInt *indegree;
992     PetscMPIInt    rank,*outranks,*inranks;
993     PetscSFNode    *remote;
994     PetscSF        bgcount;
995 
996     /* Compute the number of incoming ranks */
997     ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
998     for (i=0; i<sf->nranks; i++) {
999       remote[i].rank  = sf->ranks[i];
1000       remote[i].index = 0;
1001     }
1002     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
1003     ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1004     ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
1005     ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
1006 
1007     /* Enumerate the incoming ranks */
1008     ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
1009     ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
1010     for (i=0; i<sf->nranks; i++) outranks[i] = rank;
1011     ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1012     ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
1013     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
1014     ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRQ(ierr);
1015     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
1016     ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
1017     ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
1018   }
1019   *incoming = sf->ingroup;
1020 
1021   if (sf->outgroup == MPI_GROUP_NULL) {
1022     ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRQ(ierr);
1023     ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRQ(ierr);
1024     ierr = MPI_Group_free(&group);CHKERRQ(ierr);
1025   }
1026   *outgoing = sf->outgroup;
1027   PetscFunctionReturn(0);
1028 }
1029 
1030 /*@
1031    PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters
1032 
1033    Collective
1034 
1035    Input Argument:
1036 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1037 
1038    Output Arguments:
1039 .  multi - star forest with split roots, such that each root has degree exactly 1
1040 
1041    Level: developer
1042 
1043    Notes:
1044 
1045    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1046    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1047    edge, it is a candidate for future optimization that might involve its removal.
1048 
1049 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1050 @*/
1051 PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
1052 {
1053   PetscErrorCode ierr;
1054 
1055   PetscFunctionBegin;
1056   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1057   PetscValidPointer(multi,2);
1058   if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
1059     ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1060     *multi = sf->multi;
1061     PetscFunctionReturn(0);
1062   }
1063   if (!sf->multi) {
1064     const PetscInt *indegree;
1065     PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
1066     PetscSFNode    *remote;
1067     maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
1068     ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
1069     ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
1070     ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
1071     inoffset[0] = 0;
1072     for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
1073     for (i=0; i<maxlocal; i++) outones[i] = 1;
1074     ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1075     ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
1076     for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
1077 #if 0
1078 #if defined(PETSC_USE_DEBUG)                                 /* Check that the expected number of increments occurred */
1079     for (i=0; i<sf->nroots; i++) {
1080       if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
1081     }
1082 #endif
1083 #endif
1084     ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
1085     for (i=0; i<sf->nleaves; i++) {
1086       remote[i].rank  = sf->remote[i].rank;
1087       remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
1088     }
1089     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
1090     ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1091     if (sf->rankorder) {        /* Sort the ranks */
1092       PetscMPIInt rank;
1093       PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
1094       PetscSFNode *newremote;
1095       ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRQ(ierr);
1096       for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
1097       ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
1098       for (i=0; i<maxlocal; i++) outranks[i] = rank;
1099       ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1100       ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPIU_REPLACE);CHKERRQ(ierr);
1101       /* Sort the incoming ranks at each vertex, build the inverse map */
1102       for (i=0; i<sf->nroots; i++) {
1103         PetscInt j;
1104         for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
1105         ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
1106         for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
1107       }
1108       ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1109       ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset);CHKERRQ(ierr);
1110       ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
1111       for (i=0; i<sf->nleaves; i++) {
1112         newremote[i].rank  = sf->remote[i].rank;
1113         newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
1114       }
1115       ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1116       ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
1117     }
1118     ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
1119   }
1120   *multi = sf->multi;
1121   PetscFunctionReturn(0);
1122 }
1123 
1124 /*@C
1125    PetscSFCreateEmbeddedSF - removes edges from all but the selected roots, does not remap indices
1126 
1127    Collective
1128 
1129    Input Arguments:
1130 +  sf - original star forest
1131 .  nselected  - number of selected roots on this process
1132 -  selected   - indices of the selected roots on this process
1133 
1134    Output Arguments:
1135 .  newsf - new star forest
1136 
1137    Level: advanced
1138 
1139    Note:
1140    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1141    be done by calling PetscSFGetGraph().
1142 
1143 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1144 @*/
1145 PetscErrorCode PetscSFCreateEmbeddedSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1146 {
1147   PetscInt          i,n,*roots,*rootdata,*leafdata,nroots,nleaves,connected_leaves,*new_ilocal;
1148   const PetscSFNode *iremote;
1149   PetscSFNode       *new_iremote;
1150   PetscSF           tmpsf;
1151   MPI_Comm          comm;
1152   PetscErrorCode    ierr;
1153 
1154   PetscFunctionBegin;
1155   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1156   PetscSFCheckGraphSet(sf,1);
1157   if (nselected) PetscValidPointer(selected,3);
1158   PetscValidPointer(newsf,4);
1159 
1160   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1161 
1162   /* Uniq selected[] and put results in roots[] */
1163   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1164   ierr = PetscMalloc1(nselected,&roots);CHKERRQ(ierr);
1165   ierr = PetscArraycpy(roots,selected,nselected);CHKERRQ(ierr);
1166   ierr = PetscSortedRemoveDupsInt(&nselected,roots);CHKERRQ(ierr);
1167   if (nselected && (roots[0] < 0 || roots[nselected-1] >= sf->nroots)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max root indices %D/%D are not in [0,%D)",roots[0],roots[nselected-1],sf->nroots);
1168 
1169   if (sf->ops->CreateEmbeddedSF) {
1170     ierr = (*sf->ops->CreateEmbeddedSF)(sf,nselected,roots,newsf);CHKERRQ(ierr);
1171   } else {
1172     /* A generic version of creating an embedded sf. Note that we call PetscSFSetGraph() twice, which is certainly expensive */
1173     /* Find out which leaves (not leaf data items) are still connected to roots in the embedded sf */
1174     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,NULL,&iremote);CHKERRQ(ierr);
1175     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&tmpsf);CHKERRQ(ierr);
1176     ierr = PetscSFSetGraph(tmpsf,nroots,nleaves,NULL/*contiguous*/,PETSC_USE_POINTER,iremote,PETSC_USE_POINTER);CHKERRQ(ierr);
1177     ierr = PetscCalloc2(nroots,&rootdata,nleaves,&leafdata);CHKERRQ(ierr);
1178     for (i=0; i<nselected; ++i) rootdata[roots[i]] = 1;
1179     ierr = PetscSFBcastBegin(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1180     ierr = PetscSFBcastEnd(tmpsf,MPIU_INT,rootdata,leafdata);CHKERRQ(ierr);
1181     ierr = PetscSFDestroy(&tmpsf);CHKERRQ(ierr);
1182 
1183     /* Build newsf with leaves that are still connected */
1184     connected_leaves = 0;
1185     for (i=0; i<nleaves; ++i) connected_leaves += leafdata[i];
1186     ierr = PetscMalloc1(connected_leaves,&new_ilocal);CHKERRQ(ierr);
1187     ierr = PetscMalloc1(connected_leaves,&new_iremote);CHKERRQ(ierr);
1188     for (i=0, n=0; i<nleaves; ++i) {
1189       if (leafdata[i]) {
1190         new_ilocal[n]        = sf->mine ? sf->mine[i] : i;
1191         new_iremote[n].rank  = sf->remote[i].rank;
1192         new_iremote[n].index = sf->remote[i].index;
1193         ++n;
1194       }
1195     }
1196 
1197     if (n != connected_leaves) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"There is a size mismatch in the SF embedding, %d != %d",n,connected_leaves);
1198     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1199     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1200     ierr = PetscSFSetGraph(*newsf,nroots,connected_leaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1201     ierr = PetscFree2(rootdata,leafdata);CHKERRQ(ierr);
1202   }
1203   ierr = PetscFree(roots);CHKERRQ(ierr);
1204   PetscFunctionReturn(0);
1205 }
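
/*
   Editor's note: a minimal sketch, not part of the original source. It assumes sf has at least
   two local roots; the particular selection {0,1} is made up for illustration.

     PetscSF  esf;
     PetscInt selected[2] = {0,1};

     ierr = PetscSFCreateEmbeddedSF(sf,2,selected,&esf);CHKERRQ(ierr);
     // esf retains only the edges whose roots are among the selected ones; leaf indices are not remapped
     ierr = PetscSFDestroy(&esf);CHKERRQ(ierr);
*/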
1206 
1207 /*@C
1208   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1209 
1210   Collective
1211 
1212   Input Arguments:
1213 + sf - original star forest
1214 . nselected  - number of selected leaves on this process
1215 - selected   - indices of the selected leaves on this process
1216 
1217   Output Arguments:
1218 .  newsf - new star forest
1219 
1220   Level: advanced
1221 
1222 .seealso: PetscSFCreateEmbeddedSF(), PetscSFSetGraph(), PetscSFGetGraph()
1223 @*/
1224 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1225 {
1226   const PetscSFNode *iremote;
1227   PetscSFNode       *new_iremote;
1228   const PetscInt    *ilocal;
1229   PetscInt          i,nroots,*leaves,*new_ilocal;
1230   MPI_Comm          comm;
1231   PetscErrorCode    ierr;
1232 
1233   PetscFunctionBegin;
1234   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1235   PetscSFCheckGraphSet(sf,1);
1236   if (nselected) PetscValidPointer(selected,3);
1237   PetscValidPointer(newsf,4);
1238 
1239   /* Uniq selected[] and put results in leaves[] */
1240   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1241   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1242   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1243   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1244   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %D/%D are not in [0,%D)",leaves[0],leaves[nselected-1],sf->nleaves);
1245 
1246   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1247   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1248     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1249   } else {
1250     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1251     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1252     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1253     for (i=0; i<nselected; ++i) {
1254       const PetscInt l     = leaves[i];
1255       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1256       new_iremote[i].rank  = iremote[l].rank;
1257       new_iremote[i].index = iremote[l].index;
1258     }
1259     ierr = PetscSFCreate(comm,newsf);CHKERRQ(ierr);
1260     ierr = PetscSFSetFromOptions(*newsf);CHKERRQ(ierr);
1261     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1262   }
1263   ierr = PetscFree(leaves);CHKERRQ(ierr);
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 /*@C
1268    PetscSFBcastAndOpBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with a call to PetscSFBcastAndOpEnd()
1269 
1270    Collective on PetscSF
1271 
1272    Input Arguments:
1273 +  sf - star forest on which to communicate
1274 .  unit - data type associated with each node
1275 .  rootdata - buffer to broadcast
1276 -  op - operation to use for reduction
1277 
1278    Output Arguments:
1279 .  leafdata - buffer to be reduced with values from each leaf's respective root
1280 
1281    Level: intermediate
1282 
1283 .seealso: PetscSFBcastAndOpEnd(), PetscSFBcastBegin(), PetscSFBcastEnd()
1284 @*/
1285 PetscErrorCode PetscSFBcastAndOpBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1286 {
1287   PetscErrorCode ierr;
1288   PetscMemType   rootmtype,leafmtype;
1289 
1290   PetscFunctionBegin;
1291   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1292   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1293   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1294   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1295   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1296 #if defined(PETSC_HAVE_CUDA)
1297   /*  Shall we assume rootdata and leafdata are ready to use (instead of being computed by some asynchronous kernels)?
1298     To be similar to MPI, I'd like to have this assumption, since MPI does not have a concept of streams.
1299     But currently this assumption is not enforced in PETSc. To be safe, I do the synchronization here. Otherwise, if
1300     we do not sync now and call the Pack kernel directly on the default NULL stream (assuming petsc objects are also
1301     computed on it), we have to sync the NULL stream before calling MPI routines. So it looks like a cudaDeviceSynchronize()
1302     is inevitable. We do it now and put pack/unpack kernels on non-NULL streams.
1303    */
1304   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1305 #endif
1306   ierr = (*sf->ops->BcastAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1307   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1308   PetscFunctionReturn(0);
1309 }
1310 
1311 /*@C
1312    PetscSFBcastAndOpEnd - end a broadcast & reduce operation started with PetscSFBcastAndOpBegin()
1313 
1314    Collective
1315 
1316    Input Arguments:
1317 +  sf - star forest
1318 .  unit - data type
1319 .  rootdata - buffer to broadcast
1320 -  op - operation to use for reduction
1321 
1322    Output Arguments:
1323 .  leafdata - buffer to be reduced with values from each leaf's respective root
1324 
1325    Level: intermediate
1326 
1327 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1328 @*/
1329 PetscErrorCode PetscSFBcastAndOpEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1330 {
1331   PetscErrorCode ierr;
1332   PetscMemType   rootmtype,leafmtype;
1333 
1334   PetscFunctionBegin;
1335   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1336   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1337   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1338   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1339   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1340   ierr = (*sf->ops->BcastAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1341   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1342   PetscFunctionReturn(0);
1343 }
1344 
1345 /*@C
1346    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1347 
1348    Collective
1349 
1350    Input Arguments:
1351 +  sf - star forest
1352 .  unit - data type
1353 .  leafdata - values to reduce
1354 -  op - reduction operation
1355 
1356    Output Arguments:
1357 .  rootdata - result of reduction of values from all leaves of each root
1358 
1359    Level: intermediate
1360 
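   Example Usage:
   A minimal sketch (assuming sf already has its graph set and the caller owns the buffers); since MPI_SUM
   accumulates into the existing root values, rootdata is zeroed first to obtain a plain sum over the leaves:
.vb
   for (i=0; i<nroots; i++) rootdata[i] = 0;
   ierr = PetscSFReduceBegin(sf,MPIU_INT,leafdata,rootdata,MPI_SUM);CHKERRQ(ierr);
   ierr = PetscSFReduceEnd(sf,MPIU_INT,leafdata,rootdata,MPI_SUM);CHKERRQ(ierr);
.ve
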
1361 .seealso: PetscSFBcastBegin()
1362 @*/
1363 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1364 {
1365   PetscErrorCode ierr;
1366   PetscMemType   rootmtype,leafmtype;
1367 
1368   PetscFunctionBegin;
1369   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1370   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1371   ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1372   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1373   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1374 #if defined(PETSC_HAVE_CUDA)
1375   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1376 #endif
1377   ierr = (*sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1378   ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);
1379   PetscFunctionReturn(0);
1380 }
1381 
1382 /*@C
1383    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1384 
1385    Collective
1386 
1387    Input Arguments:
1388 +  sf - star forest
1389 .  unit - data type
1390 .  leafdata - values to reduce
1391 -  op - reduction operation
1392 
1393    Output Arguments:
1394 .  rootdata - result of reduction of values from all leaves of each root
1395 
1396    Level: intermediate
1397 
1398 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1399 @*/
1400 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1401 {
1402   PetscErrorCode ierr;
1403   PetscMemType   rootmtype,leafmtype;
1404 
1405   PetscFunctionBegin;
1406   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1407   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1408   ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1409   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1410   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1411   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1412   ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);
1413   PetscFunctionReturn(0);
1414 }
1415 
1416 /*@C
1417    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1418 
1419    Collective
1420 
1421    Input Arguments:
1422 +  sf - star forest
1423 .  unit - data type
1424 .  leafdata - leaf values to use in reduction
1425 -  op - operation to use for reduction
1426 
1427    Output Arguments:
1428 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1429 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1430 
1431    Level: advanced
1432 
1433    Note:
1434    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1435    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1436    not guaranteed across the whole vertex. Therefore, this function is typically used only with primitive types such as
1437    integers.
1438 
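   Example Usage:
   A minimal sketch (assuming sf already has its graph set and the caller has allocated all three buffers):
.vb
   /* rootdata[] holds running counters at the roots, leafdata[] holds each leaf's increment */
   ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,rootdata,leafdata,leafupdate,MPI_SUM);CHKERRQ(ierr);
   ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,rootdata,leafdata,leafupdate,MPI_SUM);CHKERRQ(ierr);
   /* leafupdate[i] now holds the root's value just before this leaf's contribution was added */
.ve
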
1439 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1440 @*/
1441 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1442 {
1443   PetscErrorCode ierr;
1444   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1445 
1446   PetscFunctionBegin;
1447   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1448   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1449   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1450   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1451   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1452   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1453   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1454 #if defined(PETSC_HAVE_CUDA)
1455   if (rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaDeviceSynchronize();CHKERRCUDA(err);}
1456 #endif
1457   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1458   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1459   PetscFunctionReturn(0);
1460 }
1461 
1462 /*@C
1463    PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value
1464 
1465    Collective
1466 
1467    Input Arguments:
1468 +  sf - star forest
1469 .  unit - data type
1470 .  leafdata - leaf values to use in reduction
1471 -  op - operation to use for reduction
1472 
1473    Output Arguments:
1474 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1475 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1476 
1477    Level: advanced
1478 
1479 .seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
1480 @*/
1481 PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1482 {
1483   PetscErrorCode ierr;
1484   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1485 
1486   PetscFunctionBegin;
1487   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1488   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1489   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1490   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1491   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1492   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1493   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1494   ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1495   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
1496   PetscFunctionReturn(0);
1497 }
1498 
1499 /*@C
1500    PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()
1501 
1502    Collective
1503 
1504    Input Arguments:
1505 .  sf - star forest
1506 
1507    Output Arguments:
1508 .  degree - degree of each root vertex
1509 
1510    Level: advanced
1511 
1512    Notes:
1513    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1514 
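   Example Usage:
   A minimal sketch (assuming sf already has its graph set):
.vb
   const PetscInt *degree;
   ierr = PetscSFComputeDegreeBegin(sf,&degree);CHKERRQ(ierr);
   ierr = PetscSFComputeDegreeEnd(sf,&degree);CHKERRQ(ierr);
   /* degree[i] is the number of leaves connected to root i; the array is owned by sf */
.ve
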
1515 .seealso: PetscSFGatherBegin()
1516 @*/
1517 PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
1518 {
1519   PetscErrorCode ierr;
1520 
1521   PetscFunctionBegin;
1522   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1523   PetscSFCheckGraphSet(sf,1);
1524   PetscValidPointer(degree,2);
1525   if (!sf->degreeknown) {
1526     PetscInt i, nroots = sf->nroots, maxlocal = sf->maxleaf+1;  /* TODO: We should use PetscSFGetLeafRange() */
1527     if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
1528     ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
1529     ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
1530     for (i=0; i<nroots; i++) sf->degree[i] = 0;
1531     for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
1532     ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1533   }
1534   *degree = NULL;
1535   PetscFunctionReturn(0);
1536 }
1537 
1538 /*@C
1539    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()
1540 
1541    Collective
1542 
1543    Input Arguments:
1544 .  sf - star forest
1545 
1546    Output Arguments:
1547 .  degree - degree of each root vertex
1548 
1549    Level: developer
1550 
1551    Notes:
1552    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1553 
1554 .seealso: PetscSFComputeDegreeBegin()
1555 @*/
1556 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
1557 {
1558   PetscErrorCode ierr;
1559 
1560   PetscFunctionBegin;
1561   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1562   PetscSFCheckGraphSet(sf,1);
1563   PetscValidPointer(degree,2);
1564   if (!sf->degreeknown) {
1565     if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1566     ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp,sf->degree,MPI_SUM);CHKERRQ(ierr);
1567     ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
1568     sf->degreeknown = PETSC_TRUE;
1569   }
1570   *degree = sf->degree;
1571   PetscFunctionReturn(0);
1572 }
1573 
1574 
1575 /*@C
1576    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1577    Each multi-root is assigned index of the corresponding original root.
1578 
1579    Collective
1580 
1581    Input Arguments:
1582 +  sf - star forest
1583 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1584 
1585    Output Arguments:
1586 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1587 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1588 
1589    Level: developer
1590 
1591    Notes:
1592    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1593 
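   Example Usage:
   A minimal sketch (assuming sf already has its graph set); the degree array must come from the
   PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd() pair:
.vb
   const PetscInt *degree;
   PetscInt       nmroots,*morig;
   ierr = PetscSFComputeDegreeBegin(sf,&degree);CHKERRQ(ierr);
   ierr = PetscSFComputeDegreeEnd(sf,&degree);CHKERRQ(ierr);
   ierr = PetscSFComputeMultiRootOriginalNumbering(sf,degree,&nmroots,&morig);CHKERRQ(ierr);
   /* morig[k] is the original root index of multi-root k */
   ierr = PetscFree(morig);CHKERRQ(ierr);
.ve
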
1594 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1595 @*/
1596 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1597 {
1598   PetscSF             msf;
1599   PetscInt            i, j, k, nroots, nmroots;
1600   PetscErrorCode      ierr;
1601 
1602   PetscFunctionBegin;
1603   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1604   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1605   if (nroots) PetscValidIntPointer(degree,2);
1606   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1607   PetscValidPointer(multiRootsOrigNumbering,4);
1608   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1609   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1610   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1611   for (i=0,j=0,k=0; i<nroots; i++) {
1612     if (!degree[i]) continue;
1613     for (j=0; j<degree[i]; j++,k++) {
1614       (*multiRootsOrigNumbering)[k] = i;
1615     }
1616   }
1617 #if defined(PETSC_USE_DEBUG)
1618   if (PetscUnlikely(k != nmroots)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Sanity check failed: sum of root degrees %D does not match number of multi-roots %D",k,nmroots);
1619 #endif
1620   if (nMultiRoots) *nMultiRoots = nmroots;
1621   PetscFunctionReturn(0);
1622 }
1623 
1624 /*@C
1625    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1626 
1627    Collective
1628 
1629    Input Arguments:
1630 +  sf - star forest
1631 .  unit - data type
1632 -  leafdata - leaf data to gather to roots
1633 
1634    Output Argument:
1635 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1636 
1637    Level: intermediate
1638 
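   Example Usage:
   A minimal sketch (assuming sf already has its graph set and leafdata is an already-populated buffer with one
   unit per leaf); the multi-root buffer must provide degree[i] units for root i, so its length is the sum of the degrees:
.vb
   const PetscInt *degree;
   PetscInt       i,nroots,nmulti = 0,*multirootdata;
   ierr = PetscSFComputeDegreeBegin(sf,&degree);CHKERRQ(ierr);
   ierr = PetscSFComputeDegreeEnd(sf,&degree);CHKERRQ(ierr);
   ierr = PetscSFGetGraph(sf,&nroots,NULL,NULL,NULL);CHKERRQ(ierr);
   for (i=0; i<nroots; i++) nmulti += degree[i];
   ierr = PetscMalloc1(nmulti,&multirootdata);CHKERRQ(ierr);
   ierr = PetscSFGatherBegin(sf,MPIU_INT,leafdata,multirootdata);CHKERRQ(ierr);
   ierr = PetscSFGatherEnd(sf,MPIU_INT,leafdata,multirootdata);CHKERRQ(ierr);
.ve
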
1639 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1640 @*/
1641 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1642 {
1643   PetscErrorCode ierr;
1644   PetscSF        multi;
1645 
1646   PetscFunctionBegin;
1647   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1648   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1649   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1650   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1651   PetscFunctionReturn(0);
1652 }
1653 
1654 /*@C
1655    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1656 
1657    Collective
1658 
1659    Input Arguments:
1660 +  sf - star forest
1661 .  unit - data type
1662 -  leafdata - leaf data to gather to roots
1663 
1664    Output Argument:
1665 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1666 
1667    Level: intermediate
1668 
1669 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1670 @*/
1671 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1672 {
1673   PetscErrorCode ierr;
1674   PetscSF        multi;
1675 
1676   PetscFunctionBegin;
1677   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1678   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1679   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1680   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPIU_REPLACE);CHKERRQ(ierr);
1681   PetscFunctionReturn(0);
1682 }
1683 
1684 /*@C
1685    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1686 
1687    Collective
1688 
1689    Input Arguments:
1690 +  sf - star forest
1691 .  unit - data type
1692 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1693 
1694    Output Argument:
1695 .  leafdata - leaf data to be updated with personal data from each respective root
1696 
1697    Level: intermediate
1698 
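   Example Usage:
   A minimal sketch (assuming sf already has its graph set, leafdata is caller-allocated with one unit per
   leaf, and multirootdata was filled in multi-root order, e.g. by PetscSFGatherBegin()/PetscSFGatherEnd()):
.vb
   ierr = PetscSFScatterBegin(sf,MPIU_INT,multirootdata,leafdata);CHKERRQ(ierr);
   ierr = PetscSFScatterEnd(sf,MPIU_INT,multirootdata,leafdata);CHKERRQ(ierr);
.ve
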
1699 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterEnd(), PetscSFGatherBegin()
1700 @*/
1701 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1702 {
1703   PetscErrorCode ierr;
1704   PetscSF        multi;
1705 
1706   PetscFunctionBegin;
1707   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1708   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1709   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1710   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1711   PetscFunctionReturn(0);
1712 }
1713 
1714 /*@C
1715    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1716 
1717    Collective
1718 
1719    Input Arguments:
1720 +  sf - star forest
1721 .  unit - data type
1722 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1723 
1724    Output Argument:
1725 .  leafdata - leaf data to be updated with personal data from each respective root
1726 
1727    Level: intermediate
1728 
1729 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterBegin(), PetscSFGatherEnd()
1730 @*/
1731 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1732 {
1733   PetscErrorCode ierr;
1734   PetscSF        multi;
1735 
1736   PetscFunctionBegin;
1737   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1738   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1739   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1740   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata);CHKERRQ(ierr);
1741   PetscFunctionReturn(0);
1742 }
1743 
1744 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1745 {
1746 #if defined(PETSC_USE_DEBUG)
1747   PetscInt        i, n, nleaves;
1748   const PetscInt *ilocal = NULL;
1749   PetscHSetI      seen;
1750   PetscErrorCode  ierr;
1751 
1752   PetscFunctionBegin;
1753   ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1754   ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1755   for (i = 0; i < nleaves; i++) {
1756     const PetscInt leaf = ilocal ? ilocal[i] : i;
1757     ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1758   }
1759   ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1760   if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1761   ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1762   PetscFunctionReturn(0);
1763 #else
1764   PetscFunctionBegin;
1765   PetscFunctionReturn(0);
1766 #endif
1767 }
1768 
1769 /*@
1770   PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view
1771 
1772   Input Parameters:
1773 + sfA - The first PetscSF
1774 - sfB - The second PetscSF
1775 
1776   Output Parameters:
1777 . sfBA - The composite SF
1778 
1779   Level: developer
1780 
1781   Notes:
1782   Currently, the two SFs must be defined on congruent communicators and they must be true star
1783   forests, i.e. the same leaf is not connected with different roots.
1784 
1785   sfA's leaf space and sfB's root space may partially overlap. The composition builds
1786   a graph containing sfA's roots and sfB's leaves only where there is a path between them. Unconnected
1787   nodes (roots or leaves) are not in sfBA. On the connected nodes, doing a Bcast on the new SF is
1788   equivalent to doing a Bcast on sfA followed by a Bcast on sfB.
1789 
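  Example Usage:
  A minimal sketch (assuming sfA and sfB already have their graphs set on the same communicator):
.vb
  PetscSF sfA,sfB,sfBA;
  /* ... set up sfA and sfB ... */
  ierr = PetscSFCompose(sfA,sfB,&sfBA);CHKERRQ(ierr);
  /* a Bcast on sfBA acts like a Bcast on sfA followed by a Bcast on sfB, on the connected nodes */
  ierr = PetscSFDestroy(&sfBA);CHKERRQ(ierr);
.ve
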
1790 .seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
1791 @*/
1792 PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1793 {
1794   PetscErrorCode    ierr;
1795   const PetscSFNode *remotePointsA,*remotePointsB;
1796   PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
1797   const PetscInt    *localPointsA,*localPointsB;
1798   PetscInt          *localPointsBA;
1799   PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
1800   PetscBool         denseB;
1801 
1802   PetscFunctionBegin;
1803   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1804   PetscSFCheckGraphSet(sfA,1);
1805   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1806   PetscSFCheckGraphSet(sfB,2);
1807   PetscCheckSameComm(sfA,1,sfB,2);
1808   PetscValidPointer(sfBA,3);
1809   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
1810   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
1811 
1812   ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
1813   ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
1814   if (localPointsA) {
1815     ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
1816     for (i=0; i<numRootsB; i++) {
1817       reorderedRemotePointsA[i].rank = -1;
1818       reorderedRemotePointsA[i].index = -1;
1819     }
1820     for (i=0; i<numLeavesA; i++) {
1821       if (localPointsA[i] >= numRootsB) continue;
1822       reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
1823     }
1824     remotePointsA = reorderedRemotePointsA;
1825   }
1826   ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
1827   ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
1828   ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
1829   ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf);CHKERRQ(ierr);
1830   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
1831 
1832   denseB = (PetscBool)!localPointsB;
1833   for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
1834     if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
1835     else numLeavesBA++;
1836   }
1837   if (denseB) {
1838     localPointsBA  = NULL;
1839     remotePointsBA = leafdataB;
1840   } else {
1841     ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
1842     ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
1843     for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
1844       const PetscInt l = localPointsB ? localPointsB[i] : i;
1845 
1846       if (leafdataB[l-minleaf].rank == -1) continue;
1847       remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
1848       localPointsBA[numLeavesBA] = l;
1849       numLeavesBA++;
1850     }
1851     ierr = PetscFree(leafdataB);CHKERRQ(ierr);
1852   }
1853   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
1854   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
1855   PetscFunctionReturn(0);
1856 }
1857 
1858 /*@
1859   PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one
1860 
1861   Input Parameters:
1862 + sfA - The first PetscSF
1863 - sfB - The second PetscSF
1864 
1865   Output Parameters:
1866 . sfBA - The composite SF.
1867 
1868   Level: developer
1869 
1870   Notes:
1871   Currently, the two SFs must be defined on congruent communicators and they must be true star
1872   forests, i.e. the same leaf is not connected with different roots. Moreover, all roots of the
1873   second SF must have a degree of 1, i.e., no root is connected to more than one leaf.
1874 
1875   sfA's leaf space and sfB's leaf space may partially overlap. The composition builds
1876   a graph containing sfA's roots and sfB's roots only where there is a path between them. Unconnected
1877   roots are not in sfBA. On the connected roots, doing a Bcast on the new SF is equivalent to doing
1878   a Bcast on sfA followed by a Reduce on sfB.
1879 
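  Example Usage:
  A minimal sketch (assuming sfA and sfB already have their graphs set on the same communicator, with every
  root of sfB having degree 1):
.vb
  PetscSF sfA,sfB,sfBA;
  /* ... set up sfA and sfB ... */
  ierr = PetscSFComposeInverse(sfA,sfB,&sfBA);CHKERRQ(ierr);
  /* a Bcast on sfBA acts like a Bcast on sfA followed by a Reduce on sfB, on the connected roots */
  ierr = PetscSFDestroy(&sfBA);CHKERRQ(ierr);
.ve
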
1880 .seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
1881 @*/
1882 PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
1883 {
1884   PetscErrorCode    ierr;
1885   const PetscSFNode *remotePointsA,*remotePointsB;
1886   PetscSFNode       *remotePointsBA;
1887   const PetscInt    *localPointsA,*localPointsB;
1888   PetscSFNode       *reorderedRemotePointsA = NULL;
1889   PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
1890 
1891   PetscFunctionBegin;
1892   PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
1893   PetscSFCheckGraphSet(sfA,1);
1894   PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
1895   PetscSFCheckGraphSet(sfB,2);
1896   PetscCheckSameComm(sfA,1,sfB,2);
1897   PetscValidPointer(sfBA,3);
1898   ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
1899   ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);
1900   /* TODO: Check roots of sfB have degree of 1 */
1901 
1902   ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
1903   ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);
1904   ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
1905   ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
1906   for (i=0; i<maxleaf - minleaf + 1; i++) {
1907     reorderedRemotePointsA[i].rank = -1;
1908     reorderedRemotePointsA[i].index = -1;
1909   }
1910   if (localPointsA) {
1911     for (i=0; i<numLeavesA; i++) {
1912       if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue;
1913       reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
1914     }
1915   } else {
1916     for (i=0; i<numLeavesA; i++) {
1917       if (i > maxleaf || i < minleaf) continue;
1918       reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
1919     }
1920   }
1921 
1922   ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
1923   ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
1924   for (i=0; i<numRootsB; i++) {
1925     remotePointsBA[i].rank = -1;
1926     remotePointsBA[i].index = -1;
1927   }
1928 
1929   /* Once we implement the TODO above (checking that all roots of sfB have degree 1), we can replace MPI_MAXLOC
1930      with MPIU_REPLACE, since in that case the two operations have the same effect.
1931    */
1932   ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
1933   ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,MPI_MAXLOC);CHKERRQ(ierr);
1934   ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
1935   for (i=0,numLeavesBA=0; i<numRootsB; i++) {
1936     if (remotePointsBA[i].rank == -1) continue;
1937     remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
1938     remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
1939     localPointsBA[numLeavesBA] = i;
1940     numLeavesBA++;
1941   }
1942   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
1943   ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
1944   PetscFunctionReturn(0);
1945 }
1946 
1947 /*
1948   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
1949 
1950   Input Parameters:
1951 . sf - The global PetscSF
1952 
1953   Output Parameters:
1954 . out - The local PetscSF
1955  */
1956 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
1957 {
1958   MPI_Comm           comm;
1959   PetscMPIInt        myrank;
1960   const PetscInt     *ilocal;
1961   const PetscSFNode  *iremote;
1962   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
1963   PetscSFNode        *liremote;
1964   PetscSF            lsf;
1965   PetscErrorCode     ierr;
1966 
1967   PetscFunctionBegin;
1968   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1969   if (sf->ops->CreateLocalSF) {
1970     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
1971   } else {
1972     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
1973     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1974     ierr = MPI_Comm_rank(comm,&myrank);CHKERRQ(ierr);
1975 
1976     /* Find out local edges and build a local SF */
1977     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1978     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
1979     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
1980     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
1981 
1982     for (i=j=0; i<nleaves; i++) {
1983       if (iremote[i].rank == (PetscInt)myrank) {
1984         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
1985         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
1986         liremote[j].index = iremote[i].index;
1987         j++;
1988       }
1989     }
1990     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
1991     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1992     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
1993     *out = lsf;
1994   }
1995   PetscFunctionReturn(0);
1996 }
1997 
1998 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
1999 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2000 {
2001   PetscErrorCode     ierr;
2002   PetscMemType       rootmtype,leafmtype;
2003 
2004   PetscFunctionBegin;
2005   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2006   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2007   ierr = PetscLogEventBegin(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2008   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2009   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2010   if (sf->ops->BcastToZero) {
2011     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2012   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2013   ierr = PetscLogEventEnd(PETSCSF_BcastAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
2014   PetscFunctionReturn(0);
2015 }
2016 
2017