xref: /petsc/src/vec/is/sf/interface/sf.c (revision 7734d62ad843dadba6ae2a712063962a509cd058)
1 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
2 #include <petsc/private/hashseti.h>
3 #include <petsc/private/viewerimpl.h>
4 #include <petscctable.h>
5 
6 #if defined(PETSC_HAVE_CUDA)
7   #include <cuda_runtime.h>
8 #endif
9 
10 #if defined(PETSC_HAVE_HIP)
11   #include <hip/hip_runtime.h>
12 #endif
13 
14 #if defined(PETSC_CLANG_STATIC_ANALYZER)
15 void PetscSFCheckGraphSet(PetscSF,int);
16 #else
17 #if defined(PETSC_USE_DEBUG)
18 #  define PetscSFCheckGraphSet(sf,arg) do {                          \
19     if (PetscUnlikely(!(sf)->graphset))                              \
20       SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetGraph() or PetscSFSetGraphWithPattern() on argument %d \"%s\" before %s()",(arg),#sf,PETSC_FUNCTION_NAME); \
21   } while (0)
22 #else
23 #  define PetscSFCheckGraphSet(sf,arg) do {} while (0)
24 #endif
25 #endif
26 
27 const char *const PetscSFDuplicateOptions[] = {"CONFONLY","RANKS","GRAPH","PetscSFDuplicateOption","PETSCSF_DUPLICATE_",NULL};
28 
29 /*@
30    PetscSFCreate - create a star forest communication context
31 
32    Collective
33 
34    Input Parameter:
35 .  comm - communicator on which the star forest will operate
36 
37    Output Parameter:
38 .  sf - new star forest context
39 
40    Options Database Keys:
41 +  -sf_type basic     -Use MPI persistent Isend/Irecv for communication (Default)
42 .  -sf_type window    -Use MPI-3 one-sided window for communication
43 -  -sf_type neighbor  -Use MPI-3 neighborhood collectives for communication
44 
45    Level: intermediate
46 
47    Notes:
48    When one knows the communication graph is one of the predefined graph, such as MPI_Alltoall, MPI_Allgatherv,
49    MPI_Gatherv, one can create a PetscSF and then set its graph with PetscSFSetGraphWithPattern(). These special
50    SFs are optimized and they have better performance than general SFs.
51 
52 .seealso: PetscSFSetGraph(), PetscSFSetGraphWithPattern(), PetscSFDestroy()
53 @*/
54 PetscErrorCode PetscSFCreate(MPI_Comm comm,PetscSF *sf)
55 {
56   PetscErrorCode ierr;
57   PetscSF        b;
58 
59   PetscFunctionBegin;
60   PetscValidPointer(sf,2);
61   ierr = PetscSFInitializePackage();CHKERRQ(ierr);
62 
63   ierr = PetscHeaderCreate(b,PETSCSF_CLASSID,"PetscSF","Star Forest","PetscSF",comm,PetscSFDestroy,PetscSFView);CHKERRQ(ierr);
64 
65   b->nroots    = -1;
66   b->nleaves   = -1;
67   b->minleaf   = PETSC_MAX_INT;
68   b->maxleaf   = PETSC_MIN_INT;
69   b->nranks    = -1;
70   b->rankorder = PETSC_TRUE;
71   b->ingroup   = MPI_GROUP_NULL;
72   b->outgroup  = MPI_GROUP_NULL;
73   b->graphset  = PETSC_FALSE;
74 #if defined(PETSC_HAVE_DEVICE)
75   b->use_gpu_aware_mpi    = use_gpu_aware_mpi;
76   b->use_stream_aware_mpi = PETSC_FALSE;
77   b->unknown_input_stream= PETSC_FALSE;
78   #if defined(PETSC_HAVE_KOKKOS) /* Prefer kokkos over cuda*/
79     b->backend = PETSCSF_BACKEND_KOKKOS;
80   #elif defined(PETSC_HAVE_CUDA)
81     b->backend = PETSCSF_BACKEND_CUDA;
82   #elif defined(PETSC_HAVE_HIP)
83     b->backend = PETSCSF_BACKEND_HIP;
84   #endif
85 
86   #if defined(PETSC_HAVE_NVSHMEM)
87     b->use_nvshmem     = PETSC_FALSE; /* Default is not to try NVSHMEM */
88     b->use_nvshmem_get = PETSC_FALSE; /* Default is to use nvshmem_put based protocol */
89     ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&b->use_nvshmem,NULL);CHKERRQ(ierr);
90     ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem_get",&b->use_nvshmem_get,NULL);CHKERRQ(ierr);
91   #endif
92 #endif
93   b->vscat.from_n = -1;
94   b->vscat.to_n   = -1;
95   b->vscat.unit   = MPIU_SCALAR;
96  *sf = b;
97   PetscFunctionReturn(0);
98 }
99 
100 /*@
101    PetscSFReset - Reset a star forest so that different sizes or neighbors can be used
102 
103    Collective
104 
105    Input Parameter:
106 .  sf - star forest
107 
108    Level: advanced
109 
110 .seealso: PetscSFCreate(), PetscSFSetGraph(), PetscSFDestroy()
111 @*/
112 PetscErrorCode PetscSFReset(PetscSF sf)
113 {
114   PetscErrorCode ierr;
115 
116   PetscFunctionBegin;
117   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
118   if (sf->ops->Reset) {ierr = (*sf->ops->Reset)(sf);CHKERRQ(ierr);}
119   sf->nroots   = -1;
120   sf->nleaves  = -1;
121   sf->minleaf  = PETSC_MAX_INT;
122   sf->maxleaf  = PETSC_MIN_INT;
123   sf->mine     = NULL;
124   sf->remote   = NULL;
125   sf->graphset = PETSC_FALSE;
126   ierr = PetscFree(sf->mine_alloc);CHKERRQ(ierr);
127   ierr = PetscFree(sf->remote_alloc);CHKERRQ(ierr);
128   sf->nranks = -1;
129   ierr = PetscFree4(sf->ranks,sf->roffset,sf->rmine,sf->rremote);CHKERRQ(ierr);
130   sf->degreeknown = PETSC_FALSE;
131   ierr = PetscFree(sf->degree);CHKERRQ(ierr);
132   if (sf->ingroup  != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->ingroup);CHKERRMPI(ierr);}
133   if (sf->outgroup != MPI_GROUP_NULL) {ierr = MPI_Group_free(&sf->outgroup);CHKERRMPI(ierr);}
134   if (sf->multi) sf->multi->multi = NULL;
135   ierr = PetscSFDestroy(&sf->multi);CHKERRQ(ierr);
136   ierr = PetscLayoutDestroy(&sf->map);CHKERRQ(ierr);
137 
138  #if defined(PETSC_HAVE_DEVICE)
139   for (PetscInt i=0; i<2; i++) {ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,sf->rmine_d[i]);CHKERRQ(ierr);}
140  #endif
141 
142   sf->setupcalled = PETSC_FALSE;
143   PetscFunctionReturn(0);
144 }
145 
/*@C
   PetscSFSetType - Set the PetscSF communication implementation

   Collective on PetscSF

   Input Parameters:
+  sf - the PetscSF context
-  type - a known method

   Options Database Key:
.  -sf_type <type> - Sets the method; use -help for a list
   of available methods (for instance, window, basic, neighbor)

   Notes:
   See "include/petscsf.h" for available methods (for instance)
+    PETSCSFWINDOW - MPI-2/3 one-sided
-    PETSCSFBASIC - basic implementation using MPI-1 two-sided

  Level: intermediate

.seealso: PetscSFType, PetscSFCreate()
@*/
PetscErrorCode PetscSFSetType(PetscSF sf,PetscSFType type)
{
  PetscErrorCode ierr,(*r)(PetscSF);
  PetscBool      match;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidCharPointer(type,2);

  /* Setting the same type again is a no-op */
  ierr = PetscObjectTypeCompare((PetscObject)sf,type,&match);CHKERRQ(ierr);
  if (match) PetscFunctionReturn(0);

  /* Look up the registered constructor for the requested type */
  ierr = PetscFunctionListFind(PetscSFList,type,&r);CHKERRQ(ierr);
  if (!r) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_UNKNOWN_TYPE,"Unable to find requested PetscSF type %s",type);
  /* Destroy the previous PetscSF implementation context */
  if (sf->ops->Destroy) {ierr = (*(sf)->ops->Destroy)(sf);CHKERRQ(ierr);}
  /* Clear the ops table so the new type starts from a clean slate */
  ierr = PetscMemzero(sf->ops,sizeof(*sf->ops));CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)sf,type);CHKERRQ(ierr);
  ierr = (*r)(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
189 
/*@C
  PetscSFGetType - Get the PetscSF communication implementation

  Not Collective

  Input Parameter:
. sf  - the PetscSF context

  Output Parameter:
. type - the PetscSF type name

  Level: intermediate

.seealso: PetscSFSetType(), PetscSFCreate()
@*/
PetscErrorCode PetscSFGetType(PetscSF sf, PetscSFType *type)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID,1);
  PetscValidPointer(type,2);
  /* Returns a borrowed pointer to the name; NULL if no type has been set yet */
  *type = ((PetscObject)sf)->type_name;
  PetscFunctionReturn(0);
}
213 
/*@C
   PetscSFDestroy - destroy star forest

   Collective

   Input Parameter:
.  sf - address of star forest

   Level: intermediate

.seealso: PetscSFCreate(), PetscSFReset()
@*/
PetscErrorCode PetscSFDestroy(PetscSF *sf)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*sf) PetscFunctionReturn(0);
  PetscValidHeaderSpecific((*sf),PETSCSF_CLASSID,1);
  /* Only the last reference actually tears the object down */
  if (--((PetscObject)(*sf))->refct > 0) {*sf = NULL; PetscFunctionReturn(0);}
  ierr = PetscSFReset(*sf);CHKERRQ(ierr);
  if ((*sf)->ops->Destroy) {ierr = (*(*sf)->ops->Destroy)(*sf);CHKERRQ(ierr);}
  ierr = PetscSFDestroy(&(*sf)->vscat.lsf);CHKERRQ(ierr);
  /* A blocked unit was created with MPI_Type_dup/contiguous and must be freed; bs==1 aliases MPIU_SCALAR */
  if ((*sf)->vscat.bs > 1) {ierr = MPI_Type_free(&(*sf)->vscat.unit);CHKERRMPI(ierr);}
  ierr = PetscHeaderDestroy(sf);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
241 
/* Debug-only sanity check of the currently set graph: every leaf must point to a
   valid remote (rank within the communicator, index >= 0) and have a nonnegative
   local location. Compiled to a no-op outside debug builds or when no graph is set. */
static PetscErrorCode PetscSFCheckGraphValid_Private(PetscSF sf)
{
  PetscInt           i, nleaves;
  PetscMPIInt        size;
  const PetscInt    *ilocal;
  const PetscSFNode *iremote;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!sf->graphset || !PetscDefined(USE_DEBUG)) PetscFunctionReturn(0);
  ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  for (i = 0; i < nleaves; i++) {
    const PetscInt rank = iremote[i].rank;
    const PetscInt remote = iremote[i].index;
    /* NULL ilocal means contiguous leaf storage, so the location is just i */
    const PetscInt leaf = ilocal ? ilocal[i] : i;
    if (rank < 0 || rank >= size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided rank (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be in [0, %d)",rank,i,size);
    if (remote < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided index (%" PetscInt_FMT ") for remote %" PetscInt_FMT " is invalid, should be >= 0",remote,i);
    if (leaf < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided location (%" PetscInt_FMT ") for leaf %" PetscInt_FMT " is invalid, should be >= 0",leaf,i);
  }
  PetscFunctionReturn(0);
}
264 
265 /*@
266    PetscSFSetUp - set up communication structures
267 
268    Collective
269 
270    Input Parameter:
271 .  sf - star forest communication object
272 
273    Level: beginner
274 
275 .seealso: PetscSFSetFromOptions(), PetscSFSetType()
276 @*/
277 PetscErrorCode PetscSFSetUp(PetscSF sf)
278 {
279   PetscErrorCode ierr;
280 
281   PetscFunctionBegin;
282   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
283   PetscSFCheckGraphSet(sf,1);
284   if (sf->setupcalled) PetscFunctionReturn(0);
285   ierr = PetscLogEventBegin(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
286   ierr = PetscSFCheckGraphValid_Private(sf);CHKERRQ(ierr);
287   if (!((PetscObject)sf)->type_name) {ierr = PetscSFSetType(sf,PETSCSFBASIC);CHKERRQ(ierr);} /* Zero all sf->ops */
288   if (sf->ops->SetUp) {ierr = (*sf->ops->SetUp)(sf);CHKERRQ(ierr);}
289 #if defined(PETSC_HAVE_CUDA)
290   if (sf->backend == PETSCSF_BACKEND_CUDA) {
291     sf->ops->Malloc = PetscSFMalloc_CUDA;
292     sf->ops->Free   = PetscSFFree_CUDA;
293   }
294 #endif
295 #if defined(PETSC_HAVE_HIP)
296   if (sf->backend == PETSCSF_BACKEND_HIP) {
297     sf->ops->Malloc = PetscSFMalloc_HIP;
298     sf->ops->Free   = PetscSFFree_HIP;
299   }
300 #endif
301 
302 #
303 #if defined(PETSC_HAVE_KOKKOS)
304   if (sf->backend == PETSCSF_BACKEND_KOKKOS) {
305     sf->ops->Malloc = PetscSFMalloc_Kokkos;
306     sf->ops->Free   = PetscSFFree_Kokkos;
307   }
308 #endif
309   ierr = PetscLogEventEnd(PETSCSF_SetUp,sf,0,0,0);CHKERRQ(ierr);
310   sf->setupcalled = PETSC_TRUE;
311   PetscFunctionReturn(0);
312 }
313 
314 /*@
315    PetscSFSetFromOptions - set PetscSF options using the options database
316 
317    Logically Collective
318 
319    Input Parameter:
320 .  sf - star forest
321 
322    Options Database Keys:
323 +  -sf_type               - implementation type, see PetscSFSetType()
324 .  -sf_rank_order         - sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise
325 .  -sf_use_default_stream - Assume callers of SF computed the input root/leafdata with the default cuda stream. SF will also
326                             use the default stream to process data. Therefore, no stream synchronization is needed between SF and its caller (default: true).
327                             If true, this option only works with -use_gpu_aware_mpi 1.
328 .  -sf_use_stream_aware_mpi  - Assume the underlying MPI is cuda-stream aware and SF won't sync streams for send/recv buffers passed to MPI (default: false).
329                                If true, this option only works with -use_gpu_aware_mpi 1.
330 
331 -  -sf_backend cuda | hip | kokkos -Select the device backend SF uses. Currently SF has these backends: cuda, hip and Kokkos.
332                               On CUDA (HIP) devices, one can choose cuda (hip) or kokkos with the default being kokkos. On other devices,
333                               the only available is kokkos.
334 
335    Level: intermediate
336 @*/
337 PetscErrorCode PetscSFSetFromOptions(PetscSF sf)
338 {
339   PetscSFType    deft;
340   char           type[256];
341   PetscErrorCode ierr;
342   PetscBool      flg;
343 
344   PetscFunctionBegin;
345   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
346   ierr = PetscObjectOptionsBegin((PetscObject)sf);CHKERRQ(ierr);
347   deft = ((PetscObject)sf)->type_name ? ((PetscObject)sf)->type_name : PETSCSFBASIC;
348   ierr = PetscOptionsFList("-sf_type","PetscSF implementation type","PetscSFSetType",PetscSFList,deft,type,sizeof(type),&flg);CHKERRQ(ierr);
349   ierr = PetscSFSetType(sf,flg ? type : deft);CHKERRQ(ierr);
350   ierr = PetscOptionsBool("-sf_rank_order","sort composite points for gathers and scatters in rank order, gathers are non-deterministic otherwise","PetscSFSetRankOrder",sf->rankorder,&sf->rankorder,NULL);CHKERRQ(ierr);
351  #if defined(PETSC_HAVE_DEVICE)
352   {
353     char        backendstr[32] = {0};
354     PetscBool   isCuda = PETSC_FALSE,isHip = PETSC_FALSE,isKokkos = PETSC_FALSE,set;
355     /* Change the defaults set in PetscSFCreate() with command line options */
356     ierr = PetscOptionsBool("-sf_unknown_input_stream","SF root/leafdata is computed on arbitary streams unknown to SF","PetscSFSetFromOptions",sf->unknown_input_stream,&sf->unknown_input_stream,NULL);CHKERRQ(ierr);
357     ierr = PetscOptionsBool("-sf_use_stream_aware_mpi","Assume the underlying MPI is cuda-stream aware","PetscSFSetFromOptions",sf->use_stream_aware_mpi,&sf->use_stream_aware_mpi,NULL);CHKERRQ(ierr);
358     ierr = PetscOptionsString("-sf_backend","Select the device backend SF uses","PetscSFSetFromOptions",NULL,backendstr,sizeof(backendstr),&set);CHKERRQ(ierr);
359     ierr = PetscStrcasecmp("cuda",backendstr,&isCuda);CHKERRQ(ierr);
360     ierr = PetscStrcasecmp("kokkos",backendstr,&isKokkos);CHKERRQ(ierr);
361     ierr = PetscStrcasecmp("hip",backendstr,&isHip);CHKERRQ(ierr);
362   #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
363     if (isCuda) sf->backend = PETSCSF_BACKEND_CUDA;
364     else if (isKokkos) sf->backend = PETSCSF_BACKEND_KOKKOS;
365     else if (isHip) sf->backend = PETSCSF_BACKEND_HIP;
366     else if (set) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You may choose cuda, hip or kokkos (if installed)", backendstr);
367   #elif defined(PETSC_HAVE_KOKKOS)
368     if (set && !isKokkos) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"-sf_backend %s is not supported. You can only choose kokkos", backendstr);
369    #endif
370   }
371  #endif
372   if (sf->ops->SetFromOptions) {ierr = (*sf->ops->SetFromOptions)(PetscOptionsObject,sf);CHKERRQ(ierr);}
373   ierr = PetscOptionsEnd();CHKERRQ(ierr);
374   PetscFunctionReturn(0);
375 }
376 
377 /*@
378    PetscSFSetRankOrder - sort multi-points for gathers and scatters by rank order
379 
380    Logically Collective
381 
382    Input Parameters:
383 +  sf - star forest
384 -  flg - PETSC_TRUE to sort, PETSC_FALSE to skip sorting (lower setup cost, but non-deterministic)
385 
386    Level: advanced
387 
388 .seealso: PetscSFGatherBegin(), PetscSFScatterBegin()
389 @*/
390 PetscErrorCode PetscSFSetRankOrder(PetscSF sf,PetscBool flg)
391 {
392   PetscFunctionBegin;
393   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
394   PetscValidLogicalCollectiveBool(sf,flg,2);
395   if (sf->multi) SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_WRONGSTATE,"Rank ordering must be set before first call to PetscSFGatherBegin() or PetscSFScatterBegin()");
396   sf->rankorder = flg;
397   PetscFunctionReturn(0);
398 }
399 
/*@C
   PetscSFSetGraph - Set a parallel star forest

   Collective

   Input Parameters:
+  sf - star forest
.  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers, pass NULL for contiguous storage (locations must be >= 0, enforced
during setup in debug mode)
.  localmode - copy mode for ilocal
.  iremote - remote locations of root vertices for each leaf on the current process (locations must be >= 0, enforced
during setup in debug mode)
-  remotemode - copy mode for iremote

   Level: intermediate

   Notes:
    In Fortran you must use PETSC_COPY_VALUES for localmode and remotemode

   Developers Note: Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
   encode contiguous storage. In such case, if localmode is PETSC_OWN_POINTER, the memory is deallocated as it is not
   needed

   Developers Note: This object does not necessarily encode a true star forest in the graph theoretic sense, since leaf
   indices are not required to be unique. Some functions, however, rely on unique leaf indices (checked in debug mode).

.seealso: PetscSFCreate(), PetscSFView(), PetscSFGetGraph()
@*/
PetscErrorCode PetscSFSetGraph(PetscSF sf,PetscInt nroots,PetscInt nleaves,const PetscInt *ilocal,PetscCopyMode localmode,const PetscSFNode *iremote,PetscCopyMode remotemode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  if (nleaves > 0 && ilocal) PetscValidIntPointer(ilocal,4);
  if (nleaves > 0) PetscValidPointer(iremote,6);
  if (nroots  < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nroots %" PetscInt_FMT ", cannot be negative",nroots);
  if (nleaves < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"nleaves %" PetscInt_FMT ", cannot be negative",nleaves);

  if (sf->nroots >= 0) { /* Reset only if graph already set */
    ierr = PetscSFReset(sf);CHKERRQ(ierr);
  }

  ierr = PetscLogEventBegin(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);

  sf->nroots  = nroots;
  sf->nleaves = nleaves;

  if (nleaves && ilocal) {
    /* Scan the leaf locations for their range, and detect the identity
       permutation, which is equivalent to contiguous (NULL) storage */
    PetscInt i;
    PetscInt minleaf = PETSC_MAX_INT;
    PetscInt maxleaf = PETSC_MIN_INT;
    int      contiguous = 1;
    for (i=0; i<nleaves; i++) {
      minleaf = PetscMin(minleaf,ilocal[i]);
      maxleaf = PetscMax(maxleaf,ilocal[i]);
      contiguous &= (ilocal[i] == i);
    }
    sf->minleaf = minleaf;
    sf->maxleaf = maxleaf;
    if (contiguous) {
      /* Identity permutation: drop ilocal (freeing it if we were given ownership) */
      if (localmode == PETSC_OWN_POINTER) {
        ierr = PetscFree(ilocal);CHKERRQ(ierr);
      }
      ilocal = NULL;
    }
  } else {
    /* Contiguous storage: leaves occupy [0,nleaves) (maxleaf is -1 when nleaves==0) */
    sf->minleaf = 0;
    sf->maxleaf = nleaves - 1;
  }

  if (ilocal) {
    /* mine_alloc tracks storage we own (and must free in PetscSFReset());
       mine is the view used for communication and may alias caller memory */
    switch (localmode) {
    case PETSC_COPY_VALUES:
      ierr = PetscMalloc1(nleaves,&sf->mine_alloc);CHKERRQ(ierr);
      ierr = PetscArraycpy(sf->mine_alloc,ilocal,nleaves);CHKERRQ(ierr);
      sf->mine = sf->mine_alloc;
      break;
    case PETSC_OWN_POINTER:
      sf->mine_alloc = (PetscInt*)ilocal;
      sf->mine       = sf->mine_alloc;
      break;
    case PETSC_USE_POINTER:
      sf->mine_alloc = NULL;
      sf->mine       = (PetscInt*)ilocal;
      break;
    default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown localmode");
    }
  }

  /* Same ownership scheme for the remote (root) side */
  switch (remotemode) {
  case PETSC_COPY_VALUES:
    ierr = PetscMalloc1(nleaves,&sf->remote_alloc);CHKERRQ(ierr);
    ierr = PetscArraycpy(sf->remote_alloc,iremote,nleaves);CHKERRQ(ierr);
    sf->remote = sf->remote_alloc;
    break;
  case PETSC_OWN_POINTER:
    sf->remote_alloc = (PetscSFNode*)iremote;
    sf->remote       = sf->remote_alloc;
    break;
  case PETSC_USE_POINTER:
    sf->remote_alloc = NULL;
    sf->remote       = (PetscSFNode*)iremote;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)sf),PETSC_ERR_ARG_OUTOFRANGE,"Unknown remotemode");
  }

  ierr = PetscLogEventEnd(PETSCSF_SetGraph,sf,0,0,0);CHKERRQ(ierr);
  sf->graphset = PETSC_TRUE;
  PetscFunctionReturn(0);
}
513 
514 /*@
515   PetscSFSetGraphWithPattern - Sets the graph of an SF with a specific pattern
516 
517   Collective
518 
519   Input Parameters:
520 + sf      - The PetscSF
521 . map     - Layout of roots over all processes (insignificant when pattern is PETSCSF_PATTERN_ALLTOALL)
522 - pattern - One of PETSCSF_PATTERN_ALLGATHER, PETSCSF_PATTERN_GATHER, PETSCSF_PATTERN_ALLTOALL
523 
524   Notes:
525   It is easier to explain PetscSFPattern using vectors. Suppose we have an MPI vector x and its layout is map.
526   n and N are local and global sizes of x respectively.
527 
528   With PETSCSF_PATTERN_ALLGATHER, the routine creates a graph that if one does Bcast on it, it will copy x to
529   sequential vectors y on all ranks.
530 
531   With PETSCSF_PATTERN_GATHER, the routine creates a graph that if one does Bcast on it, it will copy x to a
532   sequential vector y on rank 0.
533 
534   In above cases, entries of x are roots and entries of y are leaves.
535 
536   With PETSCSF_PATTERN_ALLTOALL, map is insignificant. Suppose NP is size of sf's communicator. The routine
537   creates a graph that every rank has NP leaves and NP roots. On rank i, its leaf j is connected to root i
538   of rank j. Here 0 <=i,j<NP. It is a kind of MPI_Alltoall with sendcount/recvcount being 1. Note that it does
539   not mean one can not send multiple items. One just needs to create a new MPI datatype for the mulptiple data
540   items with MPI_Type_contiguous() and use that as the <unit> argument in SF routines.
541 
542   In this case, roots and leaves are symmetric.
543 
544   Level: intermediate
545  @*/
546 PetscErrorCode PetscSFSetGraphWithPattern(PetscSF sf,PetscLayout map,PetscSFPattern pattern)
547 {
548   MPI_Comm       comm;
549   PetscInt       n,N,res[2];
550   PetscMPIInt    rank,size;
551   PetscSFType    type;
552   PetscErrorCode ierr;
553 
554   PetscFunctionBegin;
555   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
556   if (pattern != PETSCSF_PATTERN_ALLTOALL) PetscValidPointer(map,2);
557   ierr = PetscObjectGetComm((PetscObject)sf, &comm);CHKERRQ(ierr);
558   if (PetscUnlikely((pattern < PETSCSF_PATTERN_ALLGATHER) || (pattern > PETSCSF_PATTERN_ALLTOALL))) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Unsupported PetscSFPattern %d",pattern);
559   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
560   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
561 
562   if (pattern == PETSCSF_PATTERN_ALLTOALL) {
563     type = PETSCSFALLTOALL;
564     ierr = PetscLayoutCreate(comm,&sf->map);CHKERRQ(ierr);
565     ierr = PetscLayoutSetLocalSize(sf->map,size);CHKERRQ(ierr);
566     ierr = PetscLayoutSetSize(sf->map,((PetscInt)size)*size);CHKERRQ(ierr);
567     ierr = PetscLayoutSetUp(sf->map);CHKERRQ(ierr);
568   } else {
569     ierr   = PetscLayoutGetLocalSize(map,&n);CHKERRQ(ierr);
570     ierr   = PetscLayoutGetSize(map,&N);CHKERRQ(ierr);
571     res[0] = n;
572     res[1] = -n;
573     /* Check if n are same over all ranks so that we can optimize it */
574     ierr   = MPIU_Allreduce(MPI_IN_PLACE,res,2,MPIU_INT,MPI_MAX,comm);CHKERRMPI(ierr);
575     if (res[0] == -res[1]) { /* same n */
576       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHER  : PETSCSFGATHER;
577     } else {
578       type = (pattern == PETSCSF_PATTERN_ALLGATHER) ? PETSCSFALLGATHERV : PETSCSFGATHERV;
579     }
580     ierr = PetscLayoutReference(map,&sf->map);CHKERRQ(ierr);
581   }
582   ierr = PetscSFSetType(sf,type);CHKERRQ(ierr);
583 
584   sf->pattern = pattern;
585   sf->mine    = NULL; /* Contiguous */
586 
587   /* Set nleaves, nroots here in case user calls PetscSFGetGraph, which is legal to call even before PetscSFSetUp is called.
588      Also set other easy stuff.
589    */
590   if (pattern == PETSCSF_PATTERN_ALLGATHER) {
591     sf->nleaves      = N;
592     sf->nroots       = n;
593     sf->nranks       = size;
594     sf->minleaf      = 0;
595     sf->maxleaf      = N - 1;
596   } else if (pattern == PETSCSF_PATTERN_GATHER) {
597     sf->nleaves      = rank ? 0 : N;
598     sf->nroots       = n;
599     sf->nranks       = rank ? 0 : size;
600     sf->minleaf      = 0;
601     sf->maxleaf      = rank ? -1 : N - 1;
602   } else if (pattern == PETSCSF_PATTERN_ALLTOALL) {
603     sf->nleaves      = size;
604     sf->nroots       = size;
605     sf->nranks       = size;
606     sf->minleaf      = 0;
607     sf->maxleaf      = size - 1;
608   }
609   sf->ndranks  = 0; /* We do not need to separate out distinguished ranks for patterned graphs to improve communication performance */
610   sf->graphset = PETSC_TRUE;
611   PetscFunctionReturn(0);
612 }
613 
614 /*@
615    PetscSFCreateInverseSF - given a PetscSF in which all vertices have degree 1, creates the inverse map
616 
617    Collective
618 
619    Input Parameter:
620 .  sf - star forest to invert
621 
622    Output Parameter:
623 .  isf - inverse of sf
624 
625    Level: advanced
626 
627    Notes:
628    All roots must have degree 1.
629 
630    The local space may be a permutation, but cannot be sparse.
631 
632 .seealso: PetscSFSetGraph()
633 @*/
634 PetscErrorCode PetscSFCreateInverseSF(PetscSF sf,PetscSF *isf)
635 {
636   PetscErrorCode ierr;
637   PetscMPIInt    rank;
638   PetscInt       i,nroots,nleaves,maxlocal,count,*newilocal;
639   const PetscInt *ilocal;
640   PetscSFNode    *roots,*leaves;
641 
642   PetscFunctionBegin;
643   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
644   PetscSFCheckGraphSet(sf,1);
645   PetscValidPointer(isf,2);
646 
647   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
648   maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
649 
650   ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
651   ierr = PetscMalloc2(nroots,&roots,maxlocal,&leaves);CHKERRQ(ierr);
652   for (i=0; i<maxlocal; i++) {
653     leaves[i].rank  = rank;
654     leaves[i].index = i;
655   }
656   for (i=0; i <nroots; i++) {
657     roots[i].rank  = -1;
658     roots[i].index = -1;
659   }
660   ierr = PetscSFReduceBegin(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);
661   ierr = PetscSFReduceEnd(sf,MPIU_2INT,leaves,roots,MPI_REPLACE);CHKERRQ(ierr);
662 
663   /* Check whether our leaves are sparse */
664   for (i=0,count=0; i<nroots; i++) if (roots[i].rank >= 0) count++;
665   if (count == nroots) newilocal = NULL;
666   else {                        /* Index for sparse leaves and compact "roots" array (which is to become our leaves). */
667     ierr = PetscMalloc1(count,&newilocal);CHKERRQ(ierr);
668     for (i=0,count=0; i<nroots; i++) {
669       if (roots[i].rank >= 0) {
670         newilocal[count]   = i;
671         roots[count].rank  = roots[i].rank;
672         roots[count].index = roots[i].index;
673         count++;
674       }
675     }
676   }
677 
678   ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,isf);CHKERRQ(ierr);
679   ierr = PetscSFSetGraph(*isf,maxlocal,count,newilocal,PETSC_OWN_POINTER,roots,PETSC_COPY_VALUES);CHKERRQ(ierr);
680   ierr = PetscFree2(roots,leaves);CHKERRQ(ierr);
681   PetscFunctionReturn(0);
682 }
683 
684 /*@
685    PetscSFDuplicate - duplicate a PetscSF, optionally preserving rank connectivity and graph
686 
687    Collective
688 
689    Input Parameters:
690 +  sf - communication object to duplicate
691 -  opt - PETSCSF_DUPLICATE_CONFONLY, PETSCSF_DUPLICATE_RANKS, or PETSCSF_DUPLICATE_GRAPH (see PetscSFDuplicateOption)
692 
693    Output Parameter:
694 .  newsf - new communication object
695 
696    Level: beginner
697 
698 .seealso: PetscSFCreate(), PetscSFSetType(), PetscSFSetGraph()
699 @*/
700 PetscErrorCode PetscSFDuplicate(PetscSF sf,PetscSFDuplicateOption opt,PetscSF *newsf)
701 {
702   PetscSFType    type;
703   PetscErrorCode ierr;
704   MPI_Datatype   dtype=MPIU_SCALAR;
705 
706   PetscFunctionBegin;
707   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
708   PetscValidLogicalCollectiveEnum(sf,opt,2);
709   PetscValidPointer(newsf,3);
710   ierr = PetscSFCreate(PetscObjectComm((PetscObject)sf),newsf);CHKERRQ(ierr);
711   ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
712   if (type) {ierr = PetscSFSetType(*newsf,type);CHKERRQ(ierr);}
713   if (opt == PETSCSF_DUPLICATE_GRAPH) {
714     PetscSFCheckGraphSet(sf,1);
715     if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
716       PetscInt          nroots,nleaves;
717       const PetscInt    *ilocal;
718       const PetscSFNode *iremote;
719       ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
720       ierr = PetscSFSetGraph(*newsf,nroots,nleaves,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr);
721     } else {
722       ierr = PetscSFSetGraphWithPattern(*newsf,sf->map,sf->pattern);CHKERRQ(ierr);
723     }
724   }
725   /* Since oldtype is committed, so is newtype, according to MPI */
726   if (sf->vscat.bs > 1) {ierr = MPI_Type_dup(sf->vscat.unit,&dtype);CHKERRMPI(ierr);}
727   (*newsf)->vscat.bs     = sf->vscat.bs;
728   (*newsf)->vscat.unit   = dtype;
729   (*newsf)->vscat.to_n   = sf->vscat.to_n;
730   (*newsf)->vscat.from_n = sf->vscat.from_n;
731   /* Do not copy lsf. Build it on demand since it is rarely used */
732 
733 #if defined(PETSC_HAVE_DEVICE)
734   (*newsf)->backend              = sf->backend;
735   (*newsf)->unknown_input_stream= sf->unknown_input_stream;
736   (*newsf)->use_gpu_aware_mpi    = sf->use_gpu_aware_mpi;
737   (*newsf)->use_stream_aware_mpi = sf->use_stream_aware_mpi;
738 #endif
739   if (sf->ops->Duplicate) {ierr = (*sf->ops->Duplicate)(sf,opt,*newsf);CHKERRQ(ierr);}
740   /* Don't do PetscSFSetUp() since the new sf's graph might have not been set. */
741   PetscFunctionReturn(0);
742 }
743 
/*@C
   PetscSFGetGraph - Get the graph specifying a parallel star forest

   Not Collective

   Input Parameter:
.  sf - star forest

   Output Parameters:
+  nroots - number of root vertices on the current process (these are possible targets for other process to attach leaves)
.  nleaves - number of leaf vertices on the current process, each of these references a root on any process
.  ilocal - locations of leaves in leafdata buffers (if returned value is NULL, it means leaves are in contiguous storage)
-  iremote - remote locations of root vertices for each leaf on the current process

   Notes:
     We are not currently requiring that the graph is set, thus returning nroots=-1 if it has not been set yet

   Fortran Notes:
     The returned iremote array is a copy and must be deallocated after use. Consequently, if you
     want to update the graph, you must call PetscSFSetGraph() after modifying the iremote array.

     To check for a NULL ilocal use
$      if (loc(ilocal) == loc(PETSC_NULL_INTEGER)) then

   Level: intermediate

.seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFGetGraph(PetscSF sf,PetscInt *nroots,PetscInt *nleaves,const PetscInt **ilocal,const PetscSFNode **iremote)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Patterned implementations may synthesize the graph on demand via GetGraph */
  if (sf->ops->GetGraph) {
    ierr = (sf->ops->GetGraph)(sf,nroots,nleaves,ilocal,iremote);CHKERRQ(ierr);
  } else {
    /* All output arguments are optional; only fill those requested */
    if (nroots) *nroots = sf->nroots;
    if (nleaves) *nleaves = sf->nleaves;
    if (ilocal) *ilocal = sf->mine;
    if (iremote) *iremote = sf->remote;
  }
  PetscFunctionReturn(0);
}
788 
789 /*@
790    PetscSFGetLeafRange - Get the active leaf ranges
791 
792    Not Collective
793 
794    Input Parameter:
795 .  sf - star forest
796 
797    Output Parameters:
798 +  minleaf - minimum active leaf on this process. Return 0 if there are no leaves.
799 -  maxleaf - maximum active leaf on this process. Return -1 if there are no leaves.
800 
801    Level: developer
802 
803 .seealso: PetscSFCreate(), PetscSFView(), PetscSFSetGraph(), PetscSFGetGraph()
804 @*/
805 PetscErrorCode PetscSFGetLeafRange(PetscSF sf,PetscInt *minleaf,PetscInt *maxleaf)
806 {
807   PetscFunctionBegin;
808   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
809   PetscSFCheckGraphSet(sf,1);
810   if (minleaf) *minleaf = sf->minleaf;
811   if (maxleaf) *maxleaf = sf->maxleaf;
812   PetscFunctionReturn(0);
813 }
814 
815 /*@C
816    PetscSFViewFromOptions - View from Options
817 
818    Collective on PetscSF
819 
820    Input Parameters:
821 +  A - the star forest
822 .  obj - Optional object
823 -  name - command line option
824 
825    Level: intermediate
826 .seealso:  PetscSF, PetscSFView, PetscObjectViewFromOptions(), PetscSFCreate()
827 @*/
PetscErrorCode  PetscSFViewFromOptions(PetscSF A,PetscObject obj,const char name[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,PETSCSF_CLASSID,1);
  /* Thin wrapper: defer entirely to the generic options-driven PetscObject viewing mechanism */
  ierr = PetscObjectViewFromOptions((PetscObject)A,obj,name);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
837 
838 /*@C
839    PetscSFView - view a star forest
840 
841    Collective
842 
843    Input Parameters:
844 +  sf - star forest
845 -  viewer - viewer to display graph, for example PETSC_VIEWER_STDOUT_WORLD
846 
847    Level: beginner
848 
849 .seealso: PetscSFCreate(), PetscSFSetGraph()
850 @*/
PetscErrorCode PetscSFView(PetscSF sf,PetscViewer viewer)
{
  PetscErrorCode    ierr;
  PetscBool         iascii;
  PetscViewerFormat format;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  /* Default to stdout on the SF's communicator when no viewer is supplied */
  if (!viewer) {ierr = PetscViewerASCIIGetStdout(PetscObjectComm((PetscObject)sf),&viewer);CHKERRQ(ierr);}
  PetscValidHeaderSpecific(viewer,PETSC_VIEWER_CLASSID,2);
  PetscCheckSameComm(sf,1,viewer,2);
  /* Setup builds the per-rank tables (sf->ranks, sf->roffset, ...) printed below; only possible once a graph exists */
  if (sf->graphset) {ierr = PetscSFSetUp(sf);CHKERRQ(ierr);}
  ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
  if (iascii && viewer->format != PETSC_VIEWER_ASCII_MATLAB) {
    PetscMPIInt rank;
    PetscInt    ii,i,j;

    ierr = PetscObjectPrintClassNamePrefixType((PetscObject)sf,viewer);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
    /* Only general graphs carry an explicit edge list worth dumping; pattern SFs print nothing here */
    if (sf->pattern == PETSCSF_PATTERN_GENERAL) {
      if (!sf->graphset) {
        ierr = PetscViewerASCIIPrintf(viewer,"PetscSFSetGraph() has not been called yet\n");CHKERRQ(ierr);
        ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
        PetscFunctionReturn(0);
      }
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number of roots=%" PetscInt_FMT ", leaves=%" PetscInt_FMT ", remote ranks=%" PetscInt_FMT "\n",rank,sf->nroots,sf->nleaves,sf->nranks);CHKERRQ(ierr);
      /* One line per edge: local leaf location <- (remote rank, remote root index) */
      for (i=0; i<sf->nleaves; i++) {
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %" PetscInt_FMT " <- (%" PetscInt_FMT ",%" PetscInt_FMT ")\n",rank,sf->mine ? sf->mine[i] : i,sf->remote[i].rank,sf->remote[i].index);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
      if (format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
        PetscMPIInt *tmpranks,*perm;
        /* Sort a copy of the rank list (via a permutation) so edges print in ascending rank order without disturbing sf->ranks */
        ierr = PetscMalloc2(sf->nranks,&tmpranks,sf->nranks,&perm);CHKERRQ(ierr);
        ierr = PetscArraycpy(tmpranks,sf->ranks,sf->nranks);CHKERRQ(ierr);
        for (i=0; i<sf->nranks; i++) perm[i] = i;
        ierr = PetscSortMPIIntWithArray(sf->nranks,tmpranks,perm);CHKERRQ(ierr);
        ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Roots referenced by my leaves, by rank\n",rank);CHKERRQ(ierr);
        for (ii=0; ii<sf->nranks; ii++) {
          i = perm[ii];
          ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %d: %" PetscInt_FMT " edges\n",rank,sf->ranks[i],sf->roffset[i+1]-sf->roffset[i]);CHKERRQ(ierr);
          /* roffset[i]..roffset[i+1] delimits this rank's segment of rmine/rremote */
          for (j=sf->roffset[i]; j<sf->roffset[i+1]; j++) {
            ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d]    %" PetscInt_FMT " <- %" PetscInt_FMT "\n",rank,sf->rmine[j],sf->rremote[j]);CHKERRQ(ierr);
          }
        }
        ierr = PetscFree2(tmpranks,perm);CHKERRQ(ierr);
      }
      ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
      ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
    }
    ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
  }
  /* Let the implementation append type-specific output */
  if (sf->ops->View) {ierr = (*sf->ops->View)(sf,viewer);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
908 
909 /*@C
910    PetscSFGetRootRanks - Get root ranks and number of vertices referenced by leaves on this process
911 
912    Not Collective
913 
914    Input Parameter:
915 .  sf - star forest
916 
917    Output Parameters:
918 +  nranks - number of ranks referenced by local part
919 .  ranks - array of ranks
920 .  roffset - offset in rmine/rremote for each rank (length nranks+1)
921 .  rmine - concatenated array holding local indices referencing each remote rank
922 -  rremote - concatenated array holding remote indices referenced for each remote rank
923 
924    Level: developer
925 
926 .seealso: PetscSFGetLeafRanks()
927 @*/
928 PetscErrorCode PetscSFGetRootRanks(PetscSF sf,PetscInt *nranks,const PetscMPIInt **ranks,const PetscInt **roffset,const PetscInt **rmine,const PetscInt **rremote)
929 {
930   PetscErrorCode ierr;
931 
932   PetscFunctionBegin;
933   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
934   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
935   if (sf->ops->GetRootRanks) {
936     ierr = (sf->ops->GetRootRanks)(sf,nranks,ranks,roffset,rmine,rremote);CHKERRQ(ierr);
937   } else {
938     /* The generic implementation */
939     if (nranks)  *nranks  = sf->nranks;
940     if (ranks)   *ranks   = sf->ranks;
941     if (roffset) *roffset = sf->roffset;
942     if (rmine)   *rmine   = sf->rmine;
943     if (rremote) *rremote = sf->rremote;
944   }
945   PetscFunctionReturn(0);
946 }
947 
948 /*@C
949    PetscSFGetLeafRanks - Get leaf ranks referencing roots on this process
950 
951    Not Collective
952 
953    Input Parameter:
954 .  sf - star forest
955 
956    Output Parameters:
957 +  niranks - number of leaf ranks referencing roots on this process
958 .  iranks - array of ranks
959 .  ioffset - offset in irootloc for each rank (length niranks+1)
960 -  irootloc - concatenated array holding local indices of roots referenced by each leaf rank
961 
962    Level: developer
963 
964 .seealso: PetscSFGetRootRanks()
965 @*/
966 PetscErrorCode PetscSFGetLeafRanks(PetscSF sf,PetscInt *niranks,const PetscMPIInt **iranks,const PetscInt **ioffset,const PetscInt **irootloc)
967 {
968   PetscErrorCode ierr;
969 
970   PetscFunctionBegin;
971   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
972   if (!sf->setupcalled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUp() before obtaining ranks");
973   if (sf->ops->GetLeafRanks) {
974     ierr = (sf->ops->GetLeafRanks)(sf,niranks,iranks,ioffset,irootloc);CHKERRQ(ierr);
975   } else {
976     PetscSFType type;
977     ierr = PetscSFGetType(sf,&type);CHKERRQ(ierr);
978     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFGetLeafRanks() is not supported on this StarForest type: %s", type);
979   }
980   PetscFunctionReturn(0);
981 }
982 
983 static PetscBool InList(PetscMPIInt needle,PetscMPIInt n,const PetscMPIInt *list) {
984   PetscInt i;
985   for (i=0; i<n; i++) {
986     if (needle == list[i]) return PETSC_TRUE;
987   }
988   return PETSC_FALSE;
989 }
990 
991 /*@C
992    PetscSFSetUpRanks - Set up data structures associated with ranks; this is for internal use by PetscSF implementations.
993 
994    Collective
995 
996    Input Parameters:
997 +  sf - PetscSF to set up; PetscSFSetGraph() must have been called
998 -  dgroup - MPI_Group of ranks to be distinguished (e.g., for self or shared memory exchange)
999 
1000    Level: developer
1001 
1002 .seealso: PetscSFGetRootRanks()
1003 @*/
PetscErrorCode PetscSFSetUpRanks(PetscSF sf,MPI_Group dgroup)
{
  PetscErrorCode     ierr;
  PetscTable         table;
  PetscTablePosition pos;
  PetscMPIInt        size,groupsize,*groupranks;
  PetscInt           *rcount,*ranks;
  PetscInt           i, irank = -1,orank = -1;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)sf),&size);CHKERRMPI(ierr);
  /* Count the number of leaves that reference each remote rank, using a hash table keyed by rank */
  ierr = PetscTableCreate(10,size,&table);CHKERRQ(ierr);
  for (i=0; i<sf->nleaves; i++) {
    /* Log 1-based rank */
    ierr = PetscTableAdd(table,sf->remote[i].rank+1,1,ADD_VALUES);CHKERRQ(ierr);
  }
  ierr = PetscTableGetCount(table,&sf->nranks);CHKERRQ(ierr);
  /* rmine/rremote are sized by nleaves because every leaf falls into exactly one rank's bucket */
  ierr = PetscMalloc4(sf->nranks,&sf->ranks,sf->nranks+1,&sf->roffset,sf->nleaves,&sf->rmine,sf->nleaves,&sf->rremote);CHKERRQ(ierr);
  ierr = PetscMalloc2(sf->nranks,&rcount,sf->nranks,&ranks);CHKERRQ(ierr);
  ierr = PetscTableGetHeadPosition(table,&pos);CHKERRQ(ierr);
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscTableGetNext(table,&pos,&ranks[i],&rcount[i]);CHKERRQ(ierr);
    ranks[i]--;             /* Convert back to 0-based */
  }
  ierr = PetscTableDestroy(&table);CHKERRQ(ierr);

  /* We expect that dgroup is reliably "small" while nranks could be large */
  {
    MPI_Group group = MPI_GROUP_NULL;
    PetscMPIInt *dgroupranks;
    /* Translate every member of dgroup into its rank within the SF's communicator */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_size(dgroup,&groupsize);CHKERRMPI(ierr);
    ierr = PetscMalloc1(groupsize,&dgroupranks);CHKERRQ(ierr);
    ierr = PetscMalloc1(groupsize,&groupranks);CHKERRQ(ierr);
    for (i=0; i<groupsize; i++) dgroupranks[i] = i;
    if (groupsize) {ierr = MPI_Group_translate_ranks(dgroup,groupsize,dgroupranks,group,groupranks);CHKERRMPI(ierr);}
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree(dgroupranks);CHKERRQ(ierr);
  }

  /* Partition ranks[] into distinguished (first sf->ndranks) followed by non-distinguished */
  /* Two-pointer partition: sf->ndranks advances from the front, i retreats from the back; out-of-place pairs are swapped */
  for (sf->ndranks=0,i=sf->nranks; sf->ndranks<i;) {
    for (i--; sf->ndranks<i; i--) { /* Scan i backward looking for distinguished rank */
      if (InList(ranks[i],groupsize,groupranks)) break;
    }
    for (; sf->ndranks<=i; sf->ndranks++) { /* Scan sf->ndranks forward looking for non-distinguished rank */
      if (!InList(ranks[sf->ndranks],groupsize,groupranks)) break;
    }
    if (sf->ndranks < i) {                         /* Swap ranks[sf->ndranks] with ranks[i] */
      PetscInt tmprank,tmpcount;

      tmprank             = ranks[i];
      tmpcount            = rcount[i];
      ranks[i]            = ranks[sf->ndranks];
      rcount[i]           = rcount[sf->ndranks];
      ranks[sf->ndranks]  = tmprank;
      rcount[sf->ndranks] = tmpcount;
      sf->ndranks++;
    }
  }
  ierr = PetscFree(groupranks);CHKERRQ(ierr);
  /* Sort each partition by rank so the PetscFindMPIInt() binary searches below work */
  ierr = PetscSortIntWithArray(sf->ndranks,ranks,rcount);CHKERRQ(ierr);
  ierr = PetscSortIntWithArray(sf->nranks-sf->ndranks,ranks+sf->ndranks,rcount+sf->ndranks);CHKERRQ(ierr);
  /* Build CSR-style offsets: roffset[i]..roffset[i+1] delimits the edges to sf->ranks[i]; reuse rcount[] as fill cursors */
  sf->roffset[0] = 0;
  for (i=0; i<sf->nranks; i++) {
    ierr = PetscMPIIntCast(ranks[i],sf->ranks+i);CHKERRQ(ierr);
    sf->roffset[i+1] = sf->roffset[i] + rcount[i];
    rcount[i]        = 0;
  }
  /* Bucket every leaf into its rank's segment of rmine/rremote; orank caches the last rank looked up */
  for (i=0, irank = -1, orank = -1; i<sf->nleaves; i++) {
    /* short circuit */
    if (orank != sf->remote[i].rank) {
      /* Search for index of iremote[i].rank in sf->ranks */
      ierr = PetscFindMPIInt(sf->remote[i].rank,sf->ndranks,sf->ranks,&irank);CHKERRQ(ierr);
      if (irank < 0) {
        ierr = PetscFindMPIInt(sf->remote[i].rank,sf->nranks-sf->ndranks,sf->ranks+sf->ndranks,&irank);CHKERRQ(ierr);
        if (irank >= 0) irank += sf->ndranks;
      }
      orank = sf->remote[i].rank;
    }
    if (irank < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Could not find rank %" PetscInt_FMT " in array",sf->remote[i].rank);
    sf->rmine[sf->roffset[irank] + rcount[irank]]   = sf->mine ? sf->mine[i] : i;
    sf->rremote[sf->roffset[irank] + rcount[irank]] = sf->remote[i].index;
    rcount[irank]++;
  }
  ierr = PetscFree2(rcount,ranks);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1094 
1095 /*@C
1096    PetscSFGetGroups - gets incoming and outgoing process groups
1097 
1098    Collective
1099 
1100    Input Parameter:
1101 .  sf - star forest
1102 
1103    Output Parameters:
1104 +  incoming - group of origin processes for incoming edges (leaves that reference my roots)
1105 -  outgoing - group of destination processes for outgoing edges (roots that I reference)
1106 
1107    Level: developer
1108 
1109 .seealso: PetscSFGetWindow(), PetscSFRestoreWindow()
1110 @*/
PetscErrorCode PetscSFGetGroups(PetscSF sf,MPI_Group *incoming,MPI_Group *outgoing)
{
  PetscErrorCode ierr;
  MPI_Group      group = MPI_GROUP_NULL;

  PetscFunctionBegin;
  /* sf->nranks is populated by PetscSFSetUpRanks(); a negative value means it has not run */
  if (sf->nranks < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must call PetscSFSetUpRanks() before obtaining groups");
  if (sf->ingroup == MPI_GROUP_NULL) { /* Lazily build and cache the incoming group */
    PetscInt       i;
    const PetscInt *indegree;
    PetscMPIInt    rank,*outranks,*inranks;
    PetscSFNode    *remote;
    PetscSF        bgcount;

    /* Compute the number of incoming ranks */
    /* Auxiliary SF: one local root, one leaf per outgoing rank; its root indegree equals our number of incoming ranks */
    ierr = PetscMalloc1(sf->nranks,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nranks; i++) {
      remote[i].rank  = sf->ranks[i];
      remote[i].index = 0;
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&bgcount);CHKERRQ(ierr);
    ierr = PetscSFSetGraph(bgcount,1,sf->nranks,NULL,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeBegin(bgcount,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(bgcount,&indegree);CHKERRQ(ierr);
    /* Enumerate the incoming ranks */
    /* Every process sends its own rank id to the roots it references; gathering collects the ids of all referencing ranks */
    ierr = PetscMalloc2(indegree[0],&inranks,sf->nranks,&outranks);CHKERRQ(ierr);
    ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
    for (i=0; i<sf->nranks; i++) outranks[i] = rank;
    ierr = PetscSFGatherBegin(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = PetscSFGatherEnd(bgcount,MPI_INT,outranks,inranks);CHKERRQ(ierr);
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,indegree[0],inranks,&sf->ingroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
    ierr = PetscFree2(inranks,outranks);CHKERRQ(ierr);
    ierr = PetscSFDestroy(&bgcount);CHKERRQ(ierr);
  }
  *incoming = sf->ingroup;

  if (sf->outgroup == MPI_GROUP_NULL) { /* Outgoing group is simply the ranks our leaves reference, already in sf->ranks */
    ierr = MPI_Comm_group(PetscObjectComm((PetscObject)sf),&group);CHKERRMPI(ierr);
    ierr = MPI_Group_incl(group,sf->nranks,sf->ranks,&sf->outgroup);CHKERRMPI(ierr);
    ierr = MPI_Group_free(&group);CHKERRMPI(ierr);
  }
  *outgoing = sf->outgroup;
  PetscFunctionReturn(0);
}
1157 
1158 /*@
1159    PetscSFGetMultiSF - gets the inner SF implementing gathers and scatters
1160 
1161    Collective
1162 
1163    Input Parameter:
1164 .  sf - star forest that may contain roots with 0 or with more than 1 vertex
1165 
1166    Output Parameter:
1167 .  multi - star forest with split roots, such that each root has degree exactly 1
1168 
1169    Level: developer
1170 
1171    Notes:
1172 
1173    In most cases, users should use PetscSFGatherBegin() and PetscSFScatterBegin() instead of manipulating multi
1174    directly. Since multi satisfies the stronger condition that each entry in the global space has exactly one incoming
1175    edge, it is a candidate for future optimization that might involve its removal.
1176 
1177 .seealso: PetscSFSetGraph(), PetscSFGatherBegin(), PetscSFScatterBegin(), PetscSFComputeMultiRootOriginalNumbering()
1178 @*/
PetscErrorCode PetscSFGetMultiSF(PetscSF sf,PetscSF *multi)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscValidPointer(multi,2);
  if (sf->nroots < 0) {         /* Graph has not been set yet; why do we need this? */
    ierr   = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    *multi = sf->multi;
    sf->multi->multi = sf->multi; /* The multi-SF of a multi-SF is itself */
    PetscFunctionReturn(0);
  }
  if (!sf->multi) { /* Lazily build the multi-SF on first request and cache it */
    const PetscInt *indegree;
    PetscInt       i,*inoffset,*outones,*outoffset,maxlocal;
    PetscSFNode    *remote;
    maxlocal = sf->maxleaf+1; /* TODO: We should use PetscSFGetLeafRange() */
    ierr = PetscSFComputeDegreeBegin(sf,&indegree);CHKERRQ(ierr);
    ierr = PetscSFComputeDegreeEnd(sf,&indegree);CHKERRQ(ierr);
    /* inoffset becomes the prefix sum of root degrees: root i owns multi-root slots inoffset[i]..inoffset[i+1]-1 */
    ierr = PetscMalloc3(sf->nroots+1,&inoffset,maxlocal,&outones,maxlocal,&outoffset);CHKERRQ(ierr);
    inoffset[0] = 0;
    for (i=0; i<sf->nroots; i++) inoffset[i+1] = inoffset[i] + indegree[i];
    for (i=0; i<maxlocal; i++) outones[i] = 1;
    /* Fetch-and-add of 1 per leaf assigns each leaf a distinct slot within its root's range (outoffset = pre-increment value) */
    ierr = PetscSFFetchAndOpBegin(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    ierr = PetscSFFetchAndOpEnd(sf,MPIU_INT,inoffset,outones,outoffset,MPI_SUM);CHKERRQ(ierr);
    for (i=0; i<sf->nroots; i++) inoffset[i] -= indegree[i]; /* Undo the increment */
    if (PetscDefined(USE_DEBUG)) { /* Check that the expected number of increments occurred */
      for (i=0; i<sf->nroots; i++) {
        if (inoffset[i] + indegree[i] != inoffset[i+1]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Incorrect result after PetscSFFetchAndOp");
      }
    }
    /* Each leaf now points at its private multi-root slot rather than the shared root */
    ierr = PetscMalloc1(sf->nleaves,&remote);CHKERRQ(ierr);
    for (i=0; i<sf->nleaves; i++) {
      remote[i].rank  = sf->remote[i].rank;
      remote[i].index = outoffset[sf->mine ? sf->mine[i] : i];
    }
    ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_RANKS,&sf->multi);CHKERRQ(ierr);
    sf->multi->multi = sf->multi; /* The multi-SF of a multi-SF is itself */
    ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,remote,PETSC_OWN_POINTER);CHKERRQ(ierr);
    if (sf->rankorder) {        /* Sort the ranks */
      PetscMPIInt rank;
      PetscInt    *inranks,*newoffset,*outranks,*newoutoffset,*tmpoffset,maxdegree;
      PetscSFNode *newremote;
      ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)sf),&rank);CHKERRMPI(ierr);
      for (i=0,maxdegree=0; i<sf->nroots; i++) maxdegree = PetscMax(maxdegree,indegree[i]);
      ierr = PetscMalloc5(sf->multi->nroots,&inranks,sf->multi->nroots,&newoffset,maxlocal,&outranks,maxlocal,&newoutoffset,maxdegree,&tmpoffset);CHKERRQ(ierr);
      for (i=0; i<maxlocal; i++) outranks[i] = rank;
      /* Deliver each leaf's owning rank into its multi-root slot */
      ierr = PetscSFReduceBegin(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(sf->multi,MPIU_INT,outranks,inranks,MPI_REPLACE);CHKERRQ(ierr);
      /* Sort the incoming ranks at each vertex, build the inverse map */
      for (i=0; i<sf->nroots; i++) {
        PetscInt j;
        for (j=0; j<indegree[i]; j++) tmpoffset[j] = j;
        ierr = PetscSortIntWithArray(indegree[i],inranks+inoffset[i],tmpoffset);CHKERRQ(ierr);
        for (j=0; j<indegree[i]; j++) newoffset[inoffset[i] + tmpoffset[j]] = inoffset[i] + j;
      }
      /* Tell each leaf its new (rank-sorted) multi-root index */
      ierr = PetscSFBcastBegin(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(sf->multi,MPIU_INT,newoffset,newoutoffset,MPI_REPLACE);CHKERRQ(ierr);
      ierr = PetscMalloc1(sf->nleaves,&newremote);CHKERRQ(ierr);
      for (i=0; i<sf->nleaves; i++) {
        newremote[i].rank  = sf->remote[i].rank;
        newremote[i].index = newoutoffset[sf->mine ? sf->mine[i] : i];
      }
      ierr = PetscSFSetGraph(sf->multi,inoffset[sf->nroots],sf->nleaves,sf->mine,PETSC_COPY_VALUES,newremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
      ierr = PetscFree5(inranks,newoffset,outranks,newoutoffset,tmpoffset);CHKERRQ(ierr);
    }
    ierr = PetscFree3(inoffset,outones,outoffset);CHKERRQ(ierr);
  }
  *multi = sf->multi;
  PetscFunctionReturn(0);
}
1251 
1252 /*@C
1253    PetscSFCreateEmbeddedRootSF - removes edges from all but the selected roots, does not remap indices
1254 
1255    Collective
1256 
1257    Input Parameters:
1258 +  sf - original star forest
1259 .  nselected  - number of selected roots on this process
1260 -  selected   - indices of the selected roots on this process
1261 
1262    Output Parameter:
1263 .  esf - new star forest
1264 
1265    Level: advanced
1266 
1267    Note:
1268    To use the new PetscSF, it may be necessary to know the indices of the leaves that are still participating. This can
1269    be done by calling PetscSFGetGraph().
1270 
1271 .seealso: PetscSFSetGraph(), PetscSFGetGraph()
1272 @*/
1273 PetscErrorCode PetscSFCreateEmbeddedRootSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *esf)
1274 {
1275   PetscInt          i,j,n,nroots,nleaves,esf_nleaves,*new_ilocal,minleaf,maxleaf,maxlocal;
1276   const PetscInt    *ilocal;
1277   signed char       *rootdata,*leafdata,*leafmem;
1278   const PetscSFNode *iremote;
1279   PetscSFNode       *new_iremote;
1280   MPI_Comm          comm;
1281   PetscErrorCode    ierr;
1282 
1283   PetscFunctionBegin;
1284   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1285   PetscSFCheckGraphSet(sf,1);
1286   if (nselected) PetscValidPointer(selected,3);
1287   PetscValidPointer(esf,4);
1288 
1289   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1290   ierr = PetscLogEventBegin(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1291   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1292   ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
1293 
1294   if (PetscDefined(USE_DEBUG)) {  /* Error out if selected[] has dups or  out of range indices */
1295     PetscBool dups;
1296     ierr = PetscCheckDupsInt(nselected,selected,&dups);CHKERRQ(ierr);
1297     if (dups) SETERRQ(comm,PETSC_ERR_ARG_WRONG,"selected[] has dups");
1298     for (i=0; i<nselected; i++)
1299       if (selected[i] < 0 || selected[i] >= nroots) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"selected root indice %" PetscInt_FMT " is out of [0,%" PetscInt_FMT ")",selected[i],nroots);
1300   }
1301 
1302   if (sf->ops->CreateEmbeddedRootSF) {
1303     ierr = (*sf->ops->CreateEmbeddedRootSF)(sf,nselected,selected,esf);CHKERRQ(ierr);
1304   } else {
1305     /* A generic version of creating embedded sf */
1306     ierr = PetscSFGetLeafRange(sf,&minleaf,&maxleaf);CHKERRQ(ierr);
1307     maxlocal = maxleaf - minleaf + 1;
1308     ierr = PetscCalloc2(nroots,&rootdata,maxlocal,&leafmem);CHKERRQ(ierr);
1309     leafdata = leafmem - minleaf;
1310     /* Tag selected roots and bcast to leaves */
1311     for (i=0; i<nselected; i++) rootdata[selected[i]] = 1;
1312     ierr = PetscSFBcastBegin(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1313     ierr = PetscSFBcastEnd(sf,MPI_SIGNED_CHAR,rootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1314 
1315     /* Build esf with leaves that are still connected */
1316     esf_nleaves = 0;
1317     for (i=0; i<nleaves; i++) {
1318       j = ilocal ? ilocal[i] : i;
1319       /* esf_nleaves += leafdata[j] should work in theory, but failed with SFWindow bugs
1320          with PetscSFBcast. See https://gitlab.com/petsc/petsc/issues/555
1321       */
1322       esf_nleaves += (leafdata[j] ? 1 : 0);
1323     }
1324     ierr = PetscMalloc1(esf_nleaves,&new_ilocal);CHKERRQ(ierr);
1325     ierr = PetscMalloc1(esf_nleaves,&new_iremote);CHKERRQ(ierr);
1326     for (i=n=0; i<nleaves; i++) {
1327       j = ilocal ? ilocal[i] : i;
1328       if (leafdata[j]) {
1329         new_ilocal[n]        = j;
1330         new_iremote[n].rank  = iremote[i].rank;
1331         new_iremote[n].index = iremote[i].index;
1332         ++n;
1333       }
1334     }
1335     ierr = PetscSFCreate(comm,esf);CHKERRQ(ierr);
1336     ierr = PetscSFSetFromOptions(*esf);CHKERRQ(ierr);
1337     ierr = PetscSFSetGraph(*esf,nroots,esf_nleaves,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1338     ierr = PetscFree2(rootdata,leafmem);CHKERRQ(ierr);
1339   }
1340   ierr = PetscLogEventEnd(PETSCSF_EmbedSF,sf,0,0,0);CHKERRQ(ierr);
1341   PetscFunctionReturn(0);
1342 }
1343 
1344 /*@C
1345   PetscSFCreateEmbeddedLeafSF - removes edges from all but the selected leaves, does not remap indices
1346 
1347   Collective
1348 
1349   Input Parameters:
1350 + sf - original star forest
1351 . nselected  - number of selected leaves on this process
1352 - selected   - indices of the selected leaves on this process
1353 
1354   Output Parameter:
1355 .  newsf - new star forest
1356 
1357   Level: advanced
1358 
1359 .seealso: PetscSFCreateEmbeddedRootSF(), PetscSFSetGraph(), PetscSFGetGraph()
1360 @*/
1361 PetscErrorCode PetscSFCreateEmbeddedLeafSF(PetscSF sf,PetscInt nselected,const PetscInt *selected,PetscSF *newsf)
1362 {
1363   const PetscSFNode *iremote;
1364   PetscSFNode       *new_iremote;
1365   const PetscInt    *ilocal;
1366   PetscInt          i,nroots,*leaves,*new_ilocal;
1367   MPI_Comm          comm;
1368   PetscErrorCode    ierr;
1369 
1370   PetscFunctionBegin;
1371   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1372   PetscSFCheckGraphSet(sf,1);
1373   if (nselected) PetscValidPointer(selected,3);
1374   PetscValidPointer(newsf,4);
1375 
1376   /* Uniq selected[] and put results in leaves[] */
1377   ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
1378   ierr = PetscMalloc1(nselected,&leaves);CHKERRQ(ierr);
1379   ierr = PetscArraycpy(leaves,selected,nselected);CHKERRQ(ierr);
1380   ierr = PetscSortedRemoveDupsInt(&nselected,leaves);CHKERRQ(ierr);
1381   if (nselected && (leaves[0] < 0 || leaves[nselected-1] >= sf->nleaves)) SETERRQ3(comm,PETSC_ERR_ARG_OUTOFRANGE,"Min/Max leaf indices %" PetscInt_FMT "/%" PetscInt_FMT " are not in [0,%" PetscInt_FMT ")",leaves[0],leaves[nselected-1],sf->nleaves);
1382 
1383   /* Optimize the routine only when sf is setup and hence we can reuse sf's communication pattern */
1384   if (sf->setupcalled && sf->ops->CreateEmbeddedLeafSF) {
1385     ierr = (*sf->ops->CreateEmbeddedLeafSF)(sf,nselected,leaves,newsf);CHKERRQ(ierr);
1386   } else {
1387     ierr = PetscSFGetGraph(sf,&nroots,NULL,&ilocal,&iremote);CHKERRQ(ierr);
1388     ierr = PetscMalloc1(nselected,&new_ilocal);CHKERRQ(ierr);
1389     ierr = PetscMalloc1(nselected,&new_iremote);CHKERRQ(ierr);
1390     for (i=0; i<nselected; ++i) {
1391       const PetscInt l     = leaves[i];
1392       new_ilocal[i]        = ilocal ? ilocal[l] : l;
1393       new_iremote[i].rank  = iremote[l].rank;
1394       new_iremote[i].index = iremote[l].index;
1395     }
1396     ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,newsf);CHKERRQ(ierr);
1397     ierr = PetscSFSetGraph(*newsf,nroots,nselected,new_ilocal,PETSC_OWN_POINTER,new_iremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
1398   }
1399   ierr = PetscFree(leaves);CHKERRQ(ierr);
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 /*@C
1404    PetscSFBcastBegin - begin pointwise broadcast with root value being reduced to leaf value, to be concluded with call to PetscSFBcastEnd()
1405 
1406    Collective on PetscSF
1407 
1408    Input Parameters:
1409 +  sf - star forest on which to communicate
1410 .  unit - data type associated with each node
1411 .  rootdata - buffer to broadcast
1412 -  op - operation to use for reduction
1413 
1414    Output Parameter:
1415 .  leafdata - buffer to be reduced with values from each leaf's respective root
1416 
1417    Level: intermediate
1418 
1419    Notes:
1420     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1421     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1422     use PetscSFBcastWithMemTypeBegin() instead.
1423 .seealso: PetscSFBcastEnd(), PetscSFBcastWithMemTypeBegin()
1424 @*/
1425 PetscErrorCode PetscSFBcastBegin(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1426 {
1427   PetscErrorCode ierr;
1428   PetscMemType   rootmtype,leafmtype;
1429 
1430   PetscFunctionBegin;
1431   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1432   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1433   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1434   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1435   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1436   ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1437   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 /*@C
1442    PetscSFBcastWithMemTypeBegin - begin pointwise broadcast with root value being reduced to leaf value with explicit memory types, to be concluded with call to PetscSFBcastEnd()
1443 
1444    Collective on PetscSF
1445 
1446    Input Parameters:
1447 +  sf - star forest on which to communicate
1448 .  unit - data type associated with each node
1449 .  rootmtype - memory type of rootdata
1450 .  rootdata - buffer to broadcast
1451 .  leafmtype - memory type of leafdata
1452 -  op - operation to use for reduction
1453 
1454    Output Parameter:
1455 .  leafdata - buffer to be reduced with values from each leaf's respective root
1456 
1457    Level: intermediate
1458 
1459 .seealso: PetscSFBcastEnd(), PetscSFBcastBegin()
1460 @*/
1461 PetscErrorCode PetscSFBcastWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,void *leafdata,MPI_Op op)
1462 {
1463   PetscErrorCode ierr;
1464 
1465   PetscFunctionBegin;
1466   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1467   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1468   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1469   ierr = (*sf->ops->BcastBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op);CHKERRQ(ierr);
1470   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);}
1471   PetscFunctionReturn(0);
1472 }
1473 
1474 /*@C
1475    PetscSFBcastEnd - end a broadcast & reduce operation started with PetscSFBcastBegin()
1476 
1477    Collective
1478 
1479    Input Parameters:
1480 +  sf - star forest
1481 .  unit - data type
1482 .  rootdata - buffer to broadcast
1483 -  op - operation to use for reduction
1484 
1485    Output Parameter:
1486 .  leafdata - buffer to be reduced with values from each leaf's respective root
1487 
1488    Level: intermediate
1489 
1490 .seealso: PetscSFSetGraph(), PetscSFReduceEnd()
1491 @*/
1492 PetscErrorCode PetscSFBcastEnd(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata,MPI_Op op)
1493 {
1494   PetscErrorCode ierr;
1495 
1496   PetscFunctionBegin;
1497   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1498   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
1499   ierr = (*sf->ops->BcastEnd)(sf,unit,rootdata,leafdata,op);CHKERRQ(ierr);
1500   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_BcastEnd,sf,0,0,0);CHKERRQ(ierr);}
1501   PetscFunctionReturn(0);
1502 }
1503 
1504 /*@C
1505    PetscSFReduceBegin - begin reduction of leafdata into rootdata, to be completed with call to PetscSFReduceEnd()
1506 
1507    Collective
1508 
1509    Input Parameters:
1510 +  sf - star forest
1511 .  unit - data type
1512 .  leafdata - values to reduce
1513 -  op - reduction operation
1514 
1515    Output Parameter:
1516 .  rootdata - result of reduction of values from all leaves of each root
1517 
1518    Level: intermediate
1519 
1520    Notes:
1521     When petsc is configured with device support, it will use its own mechanism to figure out whether the given data pointers
1522     are host pointers or device pointers, which may incur a noticable cost. If you already knew the info, you should
1523     use PetscSFReduceWithMemTypeBegin() instead.
1524 
1525 .seealso: PetscSFBcastBegin(), PetscSFReduceWithMemTypeBegin()
1526 @*/
1527 PetscErrorCode PetscSFReduceBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1528 {
1529   PetscErrorCode ierr;
1530   PetscMemType   rootmtype,leafmtype;
1531 
1532   PetscFunctionBegin;
1533   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1534   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1535   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1536   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1537   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1538   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1539   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1540   PetscFunctionReturn(0);
1541 }
1542 
1543 /*@C
1544    PetscSFReduceWithMemTypeBegin - begin reduction of leafdata into rootdata with explicit memory types, to be completed with call to PetscSFReduceEnd()
1545 
1546    Collective
1547 
1548    Input Parameters:
1549 +  sf - star forest
1550 .  unit - data type
1551 .  leafmtype - memory type of leafdata
1552 .  leafdata - values to reduce
1553 .  rootmtype - memory type of rootdata
1554 -  op - reduction operation
1555 
1556    Output Parameter:
1557 .  rootdata - result of reduction of values from all leaves of each root
1558 
1559    Level: intermediate
1560 
1561 .seealso: PetscSFBcastBegin(), PetscSFReduceBegin()
1562 @*/
1563 PetscErrorCode PetscSFReduceWithMemTypeBegin(PetscSF sf,MPI_Datatype unit,PetscMemType leafmtype,const void *leafdata,PetscMemType rootmtype,void *rootdata,MPI_Op op)
1564 {
1565   PetscErrorCode ierr;
1566 
1567   PetscFunctionBegin;
1568   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1569   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1570   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1571   ierr = (sf->ops->ReduceBegin)(sf,unit,leafmtype,leafdata,rootmtype,rootdata,op);CHKERRQ(ierr);
1572   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceBegin,sf,0,0,0);CHKERRQ(ierr);}
1573   PetscFunctionReturn(0);
1574 }
1575 
1576 /*@C
1577    PetscSFReduceEnd - end a reduction operation started with PetscSFReduceBegin()
1578 
1579    Collective
1580 
1581    Input Parameters:
1582 +  sf - star forest
1583 .  unit - data type
1584 .  leafdata - values to reduce
1585 -  op - reduction operation
1586 
1587    Output Parameter:
1588 .  rootdata - result of reduction of values from all leaves of each root
1589 
1590    Level: intermediate
1591 
1592 .seealso: PetscSFSetGraph(), PetscSFBcastEnd()
1593 @*/
1594 PetscErrorCode PetscSFReduceEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *rootdata,MPI_Op op)
1595 {
1596   PetscErrorCode ierr;
1597 
1598   PetscFunctionBegin;
1599   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1600   if (!sf->vscat.logging) {ierr = PetscLogEventBegin(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1601   ierr = (*sf->ops->ReduceEnd)(sf,unit,leafdata,rootdata,op);CHKERRQ(ierr);
1602   if (!sf->vscat.logging) {ierr = PetscLogEventEnd(PETSCSF_ReduceEnd,sf,0,0,0);CHKERRQ(ierr);}
1603   PetscFunctionReturn(0);
1604 }
1605 
1606 /*@C
1607    PetscSFFetchAndOpBegin - begin operation that fetches values from root and updates atomically by applying operation using my leaf value, to be completed with PetscSFFetchAndOpEnd()
1608 
1609    Collective
1610 
1611    Input Parameters:
1612 +  sf - star forest
1613 .  unit - data type
1614 .  leafdata - leaf values to use in reduction
1615 -  op - operation to use for reduction
1616 
1617    Output Parameters:
1618 +  rootdata - root values to be updated, input state is seen by first process to perform an update
1619 -  leafupdate - state at each leaf's respective root immediately prior to my atomic update
1620 
1621    Level: advanced
1622 
1623    Note:
1624    The update is only atomic at the granularity provided by the hardware. Different roots referenced by the same process
1625    might be updated in a different order. Furthermore, if a composite type is used for the unit datatype, atomicity is
1626    not guaranteed across the whole vertex. Therefore, this function is mostly only used with primitive types such as
1627    integers.
1628 
1629 .seealso: PetscSFComputeDegreeBegin(), PetscSFReduceBegin(), PetscSFSetGraph()
1630 @*/
1631 PetscErrorCode PetscSFFetchAndOpBegin(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1632 {
1633   PetscErrorCode ierr;
1634   PetscMemType   rootmtype,leafmtype,leafupdatemtype;
1635 
1636   PetscFunctionBegin;
1637   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1638   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1639   ierr = PetscLogEventBegin(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1640   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
1641   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
1642   ierr = PetscGetMemType(leafupdate,&leafupdatemtype);CHKERRQ(ierr);
1643   if (leafmtype != leafupdatemtype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for leafdata and leafupdate in different memory types");
1644   ierr = (*sf->ops->FetchAndOpBegin)(sf,unit,rootmtype,rootdata,leafmtype,leafdata,leafupdate,op);CHKERRQ(ierr);
1645   ierr = PetscLogEventEnd(PETSCSF_FetchAndOpBegin,sf,0,0,0);CHKERRQ(ierr);
1646   PetscFunctionReturn(0);
1647 }
1648 
/*@C
   PetscSFFetchAndOpEnd - end operation started in matching call to PetscSFFetchAndOpBegin() to fetch values from roots and update atomically by applying operation using my leaf value

   Collective

   Input Parameters:
+  sf - star forest
.  unit - data type
.  leafdata - leaf values to use in reduction
-  op - operation to use for reduction

   Output Parameters:
+  rootdata - root values to be updated, input state is seen by first process to perform an update
-  leafupdate - state at each leaf's respective root immediately prior to my atomic update

   Level: advanced

.seealso: PetscSFComputeDegreeEnd(), PetscSFReduceEnd(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFFetchAndOpEnd(PetscSF sf,MPI_Datatype unit,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  ierr = PetscLogEventBegin(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  /* Delegate to the implementation; memory types were already resolved in PetscSFFetchAndOpBegin() */
  ierr = (*sf->ops->FetchAndOpEnd)(sf,unit,rootdata,leafdata,leafupdate,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_FetchAndOpEnd,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1679 
/*@C
   PetscSFComputeDegreeBegin - begin computation of degree for each root vertex, to be completed with PetscSFComputeDegreeEnd()

   Collective

   Input Parameter:
.  sf - star forest

   Output Parameter:
.  degree - degree of each root vertex

   Level: advanced

   Notes:
   This routine sets *degree to NULL; the actual array only becomes available from the matching PetscSFComputeDegreeEnd().

   The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.

.seealso: PetscSFGatherBegin()
@*/
PetscErrorCode PetscSFComputeDegreeBegin(PetscSF sf,const PetscInt **degree)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sf,1);
  PetscValidPointer(degree,2);
  if (!sf->degreeknown) {
    PetscInt i, nroots = sf->nroots, maxlocal;
    /* sf->degree already allocated while degreeknown is still false means a Begin is in flight */
    if (sf->degree) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Calls to PetscSFComputeDegreeBegin() cannot be nested.");
    /* size of the contiguous local leaf range [minleaf,maxleaf] */
    maxlocal = sf->maxleaf-sf->minleaf+1;
    ierr = PetscMalloc1(nroots,&sf->degree);CHKERRQ(ierr);
    ierr = PetscMalloc1(PetscMax(maxlocal,1),&sf->degreetmp);CHKERRQ(ierr); /* allocate at least one entry, see check in PetscSFComputeDegreeEnd() */
    for (i=0; i<nroots; i++) sf->degree[i] = 0;
    for (i=0; i<maxlocal; i++) sf->degreetmp[i] = 1;
    /* Summing a 1 contributed by every leaf into its root yields the root's degree.
       degreetmp-minleaf shifts the buffer so it can be indexed by absolute leaf number. */
    ierr = PetscSFReduceBegin(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
  }
  /* result is only available after PetscSFComputeDegreeEnd() */
  *degree = NULL;
  PetscFunctionReturn(0);
}
1719 
1720 /*@C
1721    PetscSFComputeDegreeEnd - complete computation of degree for each root vertex, started with PetscSFComputeDegreeBegin()
1722 
1723    Collective
1724 
1725    Input Parameter:
1726 .  sf - star forest
1727 
1728    Output Parameter:
1729 .  degree - degree of each root vertex
1730 
1731    Level: developer
1732 
1733    Notes:
1734    The returned array is owned by PetscSF and automatically freed by PetscSFDestroy(). Hence no need to call PetscFree() on it.
1735 
1736 .seealso:
1737 @*/
1738 PetscErrorCode PetscSFComputeDegreeEnd(PetscSF sf,const PetscInt **degree)
1739 {
1740   PetscErrorCode ierr;
1741 
1742   PetscFunctionBegin;
1743   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1744   PetscSFCheckGraphSet(sf,1);
1745   PetscValidPointer(degree,2);
1746   if (!sf->degreeknown) {
1747     if (!sf->degreetmp) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Must call PetscSFComputeDegreeBegin() before PetscSFComputeDegreeEnd()");
1748     ierr = PetscSFReduceEnd(sf,MPIU_INT,sf->degreetmp-sf->minleaf,sf->degree,MPI_SUM);CHKERRQ(ierr);
1749     ierr = PetscFree(sf->degreetmp);CHKERRQ(ierr);
1750     sf->degreeknown = PETSC_TRUE;
1751   }
1752   *degree = sf->degree;
1753   PetscFunctionReturn(0);
1754 }
1755 
1756 /*@C
1757    PetscSFComputeMultiRootOriginalNumbering - Returns original numbering of multi-roots (roots of multi-SF returned by PetscSFGetMultiSF()).
1758    Each multi-root is assigned index of the corresponding original root.
1759 
1760    Collective
1761 
1762    Input Parameters:
1763 +  sf - star forest
1764 -  degree - degree of each root vertex, computed with PetscSFComputeDegreeBegin()/PetscSFComputeDegreeEnd()
1765 
1766    Output Parameters:
1767 +  nMultiRoots - (optional) number of multi-roots (roots of multi-SF)
1768 -  multiRootsOrigNumbering - original indices of multi-roots; length of this array is nMultiRoots
1769 
1770    Level: developer
1771 
1772    Notes:
1773    The returned array multiRootsOrigNumbering is newly allocated and should be destroyed with PetscFree() when no longer needed.
1774 
1775 .seealso: PetscSFComputeDegreeBegin(), PetscSFComputeDegreeEnd(), PetscSFGetMultiSF()
1776 @*/
1777 PetscErrorCode PetscSFComputeMultiRootOriginalNumbering(PetscSF sf, const PetscInt degree[], PetscInt *nMultiRoots, PetscInt *multiRootsOrigNumbering[])
1778 {
1779   PetscSF             msf;
1780   PetscInt            i, j, k, nroots, nmroots;
1781   PetscErrorCode      ierr;
1782 
1783   PetscFunctionBegin;
1784   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1785   ierr = PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL);CHKERRQ(ierr);
1786   if (nroots) PetscValidIntPointer(degree,2);
1787   if (nMultiRoots) PetscValidIntPointer(nMultiRoots,3);
1788   PetscValidPointer(multiRootsOrigNumbering,4);
1789   ierr = PetscSFGetMultiSF(sf,&msf);CHKERRQ(ierr);
1790   ierr = PetscSFGetGraph(msf, &nmroots, NULL, NULL, NULL);CHKERRQ(ierr);
1791   ierr = PetscMalloc1(nmroots, multiRootsOrigNumbering);CHKERRQ(ierr);
1792   for (i=0,j=0,k=0; i<nroots; i++) {
1793     if (!degree[i]) continue;
1794     for (j=0; j<degree[i]; j++,k++) {
1795       (*multiRootsOrigNumbering)[k] = i;
1796     }
1797   }
1798   if (PetscUnlikely(k != nmroots)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"sanity check fail");
1799   if (nMultiRoots) *nMultiRoots = nmroots;
1800   PetscFunctionReturn(0);
1801 }
1802 
1803 /*@C
1804    PetscSFGatherBegin - begin pointwise gather of all leaves into multi-roots, to be completed with PetscSFGatherEnd()
1805 
1806    Collective
1807 
1808    Input Parameters:
1809 +  sf - star forest
1810 .  unit - data type
1811 -  leafdata - leaf data to gather to roots
1812 
1813    Output Parameter:
1814 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1815 
1816    Level: intermediate
1817 
1818 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1819 @*/
1820 PetscErrorCode PetscSFGatherBegin(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1821 {
1822   PetscErrorCode ierr;
1823   PetscSF        multi = NULL;
1824 
1825   PetscFunctionBegin;
1826   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1827   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1828   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1829   ierr = PetscSFReduceBegin(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
1830   PetscFunctionReturn(0);
1831 }
1832 
1833 /*@C
1834    PetscSFGatherEnd - ends pointwise gather operation that was started with PetscSFGatherBegin()
1835 
1836    Collective
1837 
1838    Input Parameters:
1839 +  sf - star forest
1840 .  unit - data type
1841 -  leafdata - leaf data to gather to roots
1842 
1843    Output Parameter:
1844 .  multirootdata - root buffer to gather into, amount of space per root is equal to its degree
1845 
1846    Level: intermediate
1847 
1848 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1849 @*/
1850 PetscErrorCode PetscSFGatherEnd(PetscSF sf,MPI_Datatype unit,const void *leafdata,void *multirootdata)
1851 {
1852   PetscErrorCode ierr;
1853   PetscSF        multi = NULL;
1854 
1855   PetscFunctionBegin;
1856   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1857   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1858   ierr = PetscSFReduceEnd(multi,unit,leafdata,multirootdata,MPI_REPLACE);CHKERRQ(ierr);
1859   PetscFunctionReturn(0);
1860 }
1861 
1862 /*@C
1863    PetscSFScatterBegin - begin pointwise scatter operation from multi-roots to leaves, to be completed with PetscSFScatterEnd()
1864 
1865    Collective
1866 
1867    Input Parameters:
1868 +  sf - star forest
1869 .  unit - data type
1870 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1871 
1872    Output Parameter:
1873 .  leafdata - leaf data to be update with personal data from each respective root
1874 
1875    Level: intermediate
1876 
1877 .seealso: PetscSFComputeDegreeBegin(), PetscSFScatterBegin()
1878 @*/
1879 PetscErrorCode PetscSFScatterBegin(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1880 {
1881   PetscErrorCode ierr;
1882   PetscSF        multi = NULL;
1883 
1884   PetscFunctionBegin;
1885   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1886   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
1887   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1888   ierr = PetscSFBcastBegin(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1889   PetscFunctionReturn(0);
1890 }
1891 
1892 /*@C
1893    PetscSFScatterEnd - ends pointwise scatter operation that was started with PetscSFScatterBegin()
1894 
1895    Collective
1896 
1897    Input Parameters:
1898 +  sf - star forest
1899 .  unit - data type
1900 -  multirootdata - root buffer to send to each leaf, one unit of data per leaf
1901 
1902    Output Parameter:
1903 .  leafdata - leaf data to be update with personal data from each respective root
1904 
1905    Level: intermediate
1906 
1907 .seealso: PetscSFComputeDegreeEnd(), PetscSFScatterEnd()
1908 @*/
1909 PetscErrorCode PetscSFScatterEnd(PetscSF sf,MPI_Datatype unit,const void *multirootdata,void *leafdata)
1910 {
1911   PetscErrorCode ierr;
1912   PetscSF        multi = NULL;
1913 
1914   PetscFunctionBegin;
1915   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
1916   ierr = PetscSFGetMultiSF(sf,&multi);CHKERRQ(ierr);
1917   ierr = PetscSFBcastEnd(multi,unit,multirootdata,leafdata,MPI_REPLACE);CHKERRQ(ierr);
1918   PetscFunctionReturn(0);
1919 }
1920 
1921 static PetscErrorCode PetscSFCheckLeavesUnique_Private(PetscSF sf)
1922 {
1923   PetscInt        i, n, nleaves;
1924   const PetscInt *ilocal = NULL;
1925   PetscHSetI      seen;
1926   PetscErrorCode  ierr;
1927 
1928   PetscFunctionBegin;
1929   if (PetscDefined(USE_DEBUG)) {
1930     ierr = PetscSFGetGraph(sf,NULL,&nleaves,&ilocal,NULL);CHKERRQ(ierr);
1931     ierr = PetscHSetICreate(&seen);CHKERRQ(ierr);
1932     for (i = 0; i < nleaves; i++) {
1933       const PetscInt leaf = ilocal ? ilocal[i] : i;
1934       ierr = PetscHSetIAdd(seen,leaf);CHKERRQ(ierr);
1935     }
1936     ierr = PetscHSetIGetSize(seen,&n);CHKERRQ(ierr);
1937     if (n != nleaves) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Provided leaves have repeated values: all leaves must be unique");
1938     ierr = PetscHSetIDestroy(&seen);CHKERRQ(ierr);
1939   }
1940   PetscFunctionReturn(0);
1941 }
1942 
/*@
  PetscSFCompose - Compose a new PetscSF by putting the second SF under the first one in a top (roots) down (leaves) view

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots.

  sfA's leaf space and sfB's root space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's leaves only when there is a path between them. Unconnected
  nodes (roots or leaves) are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a
  Bcast on sfA, then a Bcast on sfB, on connected nodes.

.seealso: PetscSF, PetscSFComposeInverse(), PetscSFGetGraph(), PetscSFSetGraph()
@*/
PetscErrorCode PetscSFCompose(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA=NULL,*reorderedRemotePointsA = NULL,*leafdataB;
  const PetscInt    *localPointsA,*localPointsB;
  PetscInt          *localPointsBA;
  PetscInt          i,numRootsA,numLeavesA,numRootsB,numLeavesB,minleaf,maxleaf,numLeavesBA;
  PetscBool         denseB;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Both inputs must be true star forests: no leaf may appear twice (checked in debug builds) */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA,&numRootsA,&numLeavesA,&localPointsA,&remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB,&numRootsB,&numLeavesB,&localPointsB,&remotePointsB);CHKERRQ(ierr);
  if (localPointsA) {
    /* sfA's leaves are not contiguous: scatter its remote points into sfB's root numbering,
       marking roots of sfB not reached by any leaf of sfA with rank/index -1 */
    ierr = PetscMalloc1(numRootsB,&reorderedRemotePointsA);CHKERRQ(ierr);
    for (i=0; i<numRootsB; i++) {
      reorderedRemotePointsA[i].rank = -1;
      reorderedRemotePointsA[i].index = -1;
    }
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] >= numRootsB) continue; /* leaf of sfA outside sfB's root space cannot connect */
      reorderedRemotePointsA[localPointsA[i]] = remotePointsA[i];
    }
    remotePointsA = reorderedRemotePointsA;
  }
  /* Broadcast sfA's root locations down through sfB, so each leaf of sfB learns which root
     of sfA (if any) it is ultimately connected to; leafdataB-minleaf allows indexing by
     absolute leaf number within [minleaf,maxleaf] */
  ierr = PetscSFGetLeafRange(sfB,&minleaf,&maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf-minleaf+1,&leafdataB);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(sfB,MPIU_2INT,remotePointsA,leafdataB-minleaf,MPI_REPLACE);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);

  /* Count connected leaves; the composite graph is dense (localPointsBA == NULL) only if
     sfB's leaves are contiguous and every one of them received a valid root of sfA */
  denseB = (PetscBool)!localPointsB;
  for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
    if (leafdataB[localPointsB ? localPointsB[i]-minleaf : i].rank == -1) denseB = PETSC_FALSE;
    else numLeavesBA++;
  }
  if (denseB) {
    /* ownership of leafdataB transfers to the new SF via PETSC_OWN_POINTER below */
    localPointsBA  = NULL;
    remotePointsBA = leafdataB;
  } else {
    /* compact: keep only leaves that are connected to some root of sfA */
    ierr = PetscMalloc1(numLeavesBA,&localPointsBA);CHKERRQ(ierr);
    ierr = PetscMalloc1(numLeavesBA,&remotePointsBA);CHKERRQ(ierr);
    for (i=0,numLeavesBA=0; i<numLeavesB; i++) {
      const PetscInt l = localPointsB ? localPointsB[i] : i;

      if (leafdataB[l-minleaf].rank == -1) continue;
      remotePointsBA[numLeavesBA] = leafdataB[l-minleaf];
      localPointsBA[numLeavesBA] = l;
      numLeavesBA++;
    }
    ierr = PetscFree(leafdataB);CHKERRQ(ierr);
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2032 
/*@
  PetscSFComposeInverse - Compose a new PetscSF by putting the inverse of the second SF under the first one

  Input Parameters:
+ sfA - The first PetscSF
- sfB - The second PetscSF

  Output Parameters:
. sfBA - The composite SF.

  Level: developer

  Notes:
  Currently, the two SFs must be defined on congruent communicators and they must be true star
  forests, i.e. the same leaf is not connected with different roots. Even more, all roots of the
  second SF must have a degree of 1, i.e., no roots have more than one leaf connected.

  sfA's leaf space and sfB's leaf space might be partially overlapped. The composition builds
  a graph with sfA's roots and sfB's roots only when there is a path between them. Unconnected
  roots are not in sfBA. Doing a Bcast on the new SF is equivalent to doing a Bcast on sfA, then
  a Reduce on sfB, on connected roots.

.seealso: PetscSF, PetscSFCompose(), PetscSFGetGraph(), PetscSFSetGraph(), PetscSFCreateInverseSF()
@*/
PetscErrorCode PetscSFComposeInverse(PetscSF sfA,PetscSF sfB,PetscSF *sfBA)
{
  PetscErrorCode    ierr;
  const PetscSFNode *remotePointsA,*remotePointsB;
  PetscSFNode       *remotePointsBA;
  const PetscInt    *localPointsA,*localPointsB;
  PetscSFNode       *reorderedRemotePointsA = NULL;
  PetscInt          i,numRootsA,numLeavesA,numLeavesBA,numRootsB,numLeavesB,minleaf,maxleaf,*localPointsBA;
  MPI_Op            op;
#if defined(PETSC_USE_64BIT_INDICES)
  PetscBool         iswin;
#endif

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfA,PETSCSF_CLASSID,1);
  PetscSFCheckGraphSet(sfA,1);
  PetscValidHeaderSpecific(sfB,PETSCSF_CLASSID,2);
  PetscSFCheckGraphSet(sfB,2);
  PetscCheckSameComm(sfA,1,sfB,2);
  PetscValidPointer(sfBA,3);
  /* Both inputs must be true star forests: no leaf may appear twice (checked in debug builds) */
  ierr = PetscSFCheckLeavesUnique_Private(sfA);CHKERRQ(ierr);
  ierr = PetscSFCheckLeavesUnique_Private(sfB);CHKERRQ(ierr);

  ierr = PetscSFGetGraph(sfA, &numRootsA, &numLeavesA, &localPointsA, &remotePointsA);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sfB, &numRootsB, &numLeavesB, &localPointsB, &remotePointsB);CHKERRQ(ierr);

  /* TODO: Check roots of sfB have degree of 1 */
  /* Once we implement it, we can replace the MPI_MAXLOC
     with MPI_REPLACE. In that case, MPI_MAXLOC and MPI_REPLACE have the same effect.
     We use MPI_MAXLOC only to have a deterministic output from this routine if
     the root condition is not meet.
   */
  op = MPI_MAXLOC;
#if defined(PETSC_USE_64BIT_INDICES)
  /* we accept a non-deterministic output (if any) with PETSCSFWINDOW, since MPI_MAXLOC cannot operate on MPIU_2INT with MPI_Accumulate */
  ierr = PetscObjectTypeCompare((PetscObject)sfB,PETSCSFWINDOW,&iswin);CHKERRQ(ierr);
  if (iswin) op = MPI_REPLACE;
#endif

  /* Lay sfA's remote points out over sfB's leaf range [minleaf,maxleaf], marking slots not
     covered by any leaf of sfA with rank/index -1 */
  ierr = PetscSFGetLeafRange(sfB, &minleaf, &maxleaf);CHKERRQ(ierr);
  ierr = PetscMalloc1(maxleaf - minleaf + 1,&reorderedRemotePointsA);CHKERRQ(ierr);
  for (i=0; i<maxleaf - minleaf + 1; i++) {
    reorderedRemotePointsA[i].rank = -1;
    reorderedRemotePointsA[i].index = -1;
  }
  if (localPointsA) {
    for (i=0; i<numLeavesA; i++) {
      if (localPointsA[i] > maxleaf || localPointsA[i] < minleaf) continue; /* leaf of sfA outside sfB's leaf range */
      reorderedRemotePointsA[localPointsA[i] - minleaf] = remotePointsA[i];
    }
  } else {
    /* localPointsA == NULL: sfA's leaves are contiguous, leaf i has index i */
    for (i=0; i<numLeavesA; i++) {
      if (i > maxleaf || i < minleaf) continue;
      reorderedRemotePointsA[i - minleaf] = remotePointsA[i];
    }
  }

  ierr = PetscMalloc1(numRootsB,&localPointsBA);CHKERRQ(ierr);
  ierr = PetscMalloc1(numRootsB,&remotePointsBA);CHKERRQ(ierr);
  for (i=0; i<numRootsB; i++) {
    remotePointsBA[i].rank = -1;
    remotePointsBA[i].index = -1;
  }

  /* Push sfA's root locations up from sfB's leaves to sfB's roots; reorderedRemotePointsA-minleaf
     lets the buffer be indexed by absolute leaf number. Roots of sfB that receive nothing keep
     rank == -1 and are dropped below. */
  ierr = PetscSFReduceBegin(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscSFReduceEnd(sfB,MPIU_2INT,reorderedRemotePointsA-minleaf,remotePointsBA,op);CHKERRQ(ierr);
  ierr = PetscFree(reorderedRemotePointsA);CHKERRQ(ierr);
  /* Compact in place: keep only connected roots of sfB as leaves of the composite SF */
  for (i=0,numLeavesBA=0; i<numRootsB; i++) {
    if (remotePointsBA[i].rank == -1) continue;
    remotePointsBA[numLeavesBA].rank = remotePointsBA[i].rank;
    remotePointsBA[numLeavesBA].index = remotePointsBA[i].index;
    localPointsBA[numLeavesBA] = i;
    numLeavesBA++;
  }
  ierr = PetscSFCreate(PetscObjectComm((PetscObject)sfA),sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetFromOptions(*sfBA);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*sfBA,numRootsA,numLeavesBA,localPointsBA,PETSC_OWN_POINTER,remotePointsBA,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2136 
2137 /*
2138   PetscSFCreateLocalSF_Private - Creates a local PetscSF that only has intra-process edges of the global PetscSF
2139 
2140   Input Parameters:
2141 . sf - The global PetscSF
2142 
2143   Output Parameters:
2144 . out - The local PetscSF
2145  */
2146 PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF sf,PetscSF *out)
2147 {
2148   MPI_Comm           comm;
2149   PetscMPIInt        myrank;
2150   const PetscInt     *ilocal;
2151   const PetscSFNode  *iremote;
2152   PetscInt           i,j,nroots,nleaves,lnleaves,*lilocal;
2153   PetscSFNode        *liremote;
2154   PetscSF            lsf;
2155   PetscErrorCode     ierr;
2156 
2157   PetscFunctionBegin;
2158   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2159   if (sf->ops->CreateLocalSF) {
2160     ierr = (*sf->ops->CreateLocalSF)(sf,out);CHKERRQ(ierr);
2161   } else {
2162     /* Could use PetscSFCreateEmbeddedLeafSF, but since we know the comm is PETSC_COMM_SELF, we can make it fast */
2163     ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
2164     ierr = MPI_Comm_rank(comm,&myrank);CHKERRMPI(ierr);
2165 
2166     /* Find out local edges and build a local SF */
2167     ierr = PetscSFGetGraph(sf,&nroots,&nleaves,&ilocal,&iremote);CHKERRQ(ierr);
2168     for (i=lnleaves=0; i<nleaves; i++) {if (iremote[i].rank == (PetscInt)myrank) lnleaves++;}
2169     ierr = PetscMalloc1(lnleaves,&lilocal);CHKERRQ(ierr);
2170     ierr = PetscMalloc1(lnleaves,&liremote);CHKERRQ(ierr);
2171 
2172     for (i=j=0; i<nleaves; i++) {
2173       if (iremote[i].rank == (PetscInt)myrank) {
2174         lilocal[j]        = ilocal? ilocal[i] : i; /* ilocal=NULL for contiguous storage */
2175         liremote[j].rank  = 0; /* rank in PETSC_COMM_SELF */
2176         liremote[j].index = iremote[i].index;
2177         j++;
2178       }
2179     }
2180     ierr = PetscSFCreate(PETSC_COMM_SELF,&lsf);CHKERRQ(ierr);
2181     ierr = PetscSFSetFromOptions(lsf);CHKERRQ(ierr);
2182     ierr = PetscSFSetGraph(lsf,nroots,lnleaves,lilocal,PETSC_OWN_POINTER,liremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
2183     ierr = PetscSFSetUp(lsf);CHKERRQ(ierr);
2184     *out = lsf;
2185   }
2186   PetscFunctionReturn(0);
2187 }
2188 
2189 /* Similar to PetscSFBcast, but only Bcast to leaves on rank 0 */
2190 PetscErrorCode PetscSFBcastToZero_Private(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
2191 {
2192   PetscErrorCode     ierr;
2193   PetscMemType       rootmtype,leafmtype;
2194 
2195   PetscFunctionBegin;
2196   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
2197   ierr = PetscSFSetUp(sf);CHKERRQ(ierr);
2198   ierr = PetscLogEventBegin(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
2199   ierr = PetscGetMemType(rootdata,&rootmtype);CHKERRQ(ierr);
2200   ierr = PetscGetMemType(leafdata,&leafmtype);CHKERRQ(ierr);
2201   if (sf->ops->BcastToZero) {
2202     ierr = (*sf->ops->BcastToZero)(sf,unit,rootmtype,rootdata,leafmtype,leafdata);CHKERRQ(ierr);
2203   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"PetscSFBcastToZero_Private is not supported on this SF type");
2204   ierr = PetscLogEventEnd(PETSCSF_BcastBegin,sf,0,0,0);CHKERRQ(ierr);
2205   PetscFunctionReturn(0);
2206 }
2207 
2208