#include <petsc/private/sfimpl.h> /*I  "petscsf.h"   I*/
#include <petsc/private/sectionimpl.h>

/*@
  PetscSFSetGraphLayout - Set a parallel star forest via global indices and a `PetscLayout`

  Collective

  Input Parameters:
+ sf        - star forest
. layout    - `PetscLayout` defining the global space for roots
. nleaves   - number of leaf vertices on the current process, each of these references a root on any process
. ilocal    - locations of leaves in leafdata buffers, pass NULL for contiguous storage
. localmode - copy mode for ilocal
- gremote   - root vertices in global numbering corresponding to leaves in ilocal

  Level: intermediate

  Note:
  Global indices must lie in [0, N) where N is the global size of layout.
  Leaf indices in ilocal get sorted; this means the user-provided array gets sorted if localmode is `PETSC_OWN_POINTER`.

  Developer Notes:
  Local indices which are the identity permutation in the range [0,nleaves) are discarded as they
  encode contiguous storage. In such case, if localmode is `PETSC_OWN_POINTER`, the memory is deallocated as it is not
  needed

.seealso: `PetscSF`, `PetscSFGetGraphLayout()`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
@*/
PetscErrorCode PetscSFSetGraphLayout(PetscSF sf, PetscLayout layout, PetscInt nleaves, PetscInt *ilocal, PetscCopyMode localmode, const PetscInt *gremote)
{
  const PetscInt *range;
  PetscInt        i, nroots, ls = -1, ln = -1;
  PetscMPIInt     lr = -1;
  PetscSFNode    *remote;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscCall(PetscLayoutSetUp(layout));
  PetscCall(PetscLayoutGetLocalSize(layout, &nroots));
  PetscCall(PetscLayoutGetRanges(layout, &range));
  PetscCall(PetscMalloc1(nleaves, &remote));
  if (nleaves) ls = gremote[0] + 1;
  for (i = 0; i < nleaves; i++) {
    const PetscInt idx = gremote[i] - ls;
    if (idx < 0 || idx >= ln) { /* short-circuit the search */
      PetscCall(PetscLayoutFindOwnerIndex(layout, gremote[i], &lr, &remote[i].index));
      remote[i].rank = lr;
      ls             = range[lr];
      ln             = range[lr + 1] - ls;
    } else {
      remote[i].rank  = lr;
      remote[i].index = idx;
    }
  }
  PetscCall(PetscSFSetGraph(sf, nroots, nleaves, ilocal, localmode, remote, PETSC_OWN_POINTER));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscSFGetGraphLayout - Get the global indices and `PetscLayout` that describe this star forest

  Collective

  Input Parameter:
. sf - star forest

  Output Parameters:
+ layout  - `PetscLayout` defining the global space for roots
. nleaves - number of leaf vertices on the current process, each of these references a root on any process
. ilocal  - locations of leaves in leafdata buffers, or `NULL` for contiguous storage
- gremote - root vertices in global numbering corresponding to leaves in ilocal

  Level: intermediate

  Notes:
  The outputs are such that passing them as inputs to `PetscSFSetGraphLayout()` would lead to the same star forest.
  The outputs `layout` and `gremote` are freshly created each time this function is called,
  so they need to be freed (with `PetscLayoutDestroy()` and `PetscFree()`) by the user.

.seealso: `PetscSF`, `PetscSFSetGraphLayout()`, `PetscSFCreate()`, `PetscSFView()`, `PetscSFSetGraph()`, `PetscSFGetGraph()`
@*/
PetscErrorCode PetscSFGetGraphLayout(PetscSF sf, PetscLayout *layout, PetscInt *nleaves, const PetscInt *ilocal[], PetscInt *gremote[])
{
  PetscInt           nr, nl;
  const PetscSFNode *ir;
  PetscLayout        lt;

  PetscFunctionBegin;
  PetscCall(PetscSFGetGraph(sf, &nr, &nl, ilocal, &ir));
  PetscCall(PetscLayoutCreateFromSizes(PetscObjectComm((PetscObject)sf), nr, PETSC_DECIDE, 1, &lt));
  if (gremote) {
    PetscInt        i;
    const PetscInt *range;
    PetscInt       *gr;

    PetscCall(PetscLayoutGetRanges(lt, &range));
    PetscCall(PetscMalloc1(nl, &gr));
    for (i = 0; i < nl; i++) gr[i] = range[ir[i].rank] + ir[i].index;
    *gremote = gr;
  }
  if (nleaves) *nleaves = nl;
  if (layout) *layout = lt;
  else PetscCall(PetscLayoutDestroy(&lt));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  PetscSFSetGraphSection - Sets the `PetscSF` graph encoding the parallel dof overlap based upon the `PetscSection` describing the data layout.

  Input Parameters:
+ sf            - The `PetscSF`
. localSection  - `PetscSection` describing the local data layout
- globalSection - `PetscSection` describing the global data layout

  Level: developer

.seealso: `PetscSF`, `PetscSFSetGraph()`, `PetscSFSetGraphLayout()`
@*/
PetscErrorCode PetscSFSetGraphSection(PetscSF sf, PetscSection localSection, PetscSection globalSection)
{
  MPI_Comm        comm;
  PetscLayout     layout;
  const PetscInt *ranges;
  PetscInt       *local;
  PetscSFNode    *remote;
  PetscInt        pStart, pEnd, p, nroots, nleaves = 0, l;
  PetscMPIInt     size, rank;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidHeaderSpecific(localSection, PETSC_SECTION_CLASSID, 2);
  PetscValidHeaderSpecific(globalSection, PETSC_SECTION_CLASSID, 3);

  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(PetscSectionGetChart(globalSection, &pStart, &pEnd));
  PetscCall(PetscSectionGetConstrainedStorageSize(globalSection, &nroots));
  PetscCall(PetscLayoutCreate(comm, &layout));
  PetscCall(PetscLayoutSetBlockSize(layout, 1));
  PetscCall(PetscLayoutSetLocalSize(layout, nroots));
  PetscCall(PetscLayoutSetUp(layout));
  PetscCall(PetscLayoutGetRanges(layout, &ranges));
  for (p = pStart; p < pEnd; ++p) {
    PetscInt gdof, gcdof;

    PetscCall(PetscSectionGetDof(globalSection, p, &gdof));
    PetscCall(PetscSectionGetConstraintDof(globalSection, p, &gcdof));
    PetscCheck(gcdof <= (gdof < 0 ? -(gdof + 1) : gdof), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Point %" PetscInt_FMT " has %" PetscInt_FMT " constraints > %" PetscInt_FMT " dof", p, gcdof, gdof < 0 ? -(gdof + 1) : gdof);
    nleaves += gdof < 0 ? -(gdof + 1) - gcdof : gdof - gcdof;
  }
  PetscCall(PetscMalloc1(nleaves, &local));
  PetscCall(PetscMalloc1(nleaves, &remote));
  for (p = pStart, l = 0; p < pEnd; ++p) {
    const PetscInt *cind;
    PetscInt        dof, cdof, off, gdof, gcdof, goff, gsize, d, c;

    PetscCall(PetscSectionGetDof(localSection, p, &dof));
    PetscCall(PetscSectionGetOffset(localSection, p, &off));
    PetscCall(PetscSectionGetConstraintDof(localSection, p, &cdof));
    PetscCall(PetscSectionGetConstraintIndices(localSection, p, &cind));
    PetscCall(PetscSectionGetDof(globalSection, p, &gdof));
    PetscCall(PetscSectionGetConstraintDof(globalSection, p, &gcdof));
    PetscCall(PetscSectionGetOffset(globalSection, p, &goff));
    if (!gdof) continue; /* Censored point */
    gsize = gdof < 0 ? -(gdof + 1) - gcdof : gdof - gcdof;
    if (gsize != dof - cdof) {
      PetscCheck(gsize == dof, comm, PETSC_ERR_ARG_WRONG, "Global dof %" PetscInt_FMT " for point %" PetscInt_FMT " is neither the constrained size %" PetscInt_FMT ", nor the unconstrained %" PetscInt_FMT, gsize, p, dof - cdof, dof);
      cdof = 0; /* Ignore constraints */
    }
    for (d = 0, c = 0; d < dof; ++d) {
      if ((c < cdof) && (cind[c] == d)) {
        ++c;
        continue;
      }
      local[l + d - c] = off + d;
    }
    PetscCheck(d - c == gsize, comm, PETSC_ERR_ARG_WRONG, "Point %" PetscInt_FMT ": Global dof %" PetscInt_FMT " != %" PetscInt_FMT " size - number of constraints", p, gsize, d - c);
    if (gdof < 0) {
      for (d = 0; d < gsize; ++d, ++l) {
        PetscInt    offset = -(goff + 1) + d, ir;
        PetscMPIInt r;

        PetscCall(PetscFindInt(offset, size + 1, ranges, &ir));
        PetscCall(PetscMPIIntCast(ir, &r));
        if (r < 0) r = -(r + 2);
        PetscCheck(!(r < 0) && !(r >= size), PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Point %" PetscInt_FMT " mapped to invalid process %d (%" PetscInt_FMT ", %" PetscInt_FMT ")", p, r, gdof, goff);
        remote[l].rank  = r;
        remote[l].index = offset - ranges[r];
      }
    } else {
      for (d = 0; d < gsize; ++d, ++l) {
        remote[l].rank  = rank;
        remote[l].index = goff + d - ranges[rank];
      }
    }
  }
  PetscCheck(l == nleaves, comm, PETSC_ERR_PLIB, "Iteration error, l %" PetscInt_FMT " != nleaves %" PetscInt_FMT, l, nleaves);
  PetscCall(PetscLayoutDestroy(&layout));
  PetscCall(PetscSFSetGraph(sf, nroots, nleaves, local, PETSC_OWN_POINTER, remote, PETSC_OWN_POINTER));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscSFDistributeSection - Create a new `PetscSection` reorganized, moving from the root to the leaves of the `PetscSF`

  Collective

  Input Parameters:
+ sf          - The `PetscSF`
- rootSection - Section defined on root space

  Output Parameters:
+ remoteOffsets - root offsets in leaf storage, or `NULL`, its length will be the size of the chart of `leafSection`
- leafSection   - Section defined on the leaf space

  Level: advanced

  Note:
  Caller must `PetscFree()` `remoteOffsets` if it was requested

  Fortran Note:
  Use `PetscSFDestroyRemoteOffsets()` when `remoteOffsets` is no longer needed.

.seealso: `PetscSF`, `PetscSFCreate()`
@*/
PetscErrorCode PetscSFDistributeSection(PetscSF sf, PetscSection rootSection, PetscInt *remoteOffsets[], PetscSection leafSection)
{
  PetscSF         embedSF;
  const PetscInt *indices;
  IS              selected;
  PetscInt        numFields, nroots, rpStart, rpEnd, lpStart = PETSC_INT_MAX, lpEnd = -1, f, c;
  PetscBool      *sub, hasc;

  PetscFunctionBegin;
  PetscCall(PetscLogEventBegin(PETSCSF_DistSect, sf, 0, 0, 0));
  PetscCall(PetscSectionGetNumFields(rootSection, &numFields));
  if (numFields) {
    IS perm;

    /* PetscSectionSetNumFields() calls PetscSectionReset(), which destroys
       leafSection->perm. To keep this permutation set by the user, we grab
       the reference before calling PetscSectionSetNumFields() and set it
       back after. */
    PetscCall(PetscSectionGetPermutation(leafSection, &perm));
    PetscCall(PetscObjectReference((PetscObject)perm));
    PetscCall(PetscSectionSetNumFields(leafSection, numFields));
    PetscCall(PetscSectionSetPermutation(leafSection, perm));
    PetscCall(ISDestroy(&perm));
  }
  PetscCall(PetscMalloc1(numFields + 2, &sub));
  sub[1] = rootSection->bc ? PETSC_TRUE : PETSC_FALSE;
  for (f = 0; f < numFields; ++f) {
    PetscSectionSym sym, dsym = NULL;
    const char     *name    = NULL;
    PetscInt        numComp = 0;

    sub[2 + f] = rootSection->field[f]->bc ? PETSC_TRUE : PETSC_FALSE;
    PetscCall(PetscSectionGetFieldComponents(rootSection, f, &numComp));
    PetscCall(PetscSectionGetFieldName(rootSection, f, &name));
    PetscCall(PetscSectionGetFieldSym(rootSection, f, &sym));
    if (sym) PetscCall(PetscSectionSymDistribute(sym, sf, &dsym));
    PetscCall(PetscSectionSetFieldComponents(leafSection, f, numComp));
    PetscCall(PetscSectionSetFieldName(leafSection, f, name));
    PetscCall(PetscSectionSetFieldSym(leafSection, f, dsym));
    PetscCall(PetscSectionSymDestroy(&dsym));
    for (c = 0; c < rootSection->numFieldComponents[f]; ++c) {
      PetscCall(PetscSectionGetComponentName(rootSection, f, c, &name));
      PetscCall(PetscSectionSetComponentName(leafSection, f, c, name));
    }
  }
  PetscCall(PetscSectionGetChart(rootSection, &rpStart, &rpEnd));
  PetscCall(PetscSFGetGraph(sf, &nroots, NULL, NULL, NULL));
  rpEnd = PetscMin(rpEnd, nroots);
  rpEnd = PetscMax(rpStart, rpEnd);
  /* see if we can avoid creating the embedded SF, since it can cost more than an allreduce */
  sub[0] = (PetscBool)(nroots != rpEnd - rpStart);
  PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, sub, 2 + numFields, MPI_C_BOOL, MPI_LOR, PetscObjectComm((PetscObject)sf)));
  if (sub[0]) {
    PetscCall(ISCreateStride(PETSC_COMM_SELF, rpEnd - rpStart, rpStart, 1, &selected));
    PetscCall(ISGetIndices(selected, &indices));
    PetscCall(PetscSFCreateEmbeddedRootSF(sf, rpEnd - rpStart, indices, &embedSF));
    PetscCall(ISRestoreIndices(selected, &indices));
    PetscCall(ISDestroy(&selected));
  } else {
    PetscCall(PetscObjectReference((PetscObject)sf));
    embedSF = sf;
  }
  PetscCall(PetscSFGetLeafRange(embedSF, &lpStart, &lpEnd));
  lpEnd++;

  PetscCall(PetscSectionSetChart(leafSection, lpStart, lpEnd));

  /* Constrained dof section */
  hasc = sub[1];
  for (f = 0; f < numFields; ++f) hasc = (PetscBool)(hasc || sub[2 + f]);

  /* Could fuse these at the cost of copies and extra allocation */
  PetscCall(PetscSFBcastBegin(embedSF, MPIU_INT, PetscSafePointerPlusOffset(rootSection->atlasDof, -rpStart), PetscSafePointerPlusOffset(leafSection->atlasDof, -lpStart), MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(embedSF, MPIU_INT, PetscSafePointerPlusOffset(rootSection->atlasDof, -rpStart), PetscSafePointerPlusOffset(leafSection->atlasDof, -lpStart), MPI_REPLACE));
  if (sub[1]) {
    PetscCall(PetscSectionCheckConstraints_Private(rootSection));
    PetscCall(PetscSectionCheckConstraints_Private(leafSection));
    PetscCall(PetscSFBcastBegin(embedSF, MPIU_INT, &rootSection->bc->atlasDof[-rpStart], &leafSection->bc->atlasDof[-lpStart], MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(embedSF, MPIU_INT, &rootSection->bc->atlasDof[-rpStart], &leafSection->bc->atlasDof[-lpStart], MPI_REPLACE));
  }
  for (f = 0; f < numFields; ++f) {
    PetscCall(PetscSFBcastBegin(embedSF, MPIU_INT, PetscSafePointerPlusOffset(rootSection->field[f]->atlasDof, -rpStart), PetscSafePointerPlusOffset(leafSection->field[f]->atlasDof, -lpStart), MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(embedSF, MPIU_INT, PetscSafePointerPlusOffset(rootSection->field[f]->atlasDof, -rpStart), PetscSafePointerPlusOffset(leafSection->field[f]->atlasDof, -lpStart), MPI_REPLACE));
    if (sub[2 + f]) {
      PetscCall(PetscSectionCheckConstraints_Private(rootSection->field[f]));
      PetscCall(PetscSectionCheckConstraints_Private(leafSection->field[f]));
      PetscCall(PetscSFBcastBegin(embedSF, MPIU_INT, &rootSection->field[f]->bc->atlasDof[-rpStart], &leafSection->field[f]->bc->atlasDof[-lpStart], MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(embedSF, MPIU_INT, &rootSection->field[f]->bc->atlasDof[-rpStart], &leafSection->field[f]->bc->atlasDof[-lpStart], MPI_REPLACE));
    }
  }
  if (remoteOffsets) {
    PetscCall(PetscMalloc1(lpEnd - lpStart, remoteOffsets));
    PetscCall(PetscSFBcastBegin(embedSF, MPIU_INT, PetscSafePointerPlusOffset(rootSection->atlasOff, -rpStart), PetscSafePointerPlusOffset(*remoteOffsets, -lpStart), MPI_REPLACE));
    PetscCall(PetscSFBcastEnd(embedSF, MPIU_INT, PetscSafePointerPlusOffset(rootSection->atlasOff, -rpStart), PetscSafePointerPlusOffset(*remoteOffsets, -lpStart), MPI_REPLACE));
  }
  PetscCall(PetscSectionInvalidateMaxDof_Internal(leafSection));
  PetscCall(PetscSectionSetUp(leafSection));
  if (hasc) { /* need to communicate bcIndices */
    PetscSF   bcSF;
    PetscInt *rOffBc;

    PetscCall(PetscMalloc1(lpEnd - lpStart, &rOffBc));
    if (sub[1]) {
      PetscCall(PetscSFBcastBegin(embedSF, MPIU_INT, &rootSection->bc->atlasOff[-rpStart], &rOffBc[-lpStart], MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(embedSF, MPIU_INT, &rootSection->bc->atlasOff[-rpStart], &rOffBc[-lpStart], MPI_REPLACE));
      PetscCall(PetscSFCreateSectionSF(embedSF, rootSection->bc, rOffBc, leafSection->bc, &bcSF));
      PetscCall(PetscSFBcastBegin(bcSF, MPIU_INT, rootSection->bcIndices, leafSection->bcIndices, MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bcSF, MPIU_INT, rootSection->bcIndices, leafSection->bcIndices, MPI_REPLACE));
      PetscCall(PetscSFDestroy(&bcSF));
    }
    for (f = 0; f < numFields; ++f) {
      if (sub[2 + f]) {
        PetscCall(PetscSFBcastBegin(embedSF, MPIU_INT, &rootSection->field[f]->bc->atlasOff[-rpStart], &rOffBc[-lpStart], MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(embedSF, MPIU_INT, &rootSection->field[f]->bc->atlasOff[-rpStart], &rOffBc[-lpStart], MPI_REPLACE));
        PetscCall(PetscSFCreateSectionSF(embedSF, rootSection->field[f]->bc, rOffBc, leafSection->field[f]->bc, &bcSF));
        PetscCall(PetscSFBcastBegin(bcSF, MPIU_INT, rootSection->field[f]->bcIndices, leafSection->field[f]->bcIndices, MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(bcSF, MPIU_INT, rootSection->field[f]->bcIndices, leafSection->field[f]->bcIndices, MPI_REPLACE));
        PetscCall(PetscSFDestroy(&bcSF));
      }
    }
    PetscCall(PetscFree(rOffBc));
  }
  PetscCall(PetscSFDestroy(&embedSF));
  PetscCall(PetscFree(sub));
  PetscCall(PetscLogEventEnd(PETSCSF_DistSect, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscSFCreateRemoteOffsets - Create offsets for point data on remote processes

  Collective

  Input Parameters:
+ sf          - The `PetscSF`
. rootSection - Data layout of remote points for outgoing data (this is layout for roots)
- leafSection - Data layout of local points for incoming data  (this is layout for leaves)

  Output Parameter:
. remoteOffsets - Offsets for point data on remote processes (these are offsets from the root section), or `NULL`

  Level: developer

  Note:
  Caller must `PetscFree()` `remoteOffsets` if it was requested

  Fortran Note:
  Use `PetscSFDestroyRemoteOffsets()` when `remoteOffsets` is no longer needed.

.seealso: `PetscSF`, `PetscSFCreate()`
@*/
PetscErrorCode PetscSFCreateRemoteOffsets(PetscSF sf, PetscSection rootSection, PetscSection leafSection, PetscInt *remoteOffsets[])
{
  PetscSF         embedSF;
  const PetscInt *indices;
  IS              selected;
  PetscInt        numRoots, rpStart = 0, rpEnd = 0, lpStart = 0, lpEnd = 0;

  PetscFunctionBegin;
  *remoteOffsets = NULL;
  PetscCall(PetscSFGetGraph(sf, &numRoots, NULL, NULL, NULL));
  if (numRoots < 0) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(PETSCSF_RemoteOff, sf, 0, 0, 0));
  PetscCall(PetscSectionGetChart(rootSection, &rpStart, &rpEnd));
  PetscCall(PetscSectionGetChart(leafSection, &lpStart, &lpEnd));
  PetscCall(ISCreateStride(PETSC_COMM_SELF, rpEnd - rpStart, rpStart, 1, &selected));
  PetscCall(ISGetIndices(selected, &indices));
  PetscCall(PetscSFCreateEmbeddedRootSF(sf, rpEnd - rpStart, indices, &embedSF));
  PetscCall(ISRestoreIndices(selected, &indices));
  PetscCall(ISDestroy(&selected));
  PetscCall(PetscCalloc1(lpEnd - lpStart, remoteOffsets));
  PetscCall(PetscSFBcastBegin(embedSF, MPIU_INT, PetscSafePointerPlusOffset(rootSection->atlasOff, -rpStart), PetscSafePointerPlusOffset(*remoteOffsets, -lpStart), MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(embedSF, MPIU_INT, PetscSafePointerPlusOffset(rootSection->atlasOff, -rpStart), PetscSafePointerPlusOffset(*remoteOffsets, -lpStart), MPI_REPLACE));
  PetscCall(PetscSFDestroy(&embedSF));
  PetscCall(PetscLogEventEnd(PETSCSF_RemoteOff, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  PetscSFCreateSectionSF - Create an expanded `PetscSF` of dofs, assuming the input `PetscSF` relates points

  Collective

  Input Parameters:
+ sf            - The `PetscSF`
. rootSection   - Data layout of remote points for outgoing data (this is usually the serial section)
. remoteOffsets - Offsets for point data on remote processes (these are offsets from the root section), or NULL
- leafSection   - Data layout of local points for incoming data  (this is the distributed section)

  Output Parameter:
. sectionSF - The new `PetscSF`

  Level: advanced

  Notes:
  `remoteOffsets` can be `NULL` if `sf` does not reference any points in leafSection

.seealso: `PetscSF`, `PetscSFCreate()`
@*/
PetscErrorCode PetscSFCreateSectionSF(PetscSF sf, PetscSection rootSection, PetscInt remoteOffsets[], PetscSection leafSection, PetscSF *sectionSF)
{
  MPI_Comm           comm;
  const PetscInt    *localPoints;
  const PetscSFNode *remotePoints;
  PetscInt           lpStart, lpEnd;
  PetscInt           numRoots, numSectionRoots, numPoints, numIndices = 0;
  PetscInt          *localIndices;
  PetscSFNode       *remoteIndices;
  PetscInt           i, ind;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscAssertPointer(rootSection, 2);
  /* Cannot check PetscAssertPointer(remoteOffsets,3) because it can be NULL if sf does not reference any points in leafSection */
  PetscAssertPointer(leafSection, 4);
  PetscAssertPointer(sectionSF, 5);
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCall(PetscSFCreate(comm, sectionSF));
  PetscCall(PetscSectionGetChart(leafSection, &lpStart, &lpEnd));
  PetscCall(PetscSectionGetStorageSize(rootSection, &numSectionRoots));
  PetscCall(PetscSFGetGraph(sf, &numRoots, &numPoints, &localPoints, &remotePoints));
  if (numRoots < 0) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(PETSCSF_SectSF, sf, 0, 0, 0));
  for (i = 0; i < numPoints; ++i) {
    PetscInt localPoint = localPoints ? localPoints[i] : i;
    PetscInt dof;

    if ((localPoint >= lpStart) && (localPoint < lpEnd)) {
      PetscCall(PetscSectionGetDof(leafSection, localPoint, &dof));
      numIndices += dof < 0 ? 0 : dof;
    }
  }
  PetscCall(PetscMalloc1(numIndices, &localIndices));
  PetscCall(PetscMalloc1(numIndices, &remoteIndices));
  /* Create new index graph */
  for (i = 0, ind = 0; i < numPoints; ++i) {
    PetscInt localPoint = localPoints ? localPoints[i] : i;
    PetscInt rank       = remotePoints[i].rank;

    if ((localPoint >= lpStart) && (localPoint < lpEnd)) {
      PetscInt remoteOffset = remoteOffsets[localPoint - lpStart];
      PetscInt loff, dof, d;

      PetscCall(PetscSectionGetOffset(leafSection, localPoint, &loff));
      PetscCall(PetscSectionGetDof(leafSection, localPoint, &dof));
      for (d = 0; d < dof; ++d, ++ind) {
        localIndices[ind]        = loff + d;
        remoteIndices[ind].rank  = rank;
        remoteIndices[ind].index = remoteOffset + d;
      }
    }
  }
  PetscCheck(numIndices == ind, comm, PETSC_ERR_PLIB, "Inconsistency in indices, %" PetscInt_FMT " should be %" PetscInt_FMT, ind, numIndices);
  PetscCall(PetscSFSetGraph(*sectionSF, numSectionRoots, numIndices, localIndices, PETSC_OWN_POINTER, remoteIndices, PETSC_OWN_POINTER));
  PetscCall(PetscSFSetUp(*sectionSF));
  PetscCall(PetscLogEventEnd(PETSCSF_SectSF, sf, 0, 0, 0));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  PetscSFCreateFromLayouts - Creates a parallel star forest mapping two `PetscLayout` objects

  Collective

  Input Parameters:
+ rmap - `PetscLayout` defining the global root space
- lmap - `PetscLayout` defining the global leaf space

  Output Parameter:
. sf - The parallel star forest

  Level: intermediate

.seealso: `PetscSF`, `PetscSFCreate()`, `PetscLayoutCreate()`, `PetscSFSetGraphLayout()`
@*/
PetscErrorCode PetscSFCreateFromLayouts(PetscLayout rmap, PetscLayout lmap, PetscSF *sf)
{
  PetscInt     i, nroots, nleaves = 0;
  PetscInt     rN, lst, len;
  PetscMPIInt  owner = -1;
  PetscSFNode *remote;
  MPI_Comm     rcomm = rmap->comm;
  MPI_Comm     lcomm = lmap->comm;
  PetscMPIInt  flg;

  PetscFunctionBegin;
  PetscAssertPointer(sf, 3);
  PetscCheck(rmap->setupcalled, rcomm, PETSC_ERR_ARG_WRONGSTATE, "Root layout not setup");
  PetscCheck(lmap->setupcalled, lcomm, PETSC_ERR_ARG_WRONGSTATE, "Leaf layout not setup");
  PetscCallMPI(MPI_Comm_compare(rcomm, lcomm, &flg));
  PetscCheck(flg == MPI_CONGRUENT || flg == MPI_IDENT, rcomm, PETSC_ERR_SUP, "cannot map two layouts with non-matching communicators");
  PetscCall(PetscSFCreate(rcomm, sf));
  PetscCall(PetscLayoutGetLocalSize(rmap, &nroots));
  PetscCall(PetscLayoutGetSize(rmap, &rN));
  PetscCall(PetscLayoutGetRange(lmap, &lst, &len));
  PetscCall(PetscMalloc1(len - lst, &remote));
  for (i = lst; i < len && i < rN; i++) {
    if (owner < -1 || i >= rmap->range[owner + 1]) PetscCall(PetscLayoutFindOwner(rmap, i, &owner));
    remote[nleaves].rank  = owner;
    remote[nleaves].index = i - rmap->range[owner];
    nleaves++;
  }
  PetscCall(PetscSFSetGraph(*sf, nroots, nleaves, NULL, PETSC_OWN_POINTER, remote, PETSC_COPY_VALUES));
  PetscCall(PetscFree(remote));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* TODO: handle nooffprocentries like MatZeroRowsMapLocal_Private, since this code is the same */
PetscErrorCode PetscLayoutMapLocal(PetscLayout map, PetscInt N, const PetscInt idxs[], PetscInt *on, PetscInt *oidxs[], PetscInt *ogidxs[])
{
  PetscInt    *owners = map->range;
  PetscInt     n      = map->n;
  PetscSF      sf;
  PetscInt    *lidxs, *work = NULL, *ilocal;
  PetscSFNode *ridxs;
  PetscMPIInt  rank, p = 0;
  PetscInt     r, len = 0, nleaves = 0;

  PetscFunctionBegin;
  if (on) *on = 0; /* squelch -Wmaybe-uninitialized */
  /* Create SF where leaves are input idxs and roots are owned idxs */
  PetscCallMPI(MPI_Comm_rank(map->comm, &rank));
  PetscCall(PetscMalloc1(n, &lidxs));
  for (r = 0; r < n; ++r) lidxs[r] = -1;
  PetscCall(PetscMalloc1(N, &ridxs));
  PetscCall(PetscMalloc1(N, &ilocal));
  for (r = 0; r < N; ++r) {
    const PetscInt idx = idxs[r];

    if (idx < 0) continue;
    PetscCheck(idx < map->N, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "Index %" PetscInt_FMT " out of range [0,%" PetscInt_FMT ")", idx, map->N);
    if (idx < owners[p] || owners[p + 1] <= idx) { /* short-circuit the search if the last p owns this idx too */
      PetscCall(PetscLayoutFindOwner(map, idx, &p));
    }
    ridxs[nleaves].rank  = p;
    ridxs[nleaves].index = idxs[r] - owners[p];
    ilocal[nleaves]      = r;
    nleaves++;
  }
  PetscCall(PetscSFCreate(map->comm, &sf));
  PetscCall(PetscSFSetGraph(sf, n, nleaves, ilocal, PETSC_OWN_POINTER, ridxs, PETSC_OWN_POINTER));
  PetscCall(PetscSFReduceBegin(sf, MPIU_INT, (PetscInt *)idxs, lidxs, MPI_LOR));
  PetscCall(PetscSFReduceEnd(sf, MPIU_INT, (PetscInt *)idxs, lidxs, MPI_LOR));
  if (ogidxs) { /* communicate global idxs */
    PetscInt cum = 0, start, *work2;

    PetscCall(PetscMalloc1(n, &work));
    PetscCall(PetscCalloc1(N, &work2));
    for (r = 0; r < N; ++r)
      if (idxs[r] >= 0) cum++;
    PetscCallMPI(MPI_Scan(&cum, &start, 1, MPIU_INT, MPI_SUM, map->comm));
    start -= cum;
    cum = 0;
    for (r = 0; r < N; ++r)
      if (idxs[r] >= 0) work2[r] = start + cum++;
    PetscCall(PetscSFReduceBegin(sf, MPIU_INT, work2, work, MPI_REPLACE));
    PetscCall(PetscSFReduceEnd(sf, MPIU_INT, work2, work, MPI_REPLACE));
    PetscCall(PetscFree(work2));
  }
  PetscCall(PetscSFDestroy(&sf));
  /* Compress and put in indices */
  for (r = 0; r < n; ++r)
    if (lidxs[r] >= 0) {
      if (work) work[len] = work[r];
      lidxs[len++] = r;
    }
  if (on) *on = len;
  if (oidxs) *oidxs = lidxs;
  if (ogidxs) *ogidxs = work;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  PetscSFCreateByMatchingIndices - Create `PetscSF` by matching root and leaf indices

  Collective

  Input Parameters:
+ layout           - `PetscLayout` defining the global index space and the rank that brokers each index
. numRootIndices   - size of rootIndices
. rootIndices      - `PetscInt` array of global indices of which this process requests ownership
. rootLocalIndices - root local index permutation (NULL if no permutation)
. rootLocalOffset  - offset to be added to root local indices
. numLeafIndices   - size of leafIndices
. leafIndices      - `PetscInt` array of global indices with which this process requires data associated
. leafLocalIndices - leaf local index permutation (NULL if no permutation)
- leafLocalOffset  - offset to be added to leaf local indices

  Output Parameters:
+ sfA - star forest representing the communication pattern from the layout space to the leaf space (NULL if not needed)
- sf  - star forest representing the communication pattern from the root space to the leaf space

  Level: advanced

  Example 1:
.vb
  rank             : 0            1            2
  rootIndices      : [1 0 2]      [3]          [3]
  rootLocalOffset  : 100          200          300
  layout           : [0 1]        [2]          [3]
  leafIndices      : [0]          [2]          [0 3]
  leafLocalOffset  : 400          500          600

would build the following PetscSF

  [0] 400 <- (0,101)
  [1] 500 <- (0,102)
  [2] 600 <- (0,101)
  [2] 601 <- (2,300)
.ve

  Example 2:
.vb
  rank             : 0               1               2
  rootIndices      : [1 0 2]         [3]             [3]
  rootLocalOffset  : 100             200             300
  layout           : [0 1]           [2]             [3]
  leafIndices      : rootIndices     rootIndices     rootIndices
  leafLocalOffset  : rootLocalOffset rootLocalOffset rootLocalOffset

would build the following PetscSF

  [1] 200 <- (2,300)
.ve

  Example 3:
.vb
  No process requests ownership of global index 1, but no process needs it.

  rank             : 0            1            2
  numRootIndices   : 2            1            1
  rootIndices      : [0 2]        [3]          [3]
  rootLocalOffset  : 100          200          300
  layout           : [0 1]        [2]          [3]
  numLeafIndices   : 1            1            2
  leafIndices      : [0]          [2]          [0 3]
  leafLocalOffset  : 400          500          600

would build the following PetscSF

  [0] 400 <- (0,100)
  [1] 500 <- (0,101)
  [2] 600 <- (0,100)
  [2] 601 <- (2,300)
.ve

  Notes:
  The layout parameter represents any partitioning of [0, N), where N is the total number of global indices, and its
  local size can be set to `PETSC_DECIDE`.

  If a global index x lies in the partition owned by process i, each process whose rootIndices contains x requests
  ownership of x and sends its own rank and the local index of x to process i.
  If multiple processes request ownership of x, the one with the highest rank is to own x.
  Process i then broadcasts the ownership information, so that each process whose leafIndices contains x knows the
  ownership information of x.
  The output sf is constructed by associating each leaf point to a root point in this way.

  Suppose there is point data ordered according to the global indices and partitioned according to the given layout.
  The optional output `PetscSF` object sfA can be used to push such data to leaf points.

  All indices in rootIndices and leafIndices must lie in the layout range. The union (over all processes) of rootIndices
  must cover that of leafIndices, but need not cover the entire layout.

  If (leafIndices, leafLocalIndices, leafLocalOffset) == (rootIndices, rootLocalIndices, rootLocalOffset), the output
  star forest is almost identity, so will only include non-trivial part of the map.

  Developer Notes:
  Current approach of a process of the highest rank gaining the ownership may cause load imbalance; consider using
  hash(rank, root_local_index) as the bid for the ownership determination.

.seealso: `PetscSF`, `PetscSFCreate()`
@*/
PetscErrorCode PetscSFCreateByMatchingIndices(PetscLayout layout, PetscInt numRootIndices, const PetscInt *rootIndices, const PetscInt *rootLocalIndices, PetscInt rootLocalOffset, PetscInt numLeafIndices, const PetscInt *leafIndices, const PetscInt *leafLocalIndices, PetscInt leafLocalOffset, PetscSF *sfA, PetscSF *sf)
{
  MPI_Comm     comm = layout->comm;
  PetscMPIInt  size, rank;
  PetscSF      sf1;
  PetscSFNode *owners, *buffer, *iremote;
  PetscInt    *ilocal, nleaves, N, n, i;
#if defined(PETSC_USE_DEBUG)
  PetscInt N1;
#endif
  PetscBool flag;

  PetscFunctionBegin;
  if (rootIndices) PetscAssertPointer(rootIndices, 3);
  if (rootLocalIndices) PetscAssertPointer(rootLocalIndices, 4);
  if (leafIndices) PetscAssertPointer(leafIndices, 7);
  if (leafLocalIndices) PetscAssertPointer(leafLocalIndices, 8);
  if (sfA) PetscAssertPointer(sfA, 10);
  PetscAssertPointer(sf, 11);
  PetscCheck(numRootIndices >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "numRootIndices (%" PetscInt_FMT ") must be non-negative", numRootIndices);
  PetscCheck(numLeafIndices >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "numLeafIndices (%" PetscInt_FMT ") must be non-negative", numLeafIndices);
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(PetscLayoutSetUp(layout));
  PetscCall(PetscLayoutGetSize(layout, &N));
  PetscCall(PetscLayoutGetLocalSize(layout, &n));
  flag = (PetscBool)(leafIndices == rootIndices);
  PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &flag, 1, MPI_C_BOOL, MPI_LAND, comm));
  PetscCheck(!flag || numLeafIndices == numRootIndices, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "leafIndices == rootIndices, but numLeafIndices (%" PetscInt_FMT ") != numRootIndices(%" PetscInt_FMT ")", numLeafIndices, numRootIndices);
#if defined(PETSC_USE_DEBUG)
  N1 = PETSC_INT_MIN;
  for (i = 0; i < numRootIndices; i++)
    if (rootIndices[i] > N1) N1 = rootIndices[i];
  PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &N1, 1, MPIU_INT, MPI_MAX, comm));
  PetscCheck(N1 < N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Max. root index (%" PetscInt_FMT ") out of layout range [0,%" PetscInt_FMT ")", N1, N);
  if (!flag) {
    N1 = PETSC_INT_MIN;
    for (i = 0; i < numLeafIndices; i++)
      if (leafIndices[i] > N1) N1 = leafIndices[i];
    PetscCallMPI(MPIU_Allreduce(MPI_IN_PLACE, &N1, 1, MPIU_INT, MPI_MAX, comm));
    PetscCheck(N1 < N, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Max. leaf index (%" PetscInt_FMT ") out of layout range [0,%" PetscInt_FMT ")", N1, N);
  }
#endif
  /* Reduce: owners -> buffer */
  PetscCall(PetscMalloc1(n, &buffer));
  PetscCall(PetscSFCreate(comm, &sf1));
  PetscCall(PetscSFSetFromOptions(sf1));
  PetscCall(PetscSFSetGraphLayout(sf1, layout, numRootIndices, NULL, PETSC_OWN_POINTER, rootIndices));
  PetscCall(PetscMalloc1(numRootIndices, &owners));
  for (i = 0; i < numRootIndices; ++i) {
    owners[i].rank  = rank;
    owners[i].index = rootLocalOffset + (rootLocalIndices ? rootLocalIndices[i] : i);
  }
  for (i = 0; i < n; ++i) {
    buffer[i].index = -1;
    buffer[i].rank  = -1;
  }
  PetscCall(PetscSFReduceBegin(sf1, MPIU_SF_NODE, owners, buffer, MPI_MAXLOC));
  PetscCall(PetscSFReduceEnd(sf1, MPIU_SF_NODE, owners, buffer, MPI_MAXLOC));
  /* Bcast: buffer -> owners */
  if (!flag) {
    /* leafIndices is different from rootIndices */
    PetscCall(PetscFree(owners));
    PetscCall(PetscSFSetGraphLayout(sf1, layout, numLeafIndices, NULL, PETSC_OWN_POINTER, leafIndices));
    PetscCall(PetscMalloc1(numLeafIndices, &owners));
  }
  PetscCall(PetscSFBcastBegin(sf1, MPIU_SF_NODE, buffer, owners, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(sf1, MPIU_SF_NODE, buffer, owners, MPI_REPLACE));
  for (i = 0; i < numLeafIndices; ++i) PetscCheck(owners[i].rank >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Global point %" PetscInt_FMT " was unclaimed", leafIndices[i]);
  PetscCall(PetscFree(buffer));
  if (sfA) {
    *sfA = sf1;
  } else PetscCall(PetscSFDestroy(&sf1));
  /* Create sf */
  if (flag && rootLocalIndices == leafLocalIndices && leafLocalOffset == rootLocalOffset) {
    /* leaf space == root space */
    for (i = 0, nleaves = 0; i < numLeafIndices; ++i)
      if (owners[i].rank != rank) ++nleaves;
    PetscCall(PetscMalloc1(nleaves, &ilocal));
    PetscCall(PetscMalloc1(nleaves, &iremote));
    for (i = 0, nleaves = 0; i < numLeafIndices; ++i) {
      if (owners[i].rank != rank) {
        ilocal[nleaves]        = leafLocalOffset + i;
        iremote[nleaves].rank  = owners[i].rank;
        iremote[nleaves].index = owners[i].index;
        ++nleaves;
      }
    }
    PetscCall(PetscFree(owners));
  } else {
    nleaves = numLeafIndices;
    PetscCall(PetscMalloc1(nleaves, &ilocal));
    for (i = 0; i < nleaves; ++i) ilocal[i] = leafLocalOffset + (leafLocalIndices ? leafLocalIndices[i] : i);
    iremote = owners;
  }
  PetscCall(PetscSFCreate(comm, sf));
  PetscCall(PetscSFSetFromOptions(*sf));
  PetscCall(PetscSFSetGraph(*sf, rootLocalOffset + numRootIndices, nleaves, ilocal, PETSC_OWN_POINTER, iremote, PETSC_OWN_POINTER));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  PetscSFMerge - append/merge indices of `sfb` into `sfa`, with preference for `sfb`

  Collective

  Input Parameters:
+ sfa - default `PetscSF`
- sfb - additional edges to add/replace edges in sfa

  Output Parameter:
. merged - new `PetscSF` with combined edges

  Level: intermediate

.seealso: `PetscSFCompose()`
@*/
PetscErrorCode PetscSFMerge(PetscSF sfa, PetscSF sfb, PetscSF *merged)
{
  PetscInt maxleaf;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sfa, PETSCSF_CLASSID, 1);
  PetscValidHeaderSpecific(sfb, PETSCSF_CLASSID, 2);
  PetscCheckSameComm(sfa, 1, sfb, 2);
  PetscAssertPointer(merged, 3);
  {
    PetscInt aleaf, bleaf;
    PetscCall(PetscSFGetLeafRange(sfa, NULL, &aleaf));
    PetscCall(PetscSFGetLeafRange(sfb, NULL, &bleaf));
    maxleaf = PetscMax(aleaf, bleaf) + 1; // One more than the last index
  }
  PetscInt          *clocal, aroots, aleaves, broots, bleaves;
  PetscSFNode       *cremote;
  const PetscInt    *alocal, *blocal;
  const PetscSFNode *aremote, *bremote;
  PetscCall(PetscMalloc2(maxleaf, &clocal, maxleaf, &cremote));
  for (PetscInt i = 0; i < maxleaf; i++) clocal[i] = -1;
  PetscCall(PetscSFGetGraph(sfa, &aroots, &aleaves, &alocal, &aremote));
  PetscCall(PetscSFGetGraph(sfb, &broots, &bleaves, &blocal, &bremote));
  PetscCheck(aroots == broots, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "Both sfa and sfb must have the same root space");
  for (PetscInt i = 0; i < aleaves; i++) {
    PetscInt a = alocal ? alocal[i] : i;
    clocal[a]  = a;
    cremote[a] = aremote[i];
  }
  for (PetscInt i = 0; i < bleaves; i++) {
    PetscInt b = blocal ? blocal[i] : i;
    clocal[b]  = b;
    cremote[b] = bremote[i];
  }
  PetscInt nleaves = 0;
  for (PetscInt i = 0; i < maxleaf; i++) {
    if (clocal[i] < 0) continue;
    clocal[nleaves]  = clocal[i];
    cremote[nleaves] = cremote[i];
    nleaves++;
  }
  PetscCall(PetscSFCreate(PetscObjectComm((PetscObject)sfa), merged));
  PetscCall(PetscSFSetGraph(*merged, aroots, nleaves, clocal, PETSC_COPY_VALUES, cremote, PETSC_COPY_VALUES));
  PetscCall(PetscFree2(clocal, cremote));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  PetscSFCreateStridedSF - Create an `PetscSF` to communicate interleaved blocks of data

  Collective

  Input Parameters:
+ sf  - star forest
. bs  - stride
. ldr - leading dimension of root space
- ldl - leading dimension of leaf space

  Output Parameter:
. vsf - the new `PetscSF`

  Level: intermediate

  Notes:
  This can be useful to perform communications on blocks of right-hand sides. For example, the calling sequence
.vb
  c_datatype *roots, *leaves;
  for i in [0,bs) do
    PetscSFBcastBegin(sf, mpi_datatype, roots + i*ldr, leaves + i*ldl, op)
    PetscSFBcastEnd(sf, mpi_datatype, roots + i*ldr, leaves + i*ldl, op)
.ve
  is equivalent to
.vb
  c_datatype *roots, *leaves;
  PetscSFCreateStridedSF(sf, bs, ldr, ldl, &vsf)
  PetscSFBcastBegin(vsf, mpi_datatype, roots, leaves, op)
  PetscSFBcastEnd(vsf, mpi_datatype, roots, leaves, op)
.ve

  Developer Notes:
  Should this functionality be handled with a new API instead of creating a new object?

.seealso: `PetscSF`, `PetscSFCreate()`, `PetscSFSetGraph()`
@*/
PetscErrorCode PetscSFCreateStridedSF(PetscSF sf, PetscInt bs, PetscInt ldr, PetscInt ldl, PetscSF *vsf)
{
  PetscSF            rankssf;
  const PetscSFNode *iremote, *sfrremote;
  PetscSFNode       *viremote;
  const PetscInt    *ilocal;
  PetscInt          *vilocal = NULL, *ldrs;
  PetscInt           nranks, nr, nl, vnr, vnl, maxl;
  PetscMPIInt        rank;
  MPI_Comm           comm;
  PetscSFType        sftype;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
  PetscValidLogicalCollectiveInt(sf, bs, 2);
  PetscAssertPointer(vsf, 5);
  if (bs == 1) {
    PetscCall(PetscObjectReference((PetscObject)sf));
    *vsf = sf;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  PetscCall(PetscSFSetUp(sf));
  PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  PetscCall(PetscSFGetGraph(sf, &nr, &nl, &ilocal, &iremote));
  PetscCall(PetscSFGetLeafRange(sf, NULL, &maxl));
  maxl += 1;
  if (ldl == PETSC_DECIDE) ldl = maxl;
  if (ldr == PETSC_DECIDE) ldr = nr;
  PetscCheck(ldr >= nr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid leading dimension %" PetscInt_FMT " must be smaller than number of roots %" PetscInt_FMT, ldr, nr);
  PetscCheck(ldl >= maxl, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid leading dimension %" PetscInt_FMT " must be larger than leaf range %" PetscInt_FMT, ldl, maxl - 1);
  vnr = nr * bs;
  vnl = nl * bs;
  PetscCall(PetscMalloc1(vnl, &viremote));
  PetscCall(PetscMalloc1(vnl, &vilocal));

  /* Communicate root leading dimensions to leaf ranks */
  PetscCall(PetscSFGetRanksSF(sf, &rankssf));
  PetscCall(PetscSFGetGraph(rankssf, NULL, &nranks, NULL, &sfrremote));
  PetscCall(PetscMalloc1(nranks, &ldrs));
  PetscCall(PetscSFBcastBegin(rankssf, MPIU_INT, &ldr, ldrs, MPI_REPLACE));
  PetscCall(PetscSFBcastEnd(rankssf, MPIU_INT, &ldr, ldrs, MPI_REPLACE));

  for (PetscInt i = 0, rold = -1, lda = -1; i < nl; i++) {
    const PetscInt r  = iremote[i].rank;
    const PetscInt ii = iremote[i].index;

    if (r == rank) lda = ldr;
    else if (rold != r) {
      PetscInt j;

      for (j = 0; j < nranks; j++)
        if (sfrremote[j].rank == r) break;
      PetscCheck(j < nranks, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unable to locate neighbor rank %" PetscInt_FMT, r);
      lda = ldrs[j];
    }
    rold = r;
    for (PetscInt v = 0; v < bs; v++) {
      viremote[v * nl + i].rank  = r;
      viremote[v * nl + i].index = v * lda + ii;
      vilocal[v * nl + i]        = v * ldl + (ilocal ? ilocal[i] : i);
    }
  }
  PetscCall(PetscFree(ldrs));
  PetscCall(PetscSFCreate(comm, vsf));
  PetscCall(PetscSFGetType(sf, &sftype));
  PetscCall(PetscSFSetType(*vsf, sftype));
  PetscCall(PetscSFSetGraph(*vsf, vnr, vnl, vilocal, PETSC_OWN_POINTER, viremote, PETSC_OWN_POINTER));
  PetscFunctionReturn(PETSC_SUCCESS);
}
