xref: /petsc/src/vec/is/sf/impls/basic/sfpack.c (revision 511e6246d5295e363da9526c37770e41b9ae28b6)
120c24465SJunchao Zhang #include "petsc/private/sfimpl.h"
240e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfpack.h>
340e23c03SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
440e23c03SJunchao Zhang 
57fd2d3dbSJunchao Zhang /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
67fd2d3dbSJunchao Zhang 
740e23c03SJunchao Zhang /*
840e23c03SJunchao Zhang  * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
940e23c03SJunchao Zhang  * therefore we pack data types manually. This file defines packing routines for the standard data types.
1040e23c03SJunchao Zhang  */
1140e23c03SJunchao Zhang 
12cd620004SJunchao Zhang #define CPPJoin4(a, b, c, d) a##_##b##_##c##_##d
1340e23c03SJunchao Zhang 
14cd620004SJunchao Zhang /* Operations working like s += t */
159371c9d4SSatish Balay #define OP_BINARY(op, s, t) \
16d71ae5a4SJacob Faibussowitsch   do { \
17d71ae5a4SJacob Faibussowitsch     (s) = (s)op(t); \
18d71ae5a4SJacob Faibussowitsch   } while (0) /* binary ops in the middle such as +, *, && etc. */
199371c9d4SSatish Balay #define OP_FUNCTION(op, s, t) \
20d71ae5a4SJacob Faibussowitsch   do { \
21d71ae5a4SJacob Faibussowitsch     (s) = op((s), (t)); \
22d71ae5a4SJacob Faibussowitsch   } while (0) /* ops like a function, such as PetscMax, PetscMin */
239371c9d4SSatish Balay #define OP_LXOR(op, s, t) \
24d71ae5a4SJacob Faibussowitsch   do { \
25d71ae5a4SJacob Faibussowitsch     (s) = (!(s)) != (!(t)); \
26d71ae5a4SJacob Faibussowitsch   } while (0) /* logical exclusive OR */
279371c9d4SSatish Balay #define OP_ASSIGN(op, s, t) \
28d71ae5a4SJacob Faibussowitsch   do { \
29d71ae5a4SJacob Faibussowitsch     (s) = (t); \
30d71ae5a4SJacob Faibussowitsch   } while (0)
31cd620004SJunchao Zhang /* Ref MPI MAXLOC */
32cd620004SJunchao Zhang #define OP_XLOC(op, s, t) \
33cd620004SJunchao Zhang   do { \
34cd620004SJunchao Zhang     if ((s).u == (t).u) (s).i = PetscMin((s).i, (t).i); \
35cd620004SJunchao Zhang     else if (!((s).u op(t).u)) s = t; \
36cd620004SJunchao Zhang   } while (0)
3740e23c03SJunchao Zhang 
3840e23c03SJunchao Zhang /* DEF_PackFunc - macro defining a Pack routine
3940e23c03SJunchao Zhang 
4040e23c03SJunchao Zhang    Arguments of the macro:
41b23bfdefSJunchao Zhang    +Type      Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
42fcc7397dSJunchao Zhang    .BS        Block size for vectorization. It is a factor of bsz.
43b23bfdefSJunchao Zhang    -EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
4440e23c03SJunchao Zhang 
4540e23c03SJunchao Zhang    Arguments of the Pack routine:
46cd620004SJunchao Zhang    +count     Number of indices in idx[].
47fcc7397dSJunchao Zhang    .start     When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
48fcc7397dSJunchao Zhang    .opt       Per-pack optimization plan. NULL means no such plan.
49fcc7397dSJunchao Zhang    .idx       Indices of entries to packed.
50eb02082bSJunchao Zhang    .link      Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int.
51cd620004SJunchao Zhang    .unpacked  Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count).
52cd620004SJunchao Zhang    -packed    Address of the packed data.
5340e23c03SJunchao Zhang  */
54b23bfdefSJunchao Zhang #define DEF_PackFunc(Type, BS, EQ) \
55d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(Pack, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, const void *unpacked, void *packed) \
56d71ae5a4SJacob Faibussowitsch   { \
57b23bfdefSJunchao Zhang     const Type    *u = (const Type *)unpacked, *u2; \
58b23bfdefSJunchao Zhang     Type          *p = (Type *)packed, *p2; \
59fcc7397dSJunchao Zhang     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
60fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
61b23bfdefSJunchao Zhang     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
6240e23c03SJunchao Zhang     PetscFunctionBegin; \
639566063dSJacob Faibussowitsch     if (!idx) PetscCall(PetscArraycpy(p, u + start * MBS, MBS * count)); /* idx[] are contiguous */ \
649371c9d4SSatish Balay     else if (opt) { /* has optimizations available */ p2 = p; \
65fcc7397dSJunchao Zhang       for (r = 0; r < opt->n; r++) { \
66fcc7397dSJunchao Zhang         u2 = u + opt->start[r] * MBS; \
67fcc7397dSJunchao Zhang         X  = opt->X[r]; \
68fcc7397dSJunchao Zhang         Y  = opt->Y[r]; \
69fcc7397dSJunchao Zhang         for (k = 0; k < opt->dz[r]; k++) \
70fcc7397dSJunchao Zhang           for (j = 0; j < opt->dy[r]; j++) { \
719566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(p2, u2 + (X * Y * k + X * j) * MBS, opt->dx[r] * MBS)); \
72fcc7397dSJunchao Zhang             p2 += opt->dx[r] * MBS; \
73fcc7397dSJunchao Zhang           } \
74fcc7397dSJunchao Zhang       } \
75fcc7397dSJunchao Zhang     } else { \
76b23bfdefSJunchao Zhang       for (i = 0; i < count; i++) \
77eb02082bSJunchao Zhang         for (j = 0; j < M; j++)    /* Decent compilers should eliminate this loop when M = const 1 */ \
78eb02082bSJunchao Zhang           for (k = 0; k < BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
79b23bfdefSJunchao Zhang             p[i * MBS + j * BS + k] = u[idx[i] * MBS + j * BS + k]; \
8040e23c03SJunchao Zhang     } \
813ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
8240e23c03SJunchao Zhang   }
8340e23c03SJunchao Zhang 
84cd620004SJunchao Zhang /* DEF_Action - macro defining a UnpackAndInsert routine that unpacks data from a contiguous buffer
85cd620004SJunchao Zhang                 and inserts into a sparse array.
8640e23c03SJunchao Zhang 
8740e23c03SJunchao Zhang    Arguments:
88b23bfdefSJunchao Zhang   .Type       Type of the data
8940e23c03SJunchao Zhang   .BS         Block size for vectorization
90b23bfdefSJunchao Zhang   .EQ        (bs == BS) ? 1 : 0. EQ is a compile-time const.
9140e23c03SJunchao Zhang 
9240e23c03SJunchao Zhang   Notes:
9340e23c03SJunchao Zhang    This macro is not combined with DEF_ActionAndOp because we want to use memcpy in this macro.
9440e23c03SJunchao Zhang  */
95cd620004SJunchao Zhang #define DEF_UnpackFunc(Type, BS, EQ) \
96d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(UnpackAndInsert, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
97d71ae5a4SJacob Faibussowitsch   { \
98b23bfdefSJunchao Zhang     Type          *u = (Type *)unpacked, *u2; \
99fcc7397dSJunchao Zhang     const Type    *p = (const Type *)packed; \
100fcc7397dSJunchao Zhang     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
101fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
102b23bfdefSJunchao Zhang     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
10340e23c03SJunchao Zhang     PetscFunctionBegin; \
104b23bfdefSJunchao Zhang     if (!idx) { \
105fcc7397dSJunchao Zhang       u += start * MBS; \
1069566063dSJacob Faibussowitsch       if (u != p) PetscCall(PetscArraycpy(u, p, count *MBS)); \
107fcc7397dSJunchao Zhang     } else if (opt) { /* has optimizations available */ \
108fcc7397dSJunchao Zhang       for (r = 0; r < opt->n; r++) { \
109fcc7397dSJunchao Zhang         u2 = u + opt->start[r] * MBS; \
110fcc7397dSJunchao Zhang         X  = opt->X[r]; \
111fcc7397dSJunchao Zhang         Y  = opt->Y[r]; \
112fcc7397dSJunchao Zhang         for (k = 0; k < opt->dz[r]; k++) \
113fcc7397dSJunchao Zhang           for (j = 0; j < opt->dy[r]; j++) { \
1149566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(u2 + (X * Y * k + X * j) * MBS, p, opt->dx[r] * MBS)); \
115fcc7397dSJunchao Zhang             p += opt->dx[r] * MBS; \
116fcc7397dSJunchao Zhang           } \
117fcc7397dSJunchao Zhang       } \
118fcc7397dSJunchao Zhang     } else { \
119b23bfdefSJunchao Zhang       for (i = 0; i < count; i++) \
120b23bfdefSJunchao Zhang         for (j = 0; j < M; j++) \
121cd620004SJunchao Zhang           for (k = 0; k < BS; k++) u[idx[i] * MBS + j * BS + k] = p[i * MBS + j * BS + k]; \
12240e23c03SJunchao Zhang     } \
1233ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
12440e23c03SJunchao Zhang   }
12540e23c03SJunchao Zhang 
126cd620004SJunchao Zhang /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
12740e23c03SJunchao Zhang 
12840e23c03SJunchao Zhang    Arguments:
129cd620004SJunchao Zhang   +Opname     Name of the Op, such as Add, Mult, LAND, etc.
130b23bfdefSJunchao Zhang   .Type       Type of the data
13140e23c03SJunchao Zhang   .BS         Block size for vectorization
132b23bfdefSJunchao Zhang   .EQ         (bs == BS) ? 1 : 0. EQ is a compile-time const.
133cd620004SJunchao Zhang   .Op         Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
134cd620004SJunchao Zhang   .OpApply    Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
13540e23c03SJunchao Zhang  */
136cd620004SJunchao Zhang #define DEF_UnpackAndOp(Type, BS, EQ, Opname, Op, OpApply) \
137d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, const void *packed) \
138d71ae5a4SJacob Faibussowitsch   { \
139cd620004SJunchao Zhang     Type          *u = (Type *)unpacked, *u2; \
140fcc7397dSJunchao Zhang     const Type    *p = (const Type *)packed; \
141fcc7397dSJunchao Zhang     PetscInt       i, j, k, X, Y, r, bs = link->bs; \
142fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
143b23bfdefSJunchao Zhang     const PetscInt MBS = M * BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
14440e23c03SJunchao Zhang     PetscFunctionBegin; \
145b23bfdefSJunchao Zhang     if (!idx) { \
146fcc7397dSJunchao Zhang       u += start * MBS; \
147cd620004SJunchao Zhang       for (i = 0; i < count; i++) \
148cd620004SJunchao Zhang         for (j = 0; j < M; j++) \
1499371c9d4SSatish Balay           for (k = 0; k < BS; k++) OpApply(Op, u[i * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
150fcc7397dSJunchao Zhang     } else if (opt) { /* idx[] has patterns */ \
151fcc7397dSJunchao Zhang       for (r = 0; r < opt->n; r++) { \
152fcc7397dSJunchao Zhang         u2 = u + opt->start[r] * MBS; \
153fcc7397dSJunchao Zhang         X  = opt->X[r]; \
154fcc7397dSJunchao Zhang         Y  = opt->Y[r]; \
155fcc7397dSJunchao Zhang         for (k = 0; k < opt->dz[r]; k++) \
156fcc7397dSJunchao Zhang           for (j = 0; j < opt->dy[r]; j++) { \
157fcc7397dSJunchao Zhang             for (i = 0; i < opt->dx[r] * MBS; i++) OpApply(Op, u2[(X * Y * k + X * j) * MBS + i], p[i]); \
158fcc7397dSJunchao Zhang             p += opt->dx[r] * MBS; \
159fcc7397dSJunchao Zhang           } \
160fcc7397dSJunchao Zhang       } \
161fcc7397dSJunchao Zhang     } else { \
162cd620004SJunchao Zhang       for (i = 0; i < count; i++) \
163cd620004SJunchao Zhang         for (j = 0; j < M; j++) \
1649371c9d4SSatish Balay           for (k = 0; k < BS; k++) OpApply(Op, u[idx[i] * MBS + j * BS + k], p[i * MBS + j * BS + k]); \
165cd620004SJunchao Zhang     } \
1663ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
167cd620004SJunchao Zhang   }
168cd620004SJunchao Zhang 
169cd620004SJunchao Zhang #define DEF_FetchAndOp(Type, BS, EQ, Opname, Op, OpApply) \
170d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(FetchAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt start, PetscSFPackOpt opt, const PetscInt *idx, void *unpacked, void *packed) \
171d71ae5a4SJacob Faibussowitsch   { \
172fcc7397dSJunchao Zhang     Type          *u = (Type *)unpacked, *p = (Type *)packed, tmp; \
173fcc7397dSJunchao Zhang     PetscInt       i, j, k, r, l, bs = link->bs; \
174fcc7397dSJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; \
175fcc7397dSJunchao Zhang     const PetscInt MBS = M * BS; \
176cd620004SJunchao Zhang     PetscFunctionBegin; \
177fcc7397dSJunchao Zhang     for (i = 0; i < count; i++) { \
178fcc7397dSJunchao Zhang       r = (!idx ? start + i : idx[i]) * MBS; \
179fcc7397dSJunchao Zhang       l = i * MBS; \
180b23bfdefSJunchao Zhang       for (j = 0; j < M; j++) \
181b23bfdefSJunchao Zhang         for (k = 0; k < BS; k++) { \
182fcc7397dSJunchao Zhang           tmp = u[r + j * BS + k]; \
183fcc7397dSJunchao Zhang           OpApply(Op, u[r + j * BS + k], p[l + j * BS + k]); \
184fcc7397dSJunchao Zhang           p[l + j * BS + k] = tmp; \
185cd620004SJunchao Zhang         } \
186cd620004SJunchao Zhang     } \
1873ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
188cd620004SJunchao Zhang   }
189cd620004SJunchao Zhang 
190cd620004SJunchao Zhang #define DEF_ScatterAndOp(Type, BS, EQ, Opname, Op, OpApply) \
191d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(ScatterAnd##Opname, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt srcStart, PetscSFPackOpt srcOpt, const PetscInt *srcIdx, const void *src, PetscInt dstStart, PetscSFPackOpt dstOpt, const PetscInt *dstIdx, void *dst) \
192d71ae5a4SJacob Faibussowitsch   { \
193fcc7397dSJunchao Zhang     const Type    *u = (const Type *)src; \
194fcc7397dSJunchao Zhang     Type          *v = (Type *)dst; \
195fcc7397dSJunchao Zhang     PetscInt       i, j, k, s, t, X, Y, bs = link->bs; \
196cd620004SJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; \
197cd620004SJunchao Zhang     const PetscInt MBS = M * BS; \
198cd620004SJunchao Zhang     PetscFunctionBegin; \
199fcc7397dSJunchao Zhang     if (!srcIdx) { /* src is contiguous */ \
200fcc7397dSJunchao Zhang       u += srcStart * MBS; \
2019566063dSJacob Faibussowitsch       PetscCall(CPPJoin4(UnpackAnd##Opname, Type, BS, EQ)(link, count, dstStart, dstOpt, dstIdx, dst, u)); \
202fcc7397dSJunchao Zhang     } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
203fcc7397dSJunchao Zhang       u += srcOpt->start[0] * MBS; \
204fcc7397dSJunchao Zhang       v += dstStart * MBS; \
2059371c9d4SSatish Balay       X = srcOpt->X[0]; \
2069371c9d4SSatish Balay       Y = srcOpt->Y[0]; \
207fcc7397dSJunchao Zhang       for (k = 0; k < srcOpt->dz[0]; k++) \
208fcc7397dSJunchao Zhang         for (j = 0; j < srcOpt->dy[0]; j++) { \
209fcc7397dSJunchao Zhang           for (i = 0; i < srcOpt->dx[0] * MBS; i++) OpApply(Op, v[i], u[(X * Y * k + X * j) * MBS + i]); \
210fcc7397dSJunchao Zhang           v += srcOpt->dx[0] * MBS; \
211fcc7397dSJunchao Zhang         } \
212fcc7397dSJunchao Zhang     } else { /* all other cases */ \
213fcc7397dSJunchao Zhang       for (i = 0; i < count; i++) { \
214fcc7397dSJunchao Zhang         s = (!srcIdx ? srcStart + i : srcIdx[i]) * MBS; \
215fcc7397dSJunchao Zhang         t = (!dstIdx ? dstStart + i : dstIdx[i]) * MBS; \
216cd620004SJunchao Zhang         for (j = 0; j < M; j++) \
217fcc7397dSJunchao Zhang           for (k = 0; k < BS; k++) OpApply(Op, v[t + j * BS + k], u[s + j * BS + k]); \
218fcc7397dSJunchao Zhang       } \
219cd620004SJunchao Zhang     } \
2203ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
221cd620004SJunchao Zhang   }
222cd620004SJunchao Zhang 
223cd620004SJunchao Zhang #define DEF_FetchAndOpLocal(Type, BS, EQ, Opname, Op, OpApply) \
224d71ae5a4SJacob Faibussowitsch   static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local, Type, BS, EQ)(PetscSFLink link, PetscInt count, PetscInt rootstart, PetscSFPackOpt rootopt, const PetscInt *rootidx, void *rootdata, PetscInt leafstart, PetscSFPackOpt leafopt, const PetscInt *leafidx, const void *leafdata, void *leafupdate) \
225d71ae5a4SJacob Faibussowitsch   { \
226fcc7397dSJunchao Zhang     Type          *rdata = (Type *)rootdata, *lupdate = (Type *)leafupdate; \
227fcc7397dSJunchao Zhang     const Type    *ldata = (const Type *)leafdata; \
228fcc7397dSJunchao Zhang     PetscInt       i, j, k, r, l, bs = link->bs; \
229cd620004SJunchao Zhang     const PetscInt M   = (EQ) ? 1 : bs / BS; \
230cd620004SJunchao Zhang     const PetscInt MBS = M * BS; \
231cd620004SJunchao Zhang     PetscFunctionBegin; \
232fcc7397dSJunchao Zhang     for (i = 0; i < count; i++) { \
233fcc7397dSJunchao Zhang       r = (rootidx ? rootidx[i] : rootstart + i) * MBS; \
234fcc7397dSJunchao Zhang       l = (leafidx ? leafidx[i] : leafstart + i) * MBS; \
235cd620004SJunchao Zhang       for (j = 0; j < M; j++) \
236cd620004SJunchao Zhang         for (k = 0; k < BS; k++) { \
237fcc7397dSJunchao Zhang           lupdate[l + j * BS + k] = rdata[r + j * BS + k]; \
238fcc7397dSJunchao Zhang           OpApply(Op, rdata[r + j * BS + k], ldata[l + j * BS + k]); \
23940e23c03SJunchao Zhang         } \
24040e23c03SJunchao Zhang     } \
2413ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS); \
24240e23c03SJunchao Zhang   }
24340e23c03SJunchao Zhang 
244b23bfdefSJunchao Zhang /* Pack, Unpack/Fetch ops */
245b23bfdefSJunchao Zhang #define DEF_Pack(Type, BS, EQ) \
246d71ae5a4SJacob Faibussowitsch   DEF_PackFunc(Type, BS, EQ) DEF_UnpackFunc(Type, BS, EQ) DEF_ScatterAndOp(Type, BS, EQ, Insert, =, OP_ASSIGN) static void CPPJoin4(PackInit_Pack, Type, BS, EQ)(PetscSFLink link) \
247d71ae5a4SJacob Faibussowitsch   { \
248eb02082bSJunchao Zhang     link->h_Pack             = CPPJoin4(Pack, Type, BS, EQ); \
249eb02082bSJunchao Zhang     link->h_UnpackAndInsert  = CPPJoin4(UnpackAndInsert, Type, BS, EQ); \
250cd620004SJunchao Zhang     link->h_ScatterAndInsert = CPPJoin4(ScatterAndInsert, Type, BS, EQ); \
25140e23c03SJunchao Zhang   }
25240e23c03SJunchao Zhang 
253b23bfdefSJunchao Zhang /* Add, Mult ops */
254b23bfdefSJunchao Zhang #define DEF_Add(Type, BS, EQ) \
255d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Add, +, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, Mult, *, OP_BINARY) DEF_FetchAndOpLocal(Type, BS, EQ, Add, +, OP_BINARY) static void CPPJoin4(PackInit_Add, Type, BS, EQ)(PetscSFLink link) \
256d71ae5a4SJacob Faibussowitsch   { \
257eb02082bSJunchao Zhang     link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd, Type, BS, EQ); \
258eb02082bSJunchao Zhang     link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult, Type, BS, EQ); \
259eb02082bSJunchao Zhang     link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd, Type, BS, EQ); \
260cd620004SJunchao Zhang     link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd, Type, BS, EQ); \
261cd620004SJunchao Zhang     link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult, Type, BS, EQ); \
262cd620004SJunchao Zhang     link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal, Type, BS, EQ); \
26340e23c03SJunchao Zhang   }
26440e23c03SJunchao Zhang 
265b23bfdefSJunchao Zhang /* Max, Min ops */
266b23bfdefSJunchao Zhang #define DEF_Cmp(Type, BS, EQ) \
267d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_UnpackAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Max, PetscMax, OP_FUNCTION) DEF_ScatterAndOp(Type, BS, EQ, Min, PetscMin, OP_FUNCTION) static void CPPJoin4(PackInit_Compare, Type, BS, EQ)(PetscSFLink link) \
268d71ae5a4SJacob Faibussowitsch   { \
269eb02082bSJunchao Zhang     link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
270eb02082bSJunchao Zhang     link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
271cd620004SJunchao Zhang     link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
272cd620004SJunchao Zhang     link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
273b23bfdefSJunchao Zhang   }
274b23bfdefSJunchao Zhang 
275b23bfdefSJunchao Zhang /* Logical ops.
276cd620004SJunchao Zhang   The operator in OP_LXOR should be empty but is ||. It is not used. Put here to avoid
27740e23c03SJunchao Zhang   the compilation warning "empty macro arguments are undefined in ISO C90"
27840e23c03SJunchao Zhang  */
279b23bfdefSJunchao Zhang #define DEF_Log(Type, BS, EQ) \
280d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) DEF_ScatterAndOp(Type, BS, EQ, LAND, &&, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LOR, ||, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, LXOR, ||, OP_LXOR) static void CPPJoin4(PackInit_Logical, Type, BS, EQ)(PetscSFLink link) \
281d71ae5a4SJacob Faibussowitsch   { \
282eb02082bSJunchao Zhang     link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type, BS, EQ); \
283eb02082bSJunchao Zhang     link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR, Type, BS, EQ); \
284eb02082bSJunchao Zhang     link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type, BS, EQ); \
285cd620004SJunchao Zhang     link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND, Type, BS, EQ); \
286cd620004SJunchao Zhang     link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type, BS, EQ); \
287cd620004SJunchao Zhang     link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR, Type, BS, EQ); \
28840e23c03SJunchao Zhang   }
28940e23c03SJunchao Zhang 
290b23bfdefSJunchao Zhang /* Bitwise ops */
291b23bfdefSJunchao Zhang #define DEF_Bit(Type, BS, EQ) \
292d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_UnpackAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BAND, &, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BOR, |, OP_BINARY) DEF_ScatterAndOp(Type, BS, EQ, BXOR, ^, OP_BINARY) static void CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(PetscSFLink link) \
293d71ae5a4SJacob Faibussowitsch   { \
294eb02082bSJunchao Zhang     link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type, BS, EQ); \
295eb02082bSJunchao Zhang     link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR, Type, BS, EQ); \
296eb02082bSJunchao Zhang     link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type, BS, EQ); \
297cd620004SJunchao Zhang     link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND, Type, BS, EQ); \
298cd620004SJunchao Zhang     link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type, BS, EQ); \
299cd620004SJunchao Zhang     link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR, Type, BS, EQ); \
30040e23c03SJunchao Zhang   }
30140e23c03SJunchao Zhang 
302cd620004SJunchao Zhang /* Maxloc, Minloc ops */
303cd620004SJunchao Zhang #define DEF_Xloc(Type, BS, EQ) \
304d71ae5a4SJacob Faibussowitsch   DEF_UnpackAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_UnpackAndOp(Type, BS, EQ, Min, <, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Max, >, OP_XLOC) DEF_ScatterAndOp(Type, BS, EQ, Min, <, OP_XLOC) static void CPPJoin4(PackInit_Xloc, Type, BS, EQ)(PetscSFLink link) \
305d71ae5a4SJacob Faibussowitsch   { \
306cd620004SJunchao Zhang     link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type, BS, EQ); \
307cd620004SJunchao Zhang     link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type, BS, EQ); \
308cd620004SJunchao Zhang     link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax, Type, BS, EQ); \
309cd620004SJunchao Zhang     link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin, Type, BS, EQ); \
31040e23c03SJunchao Zhang   }
31140e23c03SJunchao Zhang 
312b23bfdefSJunchao Zhang #define DEF_IntegerType(Type, BS, EQ) \
313d71ae5a4SJacob Faibussowitsch   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) DEF_Log(Type, BS, EQ) DEF_Bit(Type, BS, EQ) static void CPPJoin4(PackInit_IntegerType, Type, BS, EQ)(PetscSFLink link) \
314d71ae5a4SJacob Faibussowitsch   { \
315b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
316b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
317b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
318b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Logical, Type, BS, EQ)(link); \
319b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Bitwise, Type, BS, EQ)(link); \
32040e23c03SJunchao Zhang   }
32140e23c03SJunchao Zhang 
322b23bfdefSJunchao Zhang #define DEF_RealType(Type, BS, EQ) \
323d71ae5a4SJacob Faibussowitsch   DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) DEF_Cmp(Type, BS, EQ) static void CPPJoin4(PackInit_RealType, Type, BS, EQ)(PetscSFLink link) \
324d71ae5a4SJacob Faibussowitsch   { \
325b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
326b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
327b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Compare, Type, BS, EQ)(link); \
328b23bfdefSJunchao Zhang   }
32940e23c03SJunchao Zhang 
33040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
331b23bfdefSJunchao Zhang   #define DEF_ComplexType(Type, BS, EQ) \
332d71ae5a4SJacob Faibussowitsch     DEF_Pack(Type, BS, EQ) DEF_Add(Type, BS, EQ) static void CPPJoin4(PackInit_ComplexType, Type, BS, EQ)(PetscSFLink link) \
333d71ae5a4SJacob Faibussowitsch     { \
334b23bfdefSJunchao Zhang       CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
335b23bfdefSJunchao Zhang       CPPJoin4(PackInit_Add, Type, BS, EQ)(link); \
336b23bfdefSJunchao Zhang     }
33740e23c03SJunchao Zhang #endif
338b23bfdefSJunchao Zhang 
339b23bfdefSJunchao Zhang #define DEF_DumbType(Type, BS, EQ) \
340d71ae5a4SJacob Faibussowitsch   DEF_Pack(Type, BS, EQ) static void CPPJoin4(PackInit_DumbType, Type, BS, EQ)(PetscSFLink link) \
341d71ae5a4SJacob Faibussowitsch   { \
342b23bfdefSJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
343b23bfdefSJunchao Zhang   }
344b23bfdefSJunchao Zhang 
345b23bfdefSJunchao Zhang /* Maxloc, Minloc */
346cd620004SJunchao Zhang #define DEF_PairType(Type, BS, EQ) \
347d71ae5a4SJacob Faibussowitsch   DEF_Pack(Type, BS, EQ) DEF_Xloc(Type, BS, EQ) static void CPPJoin4(PackInit_PairType, Type, BS, EQ)(PetscSFLink link) \
348d71ae5a4SJacob Faibussowitsch   { \
349cd620004SJunchao Zhang     CPPJoin4(PackInit_Pack, Type, BS, EQ)(link); \
350cd620004SJunchao Zhang     CPPJoin4(PackInit_Xloc, Type, BS, EQ)(link); \
351b23bfdefSJunchao Zhang   }
352b23bfdefSJunchao Zhang 
353b23bfdefSJunchao Zhang DEF_IntegerType(PetscInt, 1, 1)   /* unit = 1 MPIU_INT  */
354b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 2, 1) /* unit = 2 MPIU_INTs */
355b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 4, 1) /* unit = 4 MPIU_INTs */
356b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 8, 1) /* unit = 8 MPIU_INTs */
357b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 1, 0) /* unit = 1*n MPIU_INTs, n>1 */
358b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 2, 0) /* unit = 2*n MPIU_INTs, n>1 */
359b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 4, 0) /* unit = 4*n MPIU_INTs, n>1 */
360b23bfdefSJunchao Zhang   DEF_IntegerType(PetscInt, 8, 0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
361b23bfdefSJunchao Zhang 
362b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
3639371c9d4SSatish Balay   DEF_IntegerType(int, 1, 1) DEF_IntegerType(int, 2, 1) DEF_IntegerType(int, 4, 1) DEF_IntegerType(int, 8, 1) DEF_IntegerType(int, 1, 0) DEF_IntegerType(int, 2, 0) DEF_IntegerType(int, 4, 0) DEF_IntegerType(int, 8, 0)
364b23bfdefSJunchao Zhang #endif
365b23bfdefSJunchao Zhang 
366b23bfdefSJunchao Zhang   /* The typedefs are used to get a typename without space that CPPJoin can handle */
367b23bfdefSJunchao Zhang   typedef signed char SignedChar;
3689371c9d4SSatish Balay DEF_IntegerType(SignedChar, 1, 1) DEF_IntegerType(SignedChar, 2, 1) DEF_IntegerType(SignedChar, 4, 1) DEF_IntegerType(SignedChar, 8, 1) DEF_IntegerType(SignedChar, 1, 0) DEF_IntegerType(SignedChar, 2, 0) DEF_IntegerType(SignedChar, 4, 0) DEF_IntegerType(SignedChar, 8, 0)
369b23bfdefSJunchao Zhang 
370b23bfdefSJunchao Zhang   typedef unsigned char UnsignedChar;
3719371c9d4SSatish Balay DEF_IntegerType(UnsignedChar, 1, 1) DEF_IntegerType(UnsignedChar, 2, 1) DEF_IntegerType(UnsignedChar, 4, 1) DEF_IntegerType(UnsignedChar, 8, 1) DEF_IntegerType(UnsignedChar, 1, 0) DEF_IntegerType(UnsignedChar, 2, 0) DEF_IntegerType(UnsignedChar, 4, 0) DEF_IntegerType(UnsignedChar, 8, 0)
372b23bfdefSJunchao Zhang 
3739371c9d4SSatish Balay   DEF_RealType(PetscReal, 1, 1) DEF_RealType(PetscReal, 2, 1) DEF_RealType(PetscReal, 4, 1) DEF_RealType(PetscReal, 8, 1) DEF_RealType(PetscReal, 1, 0) DEF_RealType(PetscReal, 2, 0) DEF_RealType(PetscReal, 4, 0) DEF_RealType(PetscReal, 8, 0)
374b23bfdefSJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
3759371c9d4SSatish Balay     DEF_ComplexType(PetscComplex, 1, 1) DEF_ComplexType(PetscComplex, 2, 1) DEF_ComplexType(PetscComplex, 4, 1) DEF_ComplexType(PetscComplex, 8, 1) DEF_ComplexType(PetscComplex, 1, 0) DEF_ComplexType(PetscComplex, 2, 0) DEF_ComplexType(PetscComplex, 4, 0) DEF_ComplexType(PetscComplex, 8, 0)
376b23bfdefSJunchao Zhang #endif
377b23bfdefSJunchao Zhang 
378cd620004SJunchao Zhang #define PairType(Type1, Type2) Type1##_##Type2
3799371c9d4SSatish Balay       typedef struct {
3809371c9d4SSatish Balay   int u;
3819371c9d4SSatish Balay   int i;
3829371c9d4SSatish Balay } PairType(int, int);
3839371c9d4SSatish Balay typedef struct {
3849371c9d4SSatish Balay   PetscInt u;
3859371c9d4SSatish Balay   PetscInt i;
3869371c9d4SSatish Balay } PairType(PetscInt, PetscInt);
3879371c9d4SSatish Balay DEF_PairType(PairType(int, int), 1, 1) DEF_PairType(PairType(PetscInt, PetscInt), 1, 1)
388b23bfdefSJunchao Zhang 
389b23bfdefSJunchao Zhang   /* If we don't know the basic type, we treat it as a stream of chars or ints */
3909371c9d4SSatish Balay   DEF_DumbType(char, 1, 1) DEF_DumbType(char, 2, 1) DEF_DumbType(char, 4, 1) DEF_DumbType(char, 1, 0) DEF_DumbType(char, 2, 0) DEF_DumbType(char, 4, 0)
391b23bfdefSJunchao Zhang 
392eb02082bSJunchao Zhang     typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
3939371c9d4SSatish Balay DEF_DumbType(DumbInt, 1, 1) DEF_DumbType(DumbInt, 2, 1) DEF_DumbType(DumbInt, 4, 1) DEF_DumbType(DumbInt, 8, 1) DEF_DumbType(DumbInt, 1, 0) DEF_DumbType(DumbInt, 2, 0) DEF_DumbType(DumbInt, 4, 0) DEF_DumbType(DumbInt, 8, 0)
39440e23c03SJunchao Zhang 
395d71ae5a4SJacob Faibussowitsch   PetscErrorCode PetscSFLinkDestroy(PetscSF sf, PetscSFLink link)
396d71ae5a4SJacob Faibussowitsch {
397cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
39871438e86SJunchao Zhang   PetscInt       i, nreqs = (bas->nrootreqs + sf->nleafreqs) * 8;
39971438e86SJunchao Zhang 
40071438e86SJunchao Zhang   PetscFunctionBegin;
40171438e86SJunchao Zhang   /* Destroy device-specific fields */
4029566063dSJacob Faibussowitsch   if (link->deviceinited) PetscCall((*link->Destroy)(sf, link));
40371438e86SJunchao Zhang 
40471438e86SJunchao Zhang   /* Destroy host related fields */
4059566063dSJacob Faibussowitsch   if (!link->isbuiltin) PetscCallMPI(MPI_Type_free(&link->unit));
40671438e86SJunchao Zhang   if (!link->use_nvshmem) {
40771438e86SJunchao Zhang     for (i = 0; i < nreqs; i++) { /* Persistent reqs must be freed. */
4089566063dSJacob Faibussowitsch       if (link->reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&link->reqs[i]));
40971438e86SJunchao Zhang     }
4109566063dSJacob Faibussowitsch     PetscCall(PetscFree(link->reqs));
41171438e86SJunchao Zhang     for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
4129566063dSJacob Faibussowitsch       PetscCall(PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]));
4139566063dSJacob Faibussowitsch       PetscCall(PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]));
41471438e86SJunchao Zhang     }
41571438e86SJunchao Zhang   }
4169566063dSJacob Faibussowitsch   PetscCall(PetscFree(link));
4173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41871438e86SJunchao Zhang }
41971438e86SJunchao Zhang 
420d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkCreate(PetscSF sf, MPI_Datatype unit, PetscMemType rootmtype, const void *rootdata, PetscMemType leafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
421d71ae5a4SJacob Faibussowitsch {
422cd620004SJunchao Zhang   PetscFunctionBegin;
4239566063dSJacob Faibussowitsch   PetscCall(PetscSFSetErrorOnUnsupportedOverlap(sf, unit, rootdata, leafdata));
42471438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM)
42571438e86SJunchao Zhang   {
42671438e86SJunchao Zhang     PetscBool use_nvshmem;
4279566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkNvshmemCheck(sf, rootmtype, rootdata, leafmtype, leafdata, &use_nvshmem));
42871438e86SJunchao Zhang     if (use_nvshmem) {
4299566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkCreate_NVSHMEM(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
4303ba16761SJacob Faibussowitsch       PetscFunctionReturn(PETSC_SUCCESS);
431cd620004SJunchao Zhang     }
432cd620004SJunchao Zhang   }
4337fd2d3dbSJunchao Zhang #endif
4349566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkCreate_MPI(sf, unit, rootmtype, rootdata, leafmtype, leafdata, op, sfop, mylink));
4353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
436cd620004SJunchao Zhang }
437cd620004SJunchao Zhang 
438cd620004SJunchao Zhang /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
439cd620004SJunchao Zhang    If the sf uses persistent requests and the requests have not been initialized, then initialize them.
440cd620004SJunchao Zhang */
441d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
442d71ae5a4SJacob Faibussowitsch {
443cd620004SJunchao Zhang   PetscSF_Basic     *bas = (PetscSF_Basic *)sf->data;
444c87b50c4SJunchao Zhang   PetscInt           i, j, cnt, nrootranks, ndrootranks, nleafranks, ndleafranks;
445cd620004SJunchao Zhang   const PetscInt    *rootoffset, *leafoffset;
446cd620004SJunchao Zhang   MPI_Aint           disp;
447cd620004SJunchao Zhang   MPI_Comm           comm          = PetscObjectComm((PetscObject)sf);
448cd620004SJunchao Zhang   MPI_Datatype       unit          = link->unit;
449cd620004SJunchao Zhang   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
450cd620004SJunchao Zhang   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
451cd620004SJunchao Zhang 
452cd620004SJunchao Zhang   PetscFunctionBegin;
453cd620004SJunchao Zhang   /* Init persistent MPI requests if not yet. Currently only SFBasic uses persistent MPI */
454cd620004SJunchao Zhang   if (sf->persistent) {
455cd620004SJunchao Zhang     if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
4569566063dSJacob Faibussowitsch       PetscCall(PetscSFGetRootInfo_Basic(sf, &nrootranks, &ndrootranks, NULL, &rootoffset, NULL));
457cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
458cd620004SJunchao Zhang         for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
459cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
460c87b50c4SJunchao Zhang           cnt  = rootoffset[i + 1] - rootoffset[i];
4619566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
462cd620004SJunchao Zhang         }
463cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
464cd620004SJunchao Zhang         for (i = ndrootranks, j = 0; i < nrootranks; i++, j++) {
465cd620004SJunchao Zhang           disp = (rootoffset[i] - rootoffset[ndrootranks]) * link->unitbytes;
466c87b50c4SJunchao Zhang           cnt  = rootoffset[i + 1] - rootoffset[i];
4679566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi] + disp, cnt, unit, bas->iranks[i], link->tag, comm, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi] + j));
468cd620004SJunchao Zhang         }
469cd620004SJunchao Zhang       }
470cd620004SJunchao Zhang       link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
471cd620004SJunchao Zhang     }
472cd620004SJunchao Zhang 
473cd620004SJunchao Zhang     if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
4749566063dSJacob Faibussowitsch       PetscCall(PetscSFGetLeafInfo_Basic(sf, &nleafranks, &ndleafranks, NULL, &leafoffset, NULL, NULL));
475cd620004SJunchao Zhang       if (direction == PETSCSF_LEAF2ROOT) {
476cd620004SJunchao Zhang         for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
477cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
478c87b50c4SJunchao Zhang           cnt  = leafoffset[i + 1] - leafoffset[i];
4799566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
480cd620004SJunchao Zhang         }
481cd620004SJunchao Zhang       } else { /* PETSCSF_ROOT2LEAF */
482cd620004SJunchao Zhang         for (i = ndleafranks, j = 0; i < nleafranks; i++, j++) {
483cd620004SJunchao Zhang           disp = (leafoffset[i] - leafoffset[ndleafranks]) * link->unitbytes;
484c87b50c4SJunchao Zhang           cnt  = leafoffset[i + 1] - leafoffset[i];
4859566063dSJacob Faibussowitsch           PetscCallMPI(MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi] + disp, cnt, unit, sf->ranks[i], link->tag, comm, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi] + j));
486cd620004SJunchao Zhang         }
487cd620004SJunchao Zhang       }
488cd620004SJunchao Zhang       link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
489cd620004SJunchao Zhang     }
490cd620004SJunchao Zhang   }
491cd620004SJunchao Zhang   if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
492cd620004SJunchao Zhang   if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
493cd620004SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
494cd620004SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
4953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
496cd620004SJunchao Zhang }
497cd620004SJunchao Zhang 
498d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetInUse(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata, PetscCopyMode cmode, PetscSFLink *mylink)
499d71ae5a4SJacob Faibussowitsch {
500cd620004SJunchao Zhang   PetscSFLink    link, *p;
50140e23c03SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
50240e23c03SJunchao Zhang 
50340e23c03SJunchao Zhang   PetscFunctionBegin;
50440e23c03SJunchao Zhang   /* Look for types in cache */
50540e23c03SJunchao Zhang   for (p = &bas->inuse; (link = *p); p = &link->next) {
50640e23c03SJunchao Zhang     PetscBool match;
5079566063dSJacob Faibussowitsch     PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
508637e6665SJunchao Zhang     if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
50940e23c03SJunchao Zhang       switch (cmode) {
510d71ae5a4SJacob Faibussowitsch       case PETSC_OWN_POINTER:
511d71ae5a4SJacob Faibussowitsch         *p = link->next;
512d71ae5a4SJacob Faibussowitsch         break; /* Remove from inuse list */
513d71ae5a4SJacob Faibussowitsch       case PETSC_USE_POINTER:
514d71ae5a4SJacob Faibussowitsch         break;
515d71ae5a4SJacob Faibussowitsch       default:
516d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "invalid cmode");
51740e23c03SJunchao Zhang       }
51840e23c03SJunchao Zhang       *mylink = link;
5193ba16761SJacob Faibussowitsch       PetscFunctionReturn(PETSC_SUCCESS);
52040e23c03SJunchao Zhang     }
52140e23c03SJunchao Zhang   }
52240e23c03SJunchao Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Could not find pack");
52340e23c03SJunchao Zhang }
52440e23c03SJunchao Zhang 
525d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkReclaim(PetscSF sf, PetscSFLink *mylink)
526d71ae5a4SJacob Faibussowitsch {
52740e23c03SJunchao Zhang   PetscSF_Basic *bas  = (PetscSF_Basic *)sf->data;
52871438e86SJunchao Zhang   PetscSFLink    link = *mylink;
52940e23c03SJunchao Zhang 
53040e23c03SJunchao Zhang   PetscFunctionBegin;
53171438e86SJunchao Zhang   link->rootdata = NULL;
53271438e86SJunchao Zhang   link->leafdata = NULL;
53371438e86SJunchao Zhang   link->next     = bas->avail;
53471438e86SJunchao Zhang   bas->avail     = link;
53571438e86SJunchao Zhang   *mylink        = NULL;
5363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
537eb02082bSJunchao Zhang }
538eb02082bSJunchao Zhang 
5399d1c8addSJunchao Zhang /* Error out on unsupported overlapped communications */
540d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf, MPI_Datatype unit, const void *rootdata, const void *leafdata)
541d71ae5a4SJacob Faibussowitsch {
542cd620004SJunchao Zhang   PetscSFLink    link, *p;
5439d1c8addSJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
5449d1c8addSJunchao Zhang   PetscBool      match;
5459d1c8addSJunchao Zhang 
5469d1c8addSJunchao Zhang   PetscFunctionBegin;
547b458e8f1SJose E. Roman   if (PetscDefined(USE_DEBUG)) {
54818fb5014SJunchao Zhang     /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
54918fb5014SJunchao Zhang        the potential overlapping since this process does not participate in communication. Overlapping is harmless.
55018fb5014SJunchao Zhang     */
551637e6665SJunchao Zhang     if (rootdata || leafdata) {
5529d1c8addSJunchao Zhang       for (p = &bas->inuse; (link = *p); p = &link->next) {
5539566063dSJacob Faibussowitsch         PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
554c9cc58a2SBarry Smith         PetscCheck(!match || rootdata != link->rootdata || leafdata != link->leafdata, PETSC_COMM_SELF, PETSC_ERR_SUP, "Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.", rootdata, leafdata);
5559d1c8addSJunchao Zhang       }
55618fb5014SJunchao Zhang     }
557b458e8f1SJose E. Roman   }
5583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5599d1c8addSJunchao Zhang }
5609d1c8addSJunchao Zhang 
561d71ae5a4SJacob Faibussowitsch static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link, PetscMemType dstmtype, void *dst, PetscMemType srcmtype, const void *src, size_t n)
562d71ae5a4SJacob Faibussowitsch {
56320c24465SJunchao Zhang   PetscFunctionBegin;
5649566063dSJacob Faibussowitsch   if (n) PetscCall(PetscMemcpy(dst, src, n));
5653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
56620c24465SJunchao Zhang }
56720c24465SJunchao Zhang 
568d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf, PetscSFLink link, MPI_Datatype unit)
569d71ae5a4SJacob Faibussowitsch {
570b23bfdefSJunchao Zhang   PetscInt    nSignedChar = 0, nUnsignedChar = 0, nInt = 0, nPetscInt = 0, nPetscReal = 0;
571b23bfdefSJunchao Zhang   PetscBool   is2Int, is2PetscInt;
57240e23c03SJunchao Zhang   PetscMPIInt ni, na, nd, combiner;
57340e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
574b23bfdefSJunchao Zhang   PetscInt nPetscComplex = 0;
57540e23c03SJunchao Zhang #endif
57640e23c03SJunchao Zhang 
57740e23c03SJunchao Zhang   PetscFunctionBegin;
5789566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_SIGNED_CHAR, &nSignedChar));
5799566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_UNSIGNED_CHAR, &nUnsignedChar));
580b23bfdefSJunchao Zhang   /* MPI_CHAR is treated below as a dumb type that does not support reduction according to MPI standard */
5819566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPI_INT, &nInt));
5829566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_INT, &nPetscInt));
5839566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_REAL, &nPetscReal));
58440e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
5859566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare_contig(unit, MPIU_COMPLEX, &nPetscComplex));
58640e23c03SJunchao Zhang #endif
5879566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPI_2INT, &is2Int));
5889566063dSJacob Faibussowitsch   PetscCall(MPIPetsc_Type_compare(unit, MPIU_2INT, &is2PetscInt));
589b23bfdefSJunchao Zhang   /* TODO: shaell we also handle Fortran MPI_2REAL? */
5909566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Type_get_envelope(unit, &ni, &na, &nd, &combiner));
5915ad15460SJunchao Zhang   link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
592b23bfdefSJunchao Zhang   link->bs        = 1;                                                           /* default */
59340e23c03SJunchao Zhang 
594eb02082bSJunchao Zhang   if (is2Int) {
595cd620004SJunchao Zhang     PackInit_PairType_int_int_1_1(link);
596eb02082bSJunchao Zhang     link->bs        = 1;
597eb02082bSJunchao Zhang     link->unitbytes = 2 * sizeof(int);
5985ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
599eb02082bSJunchao Zhang     link->basicunit = MPI_2INT;
6005ad15460SJunchao Zhang     link->unit      = MPI_2INT;
601eb02082bSJunchao Zhang   } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
602cd620004SJunchao Zhang     PackInit_PairType_PetscInt_PetscInt_1_1(link);
603eb02082bSJunchao Zhang     link->bs        = 1;
604eb02082bSJunchao Zhang     link->unitbytes = 2 * sizeof(PetscInt);
605eb02082bSJunchao Zhang     link->basicunit = MPIU_2INT;
6065ad15460SJunchao Zhang     link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
6075ad15460SJunchao Zhang     link->unit      = MPIU_2INT;
608eb02082bSJunchao Zhang   } else if (nPetscReal) {
6099371c9d4SSatish Balay     if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link);
6109371c9d4SSatish Balay     else if (nPetscReal % 8 == 0) PackInit_RealType_PetscReal_8_0(link);
6119371c9d4SSatish Balay     else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link);
6129371c9d4SSatish Balay     else if (nPetscReal % 4 == 0) PackInit_RealType_PetscReal_4_0(link);
6139371c9d4SSatish Balay     else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link);
6149371c9d4SSatish Balay     else if (nPetscReal % 2 == 0) PackInit_RealType_PetscReal_2_0(link);
6159371c9d4SSatish Balay     else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link);
6169371c9d4SSatish Balay     else if (nPetscReal % 1 == 0) PackInit_RealType_PetscReal_1_0(link);
617b23bfdefSJunchao Zhang     link->bs        = nPetscReal;
618eb02082bSJunchao Zhang     link->unitbytes = nPetscReal * sizeof(PetscReal);
61940e23c03SJunchao Zhang     link->basicunit = MPIU_REAL;
6209371c9d4SSatish Balay     if (link->bs == 1) {
6219371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
6229371c9d4SSatish Balay       link->unit      = MPIU_REAL;
6239371c9d4SSatish Balay     }
624b23bfdefSJunchao Zhang   } else if (nPetscInt) {
6259371c9d4SSatish Balay     if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link);
6269371c9d4SSatish Balay     else if (nPetscInt % 8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
6279371c9d4SSatish Balay     else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link);
6289371c9d4SSatish Balay     else if (nPetscInt % 4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
6299371c9d4SSatish Balay     else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link);
6309371c9d4SSatish Balay     else if (nPetscInt % 2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
6319371c9d4SSatish Balay     else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link);
6329371c9d4SSatish Balay     else if (nPetscInt % 1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
633b23bfdefSJunchao Zhang     link->bs        = nPetscInt;
634eb02082bSJunchao Zhang     link->unitbytes = nPetscInt * sizeof(PetscInt);
635b23bfdefSJunchao Zhang     link->basicunit = MPIU_INT;
6369371c9d4SSatish Balay     if (link->bs == 1) {
6379371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
6389371c9d4SSatish Balay       link->unit      = MPIU_INT;
6399371c9d4SSatish Balay     }
640b23bfdefSJunchao Zhang #if defined(PETSC_USE_64BIT_INDICES)
641b23bfdefSJunchao Zhang   } else if (nInt) {
6429371c9d4SSatish Balay     if (nInt == 8) PackInit_IntegerType_int_8_1(link);
6439371c9d4SSatish Balay     else if (nInt % 8 == 0) PackInit_IntegerType_int_8_0(link);
6449371c9d4SSatish Balay     else if (nInt == 4) PackInit_IntegerType_int_4_1(link);
6459371c9d4SSatish Balay     else if (nInt % 4 == 0) PackInit_IntegerType_int_4_0(link);
6469371c9d4SSatish Balay     else if (nInt == 2) PackInit_IntegerType_int_2_1(link);
6479371c9d4SSatish Balay     else if (nInt % 2 == 0) PackInit_IntegerType_int_2_0(link);
6489371c9d4SSatish Balay     else if (nInt == 1) PackInit_IntegerType_int_1_1(link);
6499371c9d4SSatish Balay     else if (nInt % 1 == 0) PackInit_IntegerType_int_1_0(link);
650b23bfdefSJunchao Zhang     link->bs        = nInt;
651eb02082bSJunchao Zhang     link->unitbytes = nInt * sizeof(int);
652b23bfdefSJunchao Zhang     link->basicunit = MPI_INT;
6539371c9d4SSatish Balay     if (link->bs == 1) {
6549371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
6559371c9d4SSatish Balay       link->unit      = MPI_INT;
6569371c9d4SSatish Balay     }
657b23bfdefSJunchao Zhang #endif
658b23bfdefSJunchao Zhang   } else if (nSignedChar) {
6599371c9d4SSatish Balay     if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link);
6609371c9d4SSatish Balay     else if (nSignedChar % 8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
6619371c9d4SSatish Balay     else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link);
6629371c9d4SSatish Balay     else if (nSignedChar % 4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
6639371c9d4SSatish Balay     else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link);
6649371c9d4SSatish Balay     else if (nSignedChar % 2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
6659371c9d4SSatish Balay     else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link);
6669371c9d4SSatish Balay     else if (nSignedChar % 1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
667b23bfdefSJunchao Zhang     link->bs        = nSignedChar;
668eb02082bSJunchao Zhang     link->unitbytes = nSignedChar * sizeof(SignedChar);
669b23bfdefSJunchao Zhang     link->basicunit = MPI_SIGNED_CHAR;
6709371c9d4SSatish Balay     if (link->bs == 1) {
6719371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
6729371c9d4SSatish Balay       link->unit      = MPI_SIGNED_CHAR;
6739371c9d4SSatish Balay     }
674b23bfdefSJunchao Zhang   } else if (nUnsignedChar) {
6759371c9d4SSatish Balay     if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link);
6769371c9d4SSatish Balay     else if (nUnsignedChar % 8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
6779371c9d4SSatish Balay     else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link);
6789371c9d4SSatish Balay     else if (nUnsignedChar % 4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
6799371c9d4SSatish Balay     else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link);
6809371c9d4SSatish Balay     else if (nUnsignedChar % 2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
6819371c9d4SSatish Balay     else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link);
6829371c9d4SSatish Balay     else if (nUnsignedChar % 1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
683b23bfdefSJunchao Zhang     link->bs        = nUnsignedChar;
684eb02082bSJunchao Zhang     link->unitbytes = nUnsignedChar * sizeof(UnsignedChar);
685b23bfdefSJunchao Zhang     link->basicunit = MPI_UNSIGNED_CHAR;
6869371c9d4SSatish Balay     if (link->bs == 1) {
6879371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
6889371c9d4SSatish Balay       link->unit      = MPI_UNSIGNED_CHAR;
6899371c9d4SSatish Balay     }
69040e23c03SJunchao Zhang #if defined(PETSC_HAVE_COMPLEX)
691b23bfdefSJunchao Zhang   } else if (nPetscComplex) {
6929371c9d4SSatish Balay     if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link);
6939371c9d4SSatish Balay     else if (nPetscComplex % 8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
6949371c9d4SSatish Balay     else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link);
6959371c9d4SSatish Balay     else if (nPetscComplex % 4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
6969371c9d4SSatish Balay     else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link);
6979371c9d4SSatish Balay     else if (nPetscComplex % 2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
6989371c9d4SSatish Balay     else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link);
6999371c9d4SSatish Balay     else if (nPetscComplex % 1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
700b23bfdefSJunchao Zhang     link->bs        = nPetscComplex;
701eb02082bSJunchao Zhang     link->unitbytes = nPetscComplex * sizeof(PetscComplex);
70240e23c03SJunchao Zhang     link->basicunit = MPIU_COMPLEX;
7039371c9d4SSatish Balay     if (link->bs == 1) {
7049371c9d4SSatish Balay       link->isbuiltin = PETSC_TRUE;
7059371c9d4SSatish Balay       link->unit      = MPIU_COMPLEX;
7069371c9d4SSatish Balay     }
70740e23c03SJunchao Zhang #endif
70840e23c03SJunchao Zhang   } else {
709b23bfdefSJunchao Zhang     MPI_Aint lb, nbyte;
7109566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Type_get_extent(unit, &lb, &nbyte));
71108401ef6SPierre Jolivet     PetscCheck(lb == 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Datatype with nonzero lower bound %ld", (long)lb);
712eb02082bSJunchao Zhang     if (nbyte % sizeof(int)) { /* If the type size is not multiple of int */
7139371c9d4SSatish Balay       if (nbyte == 4) PackInit_DumbType_char_4_1(link);
7149371c9d4SSatish Balay       else if (nbyte % 4 == 0) PackInit_DumbType_char_4_0(link);
7159371c9d4SSatish Balay       else if (nbyte == 2) PackInit_DumbType_char_2_1(link);
7169371c9d4SSatish Balay       else if (nbyte % 2 == 0) PackInit_DumbType_char_2_0(link);
7179371c9d4SSatish Balay       else if (nbyte == 1) PackInit_DumbType_char_1_1(link);
7189371c9d4SSatish Balay       else if (nbyte % 1 == 0) PackInit_DumbType_char_1_0(link);
719eb02082bSJunchao Zhang       link->bs        = nbyte;
720b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
721b23bfdefSJunchao Zhang       link->basicunit = MPI_BYTE;
72240e23c03SJunchao Zhang     } else {
723eb02082bSJunchao Zhang       nInt = nbyte / sizeof(int);
7249371c9d4SSatish Balay       if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link);
7259371c9d4SSatish Balay       else if (nInt % 8 == 0) PackInit_DumbType_DumbInt_8_0(link);
7269371c9d4SSatish Balay       else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link);
7279371c9d4SSatish Balay       else if (nInt % 4 == 0) PackInit_DumbType_DumbInt_4_0(link);
7289371c9d4SSatish Balay       else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link);
7299371c9d4SSatish Balay       else if (nInt % 2 == 0) PackInit_DumbType_DumbInt_2_0(link);
7309371c9d4SSatish Balay       else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link);
7319371c9d4SSatish Balay       else if (nInt % 1 == 0) PackInit_DumbType_DumbInt_1_0(link);
732eb02082bSJunchao Zhang       link->bs        = nInt;
733b23bfdefSJunchao Zhang       link->unitbytes = nbyte;
73440e23c03SJunchao Zhang       link->basicunit = MPI_INT;
73540e23c03SJunchao Zhang     }
7365ad15460SJunchao Zhang     if (link->isbuiltin) link->unit = unit;
73740e23c03SJunchao Zhang   }
738b23bfdefSJunchao Zhang 
7399566063dSJacob Faibussowitsch   if (!link->isbuiltin) PetscCallMPI(MPI_Type_dup(unit, &link->unit));
74020c24465SJunchao Zhang 
74120c24465SJunchao Zhang   link->Memcpy = PetscSFLinkMemcpy_Host;
7423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
74340e23c03SJunchao Zhang }
74440e23c03SJunchao Zhang 
745d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *))
746d71ae5a4SJacob Faibussowitsch {
74740e23c03SJunchao Zhang   PetscFunctionBegin;
74840e23c03SJunchao Zhang   *UnpackAndOp = NULL;
74971438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
75083df288dSJunchao Zhang     if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
751eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
752eb02082bSJunchao Zhang     else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
753eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
754eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
755eb02082bSJunchao Zhang     else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
756eb02082bSJunchao Zhang     else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
757eb02082bSJunchao Zhang     else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
758eb02082bSJunchao Zhang     else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
759eb02082bSJunchao Zhang     else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
760eb02082bSJunchao Zhang     else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
761eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
762eb02082bSJunchao Zhang     else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
763eb02082bSJunchao Zhang   }
7647fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
76571438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
76683df288dSJunchao Zhang     if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
767eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
768eb02082bSJunchao Zhang     else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
769eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
770eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
771eb02082bSJunchao Zhang     else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
772eb02082bSJunchao Zhang     else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
773eb02082bSJunchao Zhang     else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
774eb02082bSJunchao Zhang     else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
775eb02082bSJunchao Zhang     else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
776eb02082bSJunchao Zhang     else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
777eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
778eb02082bSJunchao Zhang     else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
77971438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
78083df288dSJunchao Zhang     if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
781eb02082bSJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
782eb02082bSJunchao Zhang     else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
783eb02082bSJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
784eb02082bSJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
785eb02082bSJunchao Zhang     else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
786eb02082bSJunchao Zhang     else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
787eb02082bSJunchao Zhang     else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
788eb02082bSJunchao Zhang     else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
789eb02082bSJunchao Zhang     else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
790eb02082bSJunchao Zhang     else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
791eb02082bSJunchao Zhang     else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
792eb02082bSJunchao Zhang     else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
793eb02082bSJunchao Zhang   }
794eb02082bSJunchao Zhang #endif
7953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
79640e23c03SJunchao Zhang }
79740e23c03SJunchao Zhang 
798d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *))
799d71ae5a4SJacob Faibussowitsch {
80040e23c03SJunchao Zhang   PetscFunctionBegin;
801cd620004SJunchao Zhang   *ScatterAndOp = NULL;
80271438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) {
80383df288dSJunchao Zhang     if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
804cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
805cd620004SJunchao Zhang     else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
806cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
807cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
808cd620004SJunchao Zhang     else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
809cd620004SJunchao Zhang     else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
810cd620004SJunchao Zhang     else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
811cd620004SJunchao Zhang     else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
812cd620004SJunchao Zhang     else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
813cd620004SJunchao Zhang     else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
814cd620004SJunchao Zhang     else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
815cd620004SJunchao Zhang     else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
816eb02082bSJunchao Zhang   }
8177fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
81871438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) {
81983df288dSJunchao Zhang     if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
820cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
821cd620004SJunchao Zhang     else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
822cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
823cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
824cd620004SJunchao Zhang     else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
825cd620004SJunchao Zhang     else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
826cd620004SJunchao Zhang     else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
827cd620004SJunchao Zhang     else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
828cd620004SJunchao Zhang     else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
829cd620004SJunchao Zhang     else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
830cd620004SJunchao Zhang     else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
831cd620004SJunchao Zhang     else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
83271438e86SJunchao Zhang   } else if (PetscMemTypeDevice(mtype) && atomic) {
83383df288dSJunchao Zhang     if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
834cd620004SJunchao Zhang     else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
835cd620004SJunchao Zhang     else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
836cd620004SJunchao Zhang     else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
837cd620004SJunchao Zhang     else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
838cd620004SJunchao Zhang     else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
839cd620004SJunchao Zhang     else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
840cd620004SJunchao Zhang     else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
841cd620004SJunchao Zhang     else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
842cd620004SJunchao Zhang     else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
843cd620004SJunchao Zhang     else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
844cd620004SJunchao Zhang     else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
845cd620004SJunchao Zhang     else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
846eb02082bSJunchao Zhang   }
847eb02082bSJunchao Zhang #endif
8483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
849cd620004SJunchao Zhang }
850cd620004SJunchao Zhang 
851d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *))
852d71ae5a4SJacob Faibussowitsch {
853cd620004SJunchao Zhang   PetscFunctionBegin;
854cd620004SJunchao Zhang   *FetchAndOp = NULL;
855c9cc58a2SBarry Smith   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
85671438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
8577fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
85871438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
85971438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
860cd620004SJunchao Zhang #endif
8613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
862cd620004SJunchao Zhang }
863cd620004SJunchao Zhang 
864d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link, PetscMemType mtype, MPI_Op op, PetscBool atomic, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
865d71ae5a4SJacob Faibussowitsch {
866cd620004SJunchao Zhang   PetscFunctionBegin;
867cd620004SJunchao Zhang   *FetchAndOpLocal = NULL;
868c9cc58a2SBarry Smith   PetscCheck(op == MPI_SUM || op == MPIU_SUM, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for MPI_Op in FetchAndOp");
86971438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
8707fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
87171438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
87271438e86SJunchao Zhang   else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
873cd620004SJunchao Zhang #endif
8743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
875cd620004SJunchao Zhang }
876cd620004SJunchao Zhang 
877d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
878d71ae5a4SJacob Faibussowitsch {
879cd620004SJunchao Zhang   PetscLogDouble flops;
880cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
881cd620004SJunchao Zhang 
882cd620004SJunchao Zhang   PetscFunctionBegin;
88383df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
884cd620004SJunchao Zhang     flops = bas->rootbuflen[scope] * link->bs;               /* # of roots in buffer x # of scalars in unit */
8857fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
8869371c9d4SSatish Balay     if (PetscMemTypeDevice(link->rootmtype)) PetscCall(PetscLogGpuFlops(flops));
8879371c9d4SSatish Balay     else
888cd620004SJunchao Zhang #endif
8899566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(flops));
890cd620004SJunchao Zhang   }
8913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
892cd620004SJunchao Zhang }
893cd620004SJunchao Zhang 
894d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, MPI_Op op)
895d71ae5a4SJacob Faibussowitsch {
896cd620004SJunchao Zhang   PetscLogDouble flops;
897cd620004SJunchao Zhang 
898cd620004SJunchao Zhang   PetscFunctionBegin;
89983df288dSJunchao Zhang   if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
900cd620004SJunchao Zhang     flops = sf->leafbuflen[scope] * link->bs;                /* # of roots in buffer x # of scalars in unit */
9017fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
9029371c9d4SSatish Balay     if (PetscMemTypeDevice(link->leafmtype)) PetscCall(PetscLogGpuFlops(flops));
9039371c9d4SSatish Balay     else
904cd620004SJunchao Zhang #endif
9059566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(flops));
906cd620004SJunchao Zhang   }
9073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
908cd620004SJunchao Zhang }
909cd620004SJunchao Zhang 
910cd620004SJunchao Zhang /* When SF could not find a proper UnpackAndOp() from link, it falls back to MPI_Reduce_local.
9114165533cSJose E. Roman   Input Parameters:
912cd620004SJunchao Zhang   +sf      - The StarForest
913cd620004SJunchao Zhang   .link    - The link
914cd620004SJunchao Zhang   .count   - Number of entries to unpack
915145b44c9SPierre Jolivet   .start   - The first index, significant when indices=NULL
916cd620004SJunchao Zhang   .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
917cd620004SJunchao Zhang   .buf     - A contiguous buffer to unpack from
918cd620004SJunchao Zhang   -op      - Operation after unpack
919cd620004SJunchao Zhang 
9204165533cSJose E. Roman   Output Parameters:
921cd620004SJunchao Zhang   .data    - The data to unpack to
922cd620004SJunchao Zhang */
923d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt start, const PetscInt *indices, void *data, const void *buf, MPI_Op op)
924d71ae5a4SJacob Faibussowitsch {
925cd620004SJunchao Zhang   PetscFunctionBegin;
926cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
927cd620004SJunchao Zhang   {
928cd620004SJunchao Zhang     PetscInt i;
929cd620004SJunchao Zhang     if (indices) {
930cd620004SJunchao Zhang       /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc, it operates on
931cd620004SJunchao Zhang          basic units of a root/leaf element-wisely. Otherwise, it is meant to operate on a whole root/leaf.
932cd620004SJunchao Zhang       */
9339566063dSJacob Faibussowitsch       for (i = 0; i < count; i++) PetscCallMPI(MPI_Reduce_local((const char *)buf + i * link->unitbytes, (char *)data + indices[i] * link->unitbytes, 1, link->unit, op));
934cd620004SJunchao Zhang     } else {
9359566063dSJacob Faibussowitsch       PetscCallMPI(MPIU_Reduce_local(buf, (char *)data + start * link->unitbytes, count, link->unit, op));
936cd620004SJunchao Zhang     }
937cd620004SJunchao Zhang   }
9383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
939cd620004SJunchao Zhang #else
940cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
941cd620004SJunchao Zhang #endif
942cd620004SJunchao Zhang }
943cd620004SJunchao Zhang 
944d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf, PetscSFLink link, PetscInt count, PetscInt srcStart, const PetscInt *srcIdx, const void *src, PetscInt dstStart, const PetscInt *dstIdx, void *dst, MPI_Op op)
945d71ae5a4SJacob Faibussowitsch {
946cd620004SJunchao Zhang   PetscFunctionBegin;
947cd620004SJunchao Zhang #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
948cd620004SJunchao Zhang   {
949cd620004SJunchao Zhang     PetscInt i, disp;
950fcc7397dSJunchao Zhang     if (!srcIdx) {
9519566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, dstStart, dstIdx, dst, (const char *)src + srcStart * link->unitbytes, op));
952fcc7397dSJunchao Zhang     } else {
953cd620004SJunchao Zhang       for (i = 0; i < count; i++) {
954fcc7397dSJunchao Zhang         disp = dstIdx ? dstIdx[i] : dstStart + i;
9559566063dSJacob Faibussowitsch         PetscCallMPI(MPIU_Reduce_local((const char *)src + srcIdx[i] * link->unitbytes, (char *)dst + disp * link->unitbytes, 1, link->unit, op));
956fcc7397dSJunchao Zhang       }
957cd620004SJunchao Zhang     }
958cd620004SJunchao Zhang   }
9593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
960cd620004SJunchao Zhang #else
961cd620004SJunchao Zhang   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No unpacking reduction operation for this MPI_Op");
962cd620004SJunchao Zhang #endif
963cd620004SJunchao Zhang }
964cd620004SJunchao Zhang 
965cd620004SJunchao Zhang /*=============================================================================
966cd620004SJunchao Zhang               Pack/Unpack/Fetch/Scatter routines
967cd620004SJunchao Zhang  ============================================================================*/
968cd620004SJunchao Zhang 
969cd620004SJunchao Zhang /* Pack rootdata to rootbuf
9704165533cSJose E. Roman   Input Parameters:
971cd620004SJunchao Zhang   + sf       - The SF this packing works on.
972cd620004SJunchao Zhang   . link     - It gives the memtype of the roots and also provides root buffer.
973cd620004SJunchao Zhang   . scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
974cd620004SJunchao Zhang   - rootdata - Where to read the roots.
975cd620004SJunchao Zhang 
976cd620004SJunchao Zhang   Notes:
977cd620004SJunchao Zhang   When rootdata can be directly used as root buffer, the routine is almost a no-op. After the call, root data is
97871438e86SJunchao Zhang   in a place where the underlying MPI is ready to access (use_gpu_aware_mpi or not)
979cd620004SJunchao Zhang  */
98066976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
981d71ae5a4SJacob Faibussowitsch {
982cd620004SJunchao Zhang   const PetscInt *rootindices = NULL;
983cd620004SJunchao Zhang   PetscInt        count, start;
984fcc7397dSJunchao Zhang   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
985cd620004SJunchao Zhang   PetscMemType   rootmtype                                                                                        = link->rootmtype;
986fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                              = NULL;
987fcc7397dSJunchao Zhang 
988cd620004SJunchao Zhang   PetscFunctionBegin;
9899566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
99071438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
9919566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
9929566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetPack(link, rootmtype, &Pack));
9939566063dSJacob Faibussowitsch     PetscCall((*Pack)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
994cd620004SJunchao Zhang   }
9959566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
9963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
997cd620004SJunchao Zhang }
998cd620004SJunchao Zhang 
999cd620004SJunchao Zhang /* Pack leafdata to leafbuf */
100066976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
1001d71ae5a4SJacob Faibussowitsch {
1002cd620004SJunchao Zhang   const PetscInt *leafindices = NULL;
1003cd620004SJunchao Zhang   PetscInt        count, start;
1004fcc7397dSJunchao Zhang   PetscErrorCode (*Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1005cd620004SJunchao Zhang   PetscMemType   leafmtype                                                                                        = link->leafmtype;
1006fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                              = NULL;
1007cd620004SJunchao Zhang 
1008cd620004SJunchao Zhang   PetscFunctionBegin;
10099566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
101071438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip packing */
10119566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
10129566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetPack(link, leafmtype, &Pack));
10139566063dSJacob Faibussowitsch     PetscCall((*Pack)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1014cd620004SJunchao Zhang   }
10159566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
10163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1017cd620004SJunchao Zhang }
1018cd620004SJunchao Zhang 
101971438e86SJunchao Zhang /* Pack rootdata to rootbuf, which are in the same memory space */
1020d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkPackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *rootdata)
1021d71ae5a4SJacob Faibussowitsch {
102271438e86SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
102371438e86SJunchao Zhang 
102471438e86SJunchao Zhang   PetscFunctionBegin;
102571438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on petsc default stream */
10269566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
10279566063dSJacob Faibussowitsch     if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
102871438e86SJunchao Zhang   }
10299566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
10309566063dSJacob Faibussowitsch   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkPackRootData_Private(sf, link, scope, rootdata));
10319566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
10323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
103371438e86SJunchao Zhang }
103471438e86SJunchao Zhang /* Pack leafdata to leafbuf, which are in the same memory space */
1035d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, const void *leafdata)
1036d71ae5a4SJacob Faibussowitsch {
103771438e86SJunchao Zhang   PetscFunctionBegin;
103871438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
10399566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
10409566063dSJacob Faibussowitsch     if (link->PrePack) PetscCall((*link->PrePack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
104171438e86SJunchao Zhang   }
10429566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Pack, sf, 0, 0, 0));
10439566063dSJacob Faibussowitsch   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkPackLeafData_Private(sf, link, scope, leafdata));
10449566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Pack, sf, 0, 0, 0));
10453ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
104671438e86SJunchao Zhang }
104771438e86SJunchao Zhang 
104866976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1049d71ae5a4SJacob Faibussowitsch {
1050cd620004SJunchao Zhang   const PetscInt *rootindices = NULL;
1051cd620004SJunchao Zhang   PetscInt        count, start;
1052cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                                    = (PetscSF_Basic *)sf->data;
1053fcc7397dSJunchao Zhang   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1054cd620004SJunchao Zhang   PetscMemType   rootmtype                                                                                               = link->rootmtype;
1055fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                                     = NULL;
1056cd620004SJunchao Zhang 
1057cd620004SJunchao Zhang   PetscFunctionBegin;
105871438e86SJunchao Zhang   if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
10599566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetUnpackAndOp(link, rootmtype, op, bas->rootdups[scope], &UnpackAndOp));
1060cd620004SJunchao Zhang     if (UnpackAndOp) {
10619566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, scope, &count, &start, &opt, &rootindices));
10629566063dSJacob Faibussowitsch       PetscCall((*UnpackAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[scope][rootmtype]));
1063cd620004SJunchao Zhang     } else {
10649566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &rootindices));
10659566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, rootindices, rootdata, link->rootbuf[scope][rootmtype], op));
1066cd620004SJunchao Zhang     }
1067cd620004SJunchao Zhang   }
10689566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, scope, op));
10693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1070cd620004SJunchao Zhang }
1071cd620004SJunchao Zhang 
107266976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1073d71ae5a4SJacob Faibussowitsch {
1074cd620004SJunchao Zhang   const PetscInt *leafindices = NULL;
1075cd620004SJunchao Zhang   PetscInt        count, start;
1076fcc7397dSJunchao Zhang   PetscErrorCode (*UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *) = NULL;
1077cd620004SJunchao Zhang   PetscMemType   leafmtype                                                                                               = link->leafmtype;
1078fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                                     = NULL;
1079cd620004SJunchao Zhang 
1080cd620004SJunchao Zhang   PetscFunctionBegin;
108171438e86SJunchao Zhang   if (!link->leafdirect[scope]) { /* If leafdata works directly as rootbuf, skip unpacking */
10829566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetUnpackAndOp(link, leafmtype, op, sf->leafdups[scope], &UnpackAndOp));
1083cd620004SJunchao Zhang     if (UnpackAndOp) {
10849566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, scope, &count, &start, &opt, &leafindices));
10859566063dSJacob Faibussowitsch       PetscCall((*UnpackAndOp)(link, count, start, opt, leafindices, leafdata, link->leafbuf[scope][leafmtype]));
1086cd620004SJunchao Zhang     } else {
10879566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, scope, &count, &start, &opt, &leafindices));
10889566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackDataWithMPIReduceLocal(sf, link, count, start, leafindices, leafdata, link->leafbuf[scope][leafmtype], op));
1089cd620004SJunchao Zhang     }
1090cd620004SJunchao Zhang   }
10919566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackLeafData(sf, link, scope, op));
10923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
109371438e86SJunchao Zhang }
109471438e86SJunchao Zhang /* Unpack rootbuf to rootdata, which are in the same memory space */
1095d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *rootdata, MPI_Op op)
1096d71ae5a4SJacob Faibussowitsch {
109771438e86SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
109871438e86SJunchao Zhang 
109971438e86SJunchao Zhang   PetscFunctionBegin;
11009566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
11019566063dSJacob Faibussowitsch   if (bas->rootbuflen[scope]) PetscCall(PetscSFLinkUnpackRootData_Private(sf, link, scope, rootdata, op));
11029566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
110371438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
11049566063dSJacob Faibussowitsch     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_LEAF2ROOT)); /* Used by SF nvshmem */
11059566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
110671438e86SJunchao Zhang   }
11073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1108cd620004SJunchao Zhang }
1109cd620004SJunchao Zhang 
111071438e86SJunchao Zhang /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1111d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf, PetscSFLink link, PetscSFScope scope, void *leafdata, MPI_Op op)
1112d71ae5a4SJacob Faibussowitsch {
111371438e86SJunchao Zhang   PetscFunctionBegin;
11149566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
11159566063dSJacob Faibussowitsch   if (sf->leafbuflen[scope]) PetscCall(PetscSFLinkUnpackLeafData_Private(sf, link, scope, leafdata, op));
11169566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
111771438e86SJunchao Zhang   if (scope == PETSCSF_REMOTE) {
11189566063dSJacob Faibussowitsch     if (link->PostUnpack) PetscCall((*link->PostUnpack)(sf, link, PETSCSF_ROOT2LEAF)); /* Used by SF nvshmem */
11199566063dSJacob Faibussowitsch     if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) PetscCall((*link->SyncDevice)(link));
112071438e86SJunchao Zhang   }
11213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
112271438e86SJunchao Zhang }
112371438e86SJunchao Zhang 
112471438e86SJunchao Zhang /* FetchAndOp rootdata with rootbuf, it is a kind of Unpack on rootdata, except it also updates rootbuf */
1125d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf, PetscSFLink link, void *rootdata, MPI_Op op)
1126d71ae5a4SJacob Faibussowitsch {
1127cd620004SJunchao Zhang   const PetscInt *rootindices = NULL;
1128cd620004SJunchao Zhang   PetscInt        count, start;
1129cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                             = (PetscSF_Basic *)sf->data;
1130fcc7397dSJunchao Zhang   PetscErrorCode (*FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *) = NULL;
1131cd620004SJunchao Zhang   PetscMemType   rootmtype                                                                                        = link->rootmtype;
1132fcc7397dSJunchao Zhang   PetscSFPackOpt opt                                                                                              = NULL;
1133cd620004SJunchao Zhang 
1134cd620004SJunchao Zhang   PetscFunctionBegin;
11359566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(PETSCSF_Unpack, sf, 0, 0, 0));
113671438e86SJunchao Zhang   if (bas->rootbuflen[PETSCSF_REMOTE]) {
1137cd620004SJunchao Zhang     /* Do FetchAndOp on rootdata with rootbuf */
11389566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetFetchAndOp(link, rootmtype, op, bas->rootdups[PETSCSF_REMOTE], &FetchAndOp));
11399566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_REMOTE, &count, &start, &opt, &rootindices));
11409566063dSJacob Faibussowitsch     PetscCall((*FetchAndOp)(link, count, start, opt, rootindices, rootdata, link->rootbuf[PETSCSF_REMOTE][rootmtype]));
1141cd620004SJunchao Zhang   }
11429566063dSJacob Faibussowitsch   PetscCall(PetscSFLinkLogFlopsAfterUnpackRootData(sf, link, PETSCSF_REMOTE, op));
11439566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(PETSCSF_Unpack, sf, 0, 0, 0));
11443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1145cd620004SJunchao Zhang }
1146cd620004SJunchao Zhang 
1147d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void *rootdata, void *leafdata, MPI_Op op)
1148d71ae5a4SJacob Faibussowitsch {
1149cd620004SJunchao Zhang   const PetscInt *rootindices = NULL, *leafindices = NULL;
1150cd620004SJunchao Zhang   PetscInt        count, rootstart, leafstart;
1151cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                                                                                 = (PetscSF_Basic *)sf->data;
1152fcc7397dSJunchao Zhang   PetscErrorCode (*ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *) = NULL;
115371438e86SJunchao Zhang   PetscMemType   rootmtype = link->rootmtype, leafmtype = link->leafmtype, srcmtype, dstmtype;
1154fcc7397dSJunchao Zhang   PetscSFPackOpt leafopt = NULL, rootopt = NULL;
115571438e86SJunchao Zhang   PetscInt       buflen = sf->leafbuflen[PETSCSF_LOCAL];
115671438e86SJunchao Zhang   char          *srcbuf = NULL, *dstbuf = NULL;
115771438e86SJunchao Zhang   PetscBool      dstdups;
1158cd620004SJunchao Zhang 
1159362febeeSStefano Zampini   PetscFunctionBegin;
11603ba16761SJacob Faibussowitsch   if (!buflen) PetscFunctionReturn(PETSC_SUCCESS);
116171438e86SJunchao Zhang   if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
116271438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
11639566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_LOCAL, rootdata));
116471438e86SJunchao Zhang       srcmtype = rootmtype;
116571438e86SJunchao Zhang       srcbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
116671438e86SJunchao Zhang       dstmtype = leafmtype;
116771438e86SJunchao Zhang       dstbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
116871438e86SJunchao Zhang     } else {
11699566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkPackLeafData(sf, link, PETSCSF_LOCAL, leafdata));
117071438e86SJunchao Zhang       srcmtype = leafmtype;
117171438e86SJunchao Zhang       srcbuf   = link->leafbuf[PETSCSF_LOCAL][leafmtype];
117271438e86SJunchao Zhang       dstmtype = rootmtype;
117371438e86SJunchao Zhang       dstbuf   = link->rootbuf[PETSCSF_LOCAL][rootmtype];
117471438e86SJunchao Zhang     }
11759566063dSJacob Faibussowitsch     PetscCall((*link->Memcpy)(link, dstmtype, dstbuf, srcmtype, srcbuf, buflen * link->unitbytes));
117671438e86SJunchao Zhang     /* If above is a device to host copy, we have to sync the stream before accessing the buffer on host */
11779566063dSJacob Faibussowitsch     if (PetscMemTypeHost(dstmtype)) PetscCall((*link->SyncStream)(link));
117871438e86SJunchao Zhang     if (direction == PETSCSF_ROOT2LEAF) {
11799566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_LOCAL, leafdata, op));
1180cd620004SJunchao Zhang     } else {
11819566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkUnpackRootData(sf, link, PETSCSF_LOCAL, rootdata, op));
118271438e86SJunchao Zhang     }
118371438e86SJunchao Zhang   } else {
118471438e86SJunchao Zhang     dstdups  = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
118571438e86SJunchao Zhang     dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
11869566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetScatterAndOp(link, dstmtype, op, dstdups, &ScatterAndOp));
1187cd620004SJunchao Zhang     if (ScatterAndOp) {
11889566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11899566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
119071438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
11919566063dSJacob Faibussowitsch         PetscCall((*ScatterAndOp)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata));
1192cd620004SJunchao Zhang       } else {
11939566063dSJacob Faibussowitsch         PetscCall((*ScatterAndOp)(link, count, leafstart, leafopt, leafindices, leafdata, rootstart, rootopt, rootindices, rootdata));
119471438e86SJunchao Zhang       }
1195cd620004SJunchao Zhang     } else {
11969566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
11979566063dSJacob Faibussowitsch       PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, PETSC_MEMTYPE_HOST, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
119871438e86SJunchao Zhang       if (direction == PETSCSF_ROOT2LEAF) {
11999566063dSJacob Faibussowitsch         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, rootstart, rootindices, rootdata, leafstart, leafindices, leafdata, op));
120071438e86SJunchao Zhang       } else {
12019566063dSJacob Faibussowitsch         PetscCall(PetscSFLinkScatterDataWithMPIReduceLocal(sf, link, count, leafstart, leafindices, leafdata, rootstart, rootindices, rootdata, op));
1202cd620004SJunchao Zhang       }
1203cd620004SJunchao Zhang     }
120471438e86SJunchao Zhang   }
12053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1206cd620004SJunchao Zhang }
1207cd620004SJunchao Zhang 
1208cd620004SJunchao Zhang /* Fetch rootdata to leafdata and leafupdate locally */
1209d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf, PetscSFLink link, void *rootdata, const void *leafdata, void *leafupdate, MPI_Op op)
1210d71ae5a4SJacob Faibussowitsch {
1211cd620004SJunchao Zhang   const PetscInt *rootindices = NULL, *leafindices = NULL;
1212cd620004SJunchao Zhang   PetscInt        count, rootstart, leafstart;
1213cd620004SJunchao Zhang   PetscSF_Basic  *bas                                                                                                                                                            = (PetscSF_Basic *)sf->data;
1214fcc7397dSJunchao Zhang   PetscErrorCode (*FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *) = NULL;
1215cd620004SJunchao Zhang   const PetscMemType rootmtype = link->rootmtype, leafmtype = link->leafmtype;
1216fcc7397dSJunchao Zhang   PetscSFPackOpt     leafopt = NULL, rootopt = NULL;
1217cd620004SJunchao Zhang 
1218cd620004SJunchao Zhang   PetscFunctionBegin;
12193ba16761SJacob Faibussowitsch   if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(PETSC_SUCCESS);
1220cd620004SJunchao Zhang   if (rootmtype != leafmtype) {
1221cd620004SJunchao Zhang     /* The local communication has to go through pack and unpack */
1222cd620004SJunchao Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1223cd620004SJunchao Zhang   } else {
12249566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetRootPackOptAndIndices(sf, link, rootmtype, PETSCSF_LOCAL, &count, &rootstart, &rootopt, &rootindices));
12259566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetLeafPackOptAndIndices(sf, link, leafmtype, PETSCSF_LOCAL, &count, &leafstart, &leafopt, &leafindices));
12269566063dSJacob Faibussowitsch     PetscCall(PetscSFLinkGetFetchAndOpLocal(link, rootmtype, op, bas->rootdups[PETSCSF_LOCAL], &FetchAndOpLocal));
12279566063dSJacob Faibussowitsch     PetscCall((*FetchAndOpLocal)(link, count, rootstart, rootopt, rootindices, rootdata, leafstart, leafopt, leafindices, leafdata, leafupdate));
1228cd620004SJunchao Zhang   }
12293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
123040e23c03SJunchao Zhang }
123140e23c03SJunchao Zhang 
123240e23c03SJunchao Zhang /*
1233*511e6246SStefano Zampini   Create per-rank pack/unpack optimizations based on indices patterns
123440e23c03SJunchao Zhang 
123540e23c03SJunchao Zhang    Input Parameters:
1236fcc7397dSJunchao Zhang   +  n       - Number of destination ranks
1237eb02082bSJunchao Zhang   .  offset  - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] needs not to be 0.
1238b23bfdefSJunchao Zhang   -  idx     - [*]   Array storing indices
123940e23c03SJunchao Zhang 
124040e23c03SJunchao Zhang    Output Parameters:
1241cd620004SJunchao Zhang   +  opt     - Pack optimizations. NULL if no optimizations.
124240e23c03SJunchao Zhang */
124366976f2fSJacob Faibussowitsch static PetscErrorCode PetscSFCreatePackOpt(PetscInt n, const PetscInt *offset, const PetscInt *idx, PetscSFPackOpt *out)
1244d71ae5a4SJacob Faibussowitsch {
1245fcc7397dSJunchao Zhang   PetscInt       r, p, start, i, j, k, dx, dy, dz, dydz, m, X, Y;
1246fcc7397dSJunchao Zhang   PetscBool      optimizable = PETSC_TRUE;
124740e23c03SJunchao Zhang   PetscSFPackOpt opt;
124840e23c03SJunchao Zhang 
124940e23c03SJunchao Zhang   PetscFunctionBegin;
12509566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(1, &opt));
12519566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(7 * n + 2, &opt->array));
1252fcc7397dSJunchao Zhang   opt->n = opt->array[0] = n;
1253fcc7397dSJunchao Zhang   opt->offset            = opt->array + 1;
1254fcc7397dSJunchao Zhang   opt->start             = opt->array + n + 2;
1255fcc7397dSJunchao Zhang   opt->dx                = opt->array + 2 * n + 2;
1256fcc7397dSJunchao Zhang   opt->dy                = opt->array + 3 * n + 2;
1257fcc7397dSJunchao Zhang   opt->dz                = opt->array + 4 * n + 2;
1258fcc7397dSJunchao Zhang   opt->X                 = opt->array + 5 * n + 2;
1259fcc7397dSJunchao Zhang   opt->Y                 = opt->array + 6 * n + 2;
1260fcc7397dSJunchao Zhang 
1261fcc7397dSJunchao Zhang   for (r = 0; r < n; r++) {            /* For each destination rank */
1262fcc7397dSJunchao Zhang     m     = offset[r + 1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1263fcc7397dSJunchao Zhang     p     = offset[r];
1264fcc7397dSJunchao Zhang     start = idx[p]; /* First index for this rank */
1265fcc7397dSJunchao Zhang     p++;
1266fcc7397dSJunchao Zhang 
1267fcc7397dSJunchao Zhang     /* Search in X dimension */
1268fcc7397dSJunchao Zhang     for (dx = 1; dx < m; dx++, p++) {
1269fcc7397dSJunchao Zhang       if (start + dx != idx[p]) break;
1270b23bfdefSJunchao Zhang     }
1271b23bfdefSJunchao Zhang 
1272fcc7397dSJunchao Zhang     dydz = m / dx;
1273fcc7397dSJunchao Zhang     X    = dydz > 1 ? (idx[p] - start) : dx;
1274fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
12759371c9d4SSatish Balay     if (m % dx || X <= 0) {
12769371c9d4SSatish Balay       optimizable = PETSC_FALSE;
12779371c9d4SSatish Balay       goto finish;
12789371c9d4SSatish Balay     }
1279fcc7397dSJunchao Zhang     for (dy = 1; dy < dydz; dy++) { /* Search in Y dimension */
1280fcc7397dSJunchao Zhang       for (i = 0; i < dx; i++, p++) {
1281fcc7397dSJunchao Zhang         if (start + X * dy + i != idx[p]) {
12829371c9d4SSatish Balay           if (i) {
12839371c9d4SSatish Balay             optimizable = PETSC_FALSE;
12849371c9d4SSatish Balay             goto finish;
12859371c9d4SSatish Balay           } /* The pattern is violated in the middle of an x-walk */
12869371c9d4SSatish Balay           else
12879371c9d4SSatish Balay             goto Z_dimension;
128840e23c03SJunchao Zhang         }
128940e23c03SJunchao Zhang       }
129040e23c03SJunchao Zhang     }
129140e23c03SJunchao Zhang 
1292fcc7397dSJunchao Zhang   Z_dimension:
1293fcc7397dSJunchao Zhang     dz = m / (dx * dy);
1294fcc7397dSJunchao Zhang     Y  = dz > 1 ? (idx[p] - start) / X : dy;
1295fcc7397dSJunchao Zhang     /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
12969371c9d4SSatish Balay     if (m % (dx * dy) || Y <= 0) {
12979371c9d4SSatish Balay       optimizable = PETSC_FALSE;
12989371c9d4SSatish Balay       goto finish;
12999371c9d4SSatish Balay     }
1300fcc7397dSJunchao Zhang     for (k = 1; k < dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1301fcc7397dSJunchao Zhang       for (j = 0; j < dy; j++) {
1302fcc7397dSJunchao Zhang         for (i = 0; i < dx; i++, p++) {
13039371c9d4SSatish Balay           if (start + X * Y * k + X * j + i != idx[p]) {
13049371c9d4SSatish Balay             optimizable = PETSC_FALSE;
13059371c9d4SSatish Balay             goto finish;
13069371c9d4SSatish Balay           }
130740e23c03SJunchao Zhang         }
130840e23c03SJunchao Zhang       }
130940e23c03SJunchao Zhang     }
1310fcc7397dSJunchao Zhang     opt->start[r] = start;
1311fcc7397dSJunchao Zhang     opt->dx[r]    = dx;
1312fcc7397dSJunchao Zhang     opt->dy[r]    = dy;
1313fcc7397dSJunchao Zhang     opt->dz[r]    = dz;
1314fcc7397dSJunchao Zhang     opt->X[r]     = X;
1315fcc7397dSJunchao Zhang     opt->Y[r]     = Y;
131640e23c03SJunchao Zhang   }
131740e23c03SJunchao Zhang 
1318fcc7397dSJunchao Zhang finish:
1319fcc7397dSJunchao Zhang   /* If not optimizable, free arrays to save memory */
1320fcc7397dSJunchao Zhang   if (!n || !optimizable) {
13219566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt->array));
13229566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt));
132340e23c03SJunchao Zhang     *out = NULL;
1324fcc7397dSJunchao Zhang   } else {
1325fcc7397dSJunchao Zhang     opt->offset[0] = 0;
1326fcc7397dSJunchao Zhang     for (r = 0; r < n; r++) opt->offset[r + 1] = opt->offset[r] + opt->dx[r] * opt->dy[r] * opt->dz[r];
1327fcc7397dSJunchao Zhang     *out = opt;
1328fcc7397dSJunchao Zhang   }
13293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
133040e23c03SJunchao Zhang }
133140e23c03SJunchao Zhang 
1332d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf, PetscMemType mtype, PetscSFPackOpt *out)
1333d71ae5a4SJacob Faibussowitsch {
133440e23c03SJunchao Zhang   PetscSFPackOpt opt = *out;
133540e23c03SJunchao Zhang 
133640e23c03SJunchao Zhang   PetscFunctionBegin;
133740e23c03SJunchao Zhang   if (opt) {
13389566063dSJacob Faibussowitsch     PetscCall(PetscSFFree(sf, mtype, opt->array));
13399566063dSJacob Faibussowitsch     PetscCall(PetscFree(opt));
134040e23c03SJunchao Zhang     *out = NULL;
134140e23c03SJunchao Zhang   }
13423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
134340e23c03SJunchao Zhang }
1344cd620004SJunchao Zhang 
1345d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1346d71ae5a4SJacob Faibussowitsch {
1347cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1348cd620004SJunchao Zhang   PetscInt       i, j;
1349cd620004SJunchao Zhang 
1350cd620004SJunchao Zhang   PetscFunctionBegin;
1351cd620004SJunchao Zhang   /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1352cd620004SJunchao Zhang   for (i = 0; i < 2; i++) { /* Set defaults */
1353cd620004SJunchao Zhang     sf->leafstart[i]   = 0;
1354cd620004SJunchao Zhang     sf->leafcontig[i]  = PETSC_TRUE;
1355cd620004SJunchao Zhang     sf->leafdups[i]    = PETSC_FALSE;
1356cd620004SJunchao Zhang     bas->rootstart[i]  = 0;
1357cd620004SJunchao Zhang     bas->rootcontig[i] = PETSC_TRUE;
1358cd620004SJunchao Zhang     bas->rootdups[i]   = PETSC_FALSE;
1359cd620004SJunchao Zhang   }
1360cd620004SJunchao Zhang 
1361cd620004SJunchao Zhang   sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1362cd620004SJunchao Zhang   sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1363cd620004SJunchao Zhang 
1364cd620004SJunchao Zhang   if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1365cd620004SJunchao Zhang   if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1366cd620004SJunchao Zhang 
1367cd620004SJunchao Zhang   /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1368cd620004SJunchao Zhang   for (i = 0; i < sf->roffset[sf->ndranks]; i++) { /* self */
13699371c9d4SSatish Balay     if (sf->rmine[i] != sf->leafstart[0] + i) {
13709371c9d4SSatish Balay       sf->leafcontig[0] = PETSC_FALSE;
13719371c9d4SSatish Balay       break;
13729371c9d4SSatish Balay     }
1373cd620004SJunchao Zhang   }
1374cd620004SJunchao Zhang   for (i = sf->roffset[sf->ndranks], j = 0; i < sf->roffset[sf->nranks]; i++, j++) { /* remote */
13759371c9d4SSatish Balay     if (sf->rmine[i] != sf->leafstart[1] + j) {
13769371c9d4SSatish Balay       sf->leafcontig[1] = PETSC_FALSE;
13779371c9d4SSatish Balay       break;
13789371c9d4SSatish Balay     }
1379cd620004SJunchao Zhang   }
1380cd620004SJunchao Zhang 
1381cd620004SJunchao Zhang   /* If not, see if we can have per-rank optimizations by doing index analysis */
13829566063dSJacob Faibussowitsch   if (!sf->leafcontig[0]) PetscCall(PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]));
13839566063dSJacob Faibussowitsch   if (!sf->leafcontig[1]) PetscCall(PetscSFCreatePackOpt(sf->nranks - sf->ndranks, sf->roffset + sf->ndranks, sf->rmine, &sf->leafpackopt[1]));
1384cd620004SJunchao Zhang 
1385cd620004SJunchao Zhang   /* Are root indices for self and remote contiguous? */
1386cd620004SJunchao Zhang   bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1387cd620004SJunchao Zhang   bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1388cd620004SJunchao Zhang 
1389cd620004SJunchao Zhang   if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1390cd620004SJunchao Zhang   if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1391cd620004SJunchao Zhang 
1392cd620004SJunchao Zhang   for (i = 0; i < bas->ioffset[bas->ndiranks]; i++) {
13939371c9d4SSatish Balay     if (bas->irootloc[i] != bas->rootstart[0] + i) {
13949371c9d4SSatish Balay       bas->rootcontig[0] = PETSC_FALSE;
13959371c9d4SSatish Balay       break;
13969371c9d4SSatish Balay     }
1397cd620004SJunchao Zhang   }
1398cd620004SJunchao Zhang   for (i = bas->ioffset[bas->ndiranks], j = 0; i < bas->ioffset[bas->niranks]; i++, j++) {
13999371c9d4SSatish Balay     if (bas->irootloc[i] != bas->rootstart[1] + j) {
14009371c9d4SSatish Balay       bas->rootcontig[1] = PETSC_FALSE;
14019371c9d4SSatish Balay       break;
14029371c9d4SSatish Balay     }
1403cd620004SJunchao Zhang   }
1404cd620004SJunchao Zhang 
14059566063dSJacob Faibussowitsch   if (!bas->rootcontig[0]) PetscCall(PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]));
14069566063dSJacob Faibussowitsch   if (!bas->rootcontig[1]) PetscCall(PetscSFCreatePackOpt(bas->niranks - bas->ndiranks, bas->ioffset + bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]));
1407cd620004SJunchao Zhang 
1408cd620004SJunchao Zhang   /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data race chances */
1409013b3241SStefano Zampini   if (PetscDefined(HAVE_DEVICE)) {
1410013b3241SStefano Zampini     PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
14119566063dSJacob Faibussowitsch     if (!sf->leafcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[0], sf->rmine, &sf->leafdups[0]));
14129566063dSJacob Faibussowitsch     if (!sf->leafcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(sf->leafbuflen[1], sf->rmine + sf->roffset[sf->ndranks], &sf->leafdups[1]));
14139566063dSJacob Faibussowitsch     if (!bas->rootcontig[0] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc, &bas->rootdups[0]));
14149566063dSJacob Faibussowitsch     if (!bas->rootcontig[1] && !ismulti) PetscCall(PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc + bas->ioffset[bas->ndiranks], &bas->rootdups[1]));
1415013b3241SStefano Zampini   }
14163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1417cd620004SJunchao Zhang }
1418cd620004SJunchao Zhang 
1419d71ae5a4SJacob Faibussowitsch PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1420d71ae5a4SJacob Faibussowitsch {
1421cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
1422cd620004SJunchao Zhang   PetscInt       i;
1423cd620004SJunchao Zhang 
1424cd620004SJunchao Zhang   PetscFunctionBegin;
1425cd620004SJunchao Zhang   for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
14269566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &sf->leafpackopt[i]));
14279566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_HOST, &bas->rootpackopt[i]));
14287fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
14299566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &sf->leafpackopt_d[i]));
14309566063dSJacob Faibussowitsch     PetscCall(PetscSFDestroyPackOpt(sf, PETSC_MEMTYPE_DEVICE, &bas->rootpackopt_d[i]));
14317fd2d3dbSJunchao Zhang #endif
1432cd620004SJunchao Zhang   }
14333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1434cd620004SJunchao Zhang }
1435